Skip to content

Commit

Permalink
improve packages search query (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
garbas committed Jun 18, 2020
1 parent 0fb5f69 commit c420d05
Show file tree
Hide file tree
Showing 8 changed files with 456 additions and 155 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ dist
package-lock.json
result
scripts/eval-*
eval-*
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,24 @@ For backend we are using Elasticsearch instance which is kindly sponsored by
[Elm](https://elm-lang.org).


## How does search work?

The use case we want to solve is that a visitor wants to see if a package
exists or to look up a certain package's details.

A user wants to converge to a single result if possible. The more characters
are added to a search query, the narrower the search becomes and the fewer
results we should show.

Ranking of search results is also very important. It brings the more relevant
search results to the top, since it is often hard to write a search query
that returns only one result item.

Less important, but still improving the user experience, are suggestions for
writing a better search query. The suggestion feature should guide the user to
write better queries, which in turn will produce better results.


## Ideas we want to explore

Apart from searching packages and options we would like to:
Expand Down
2 changes: 1 addition & 1 deletion elm.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"elm/html": "1.0.0",
"elm/http": "2.0.0",
"elm/json": "1.1.3",
"elm/regex": "1.0.0",
"elm/url": "1.0.0",
"hecrj/html-parser": "2.3.4",
"krisajenkins/remotedata": "6.0.1",
Expand All @@ -21,7 +22,6 @@
"elm/bytes": "1.0.8",
"elm/file": "1.0.5",
"elm/parser": "1.1.0",
"elm/regex": "1.0.0",
"elm/time": "1.0.0",
"elm/virtual-dom": "1.0.2",
"rtfeldman/elm-hex": "1.0.0"
Expand Down
176 changes: 149 additions & 27 deletions scripts/import-channel
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import boto3
import botocore
import botocore.client
import xml.etree.ElementTree
import click
import click_log
import elasticsearch
Expand All @@ -22,18 +21,20 @@ import json
import logging
import os.path
import pypandoc
import re
import requests
import shlex
import subprocess
import tqdm
import xml.etree.ElementTree

logger = logging.getLogger("import-channel")
click_log.basic_config(logger)


S3_BUCKET = "nix-releases"
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
INDEX_SCHEMA_VERSION = 5
INDEX_SCHEMA_VERSION = 6
CHANNELS = {
"unstable": {
"packages": "nixpkgs/nixpkgs-20.09pre",
Expand All @@ -49,11 +50,18 @@ CHANNELS = {
},
}
ANALYSIS = {
"normalizer": {
"lowercase": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase"],
}
},
"analyzer": {
"nixAttrName": {
"lowercase": {
"type": "custom",
"tokenizer": "nix_attrname",
"filter": ["lowercase", "nix_stopwords"],
"tokenizer": "keyword",
"filter": ["lowercase"],
},
"nixOptionName": {
"type": "custom",
Expand All @@ -67,13 +75,24 @@ ANALYSIS = {
},
},
"tokenizer": {
"nix_attrname": {
"nix_package_query": {
"type": "pattern",
"pattern": "|".join(
[
"[ ]",
]
),
},
"nix_package_attr_name": {
"type": "pattern",
# Split on attrname separators like _, .
"pattern": "|".join(
[
"[_.-]", # Common separators like underscores, dots and dashes
"\\d+?Packages", # python37Packages -> python
"\\d+?Plugins", # vimPlugins -> vim
"\\d+?Extensions", # php74Extensions -> php
"\\d+?Interpreters", # perlInterpreters -> perl
# Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join(
Expand Down Expand Up @@ -118,7 +137,18 @@ ANALYSIS = {
"nix_stopwords": {
"type": "stop",
"ignore_case": True,
"stopwords": ["packages", "package", "options", "option"],
"stopwords": [
"packages",
"package",
"options",
"option",
"plugins",
"plugin",
"extensions",
"extension",
"interpreters",
"interpreter",
],
},
},
}
Expand Down Expand Up @@ -146,12 +176,21 @@ MAPPING = {
},
},
"package_attr_name": {
"type": "text",
"analyzer": "nixAttrName",
"fields": {"raw": {"type": "keyword"}},
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_name_query": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_pname": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {"type": "keyword"},
"package_pname": {"type": "keyword"},
"package_pversion": {"type": "keyword"},
"package_description": {"type": "text"},
"package_longDescription": {"type": "text"},
Expand Down Expand Up @@ -195,6 +234,39 @@ MAPPING = {
}


# Lazily consumes characters up to the next boundary, which is one of:
#   * a lowercase letter followed by a digit 1-9 or an uppercase letter,
#   * a digit 1-9 / uppercase letter followed by "Upper + lower" (camelCase),
#   * a separator character (".", "_" or "-"), which is kept at the end of
#     the matched part,
#   * the end of the string.
# A raw string is used so that "\." is a regex escape rather than an invalid
# Python string escape.
_SPLIT_QUERY_RE = re.compile(
    r".+?(?:(?<=[a-z])(?=[1-9A-Z])|(?<=[1-9A-Z])(?=[A-Z][a-z])|[\._-]|$)"
)


def split_query(text):
    """Tokenize a package attr_name into all contiguous runs of its parts.

    The name is first cut into parts at camelCase / letter-digit boundaries
    and after each ".", "_" or "-" separator.  Then, for every starting part,
    each run of consecutive parts beginning there is joined back together
    and emitted with trailing separators stripped.

    Example:
        split_query("foo-bar_baz") returns
            ["foo", "foo-bar", "foo-bar_baz",   # runs starting at part 0
             "bar", "bar_baz",                  # runs starting at part 1
             "baz"]                             # runs starting at part 2

    :param text: the attribute name to tokenize
    :return: list of token strings (may contain duplicates); an empty list
        when ``text`` is empty
    """
    parts = [match.group(0) for match in _SPLIT_QUERY_RE.finditer(text)]

    tokens = []
    for start in range(len(parts)):
        joined = ""
        for part in parts[start:]:
            joined += part
            # Parts keep their trailing separator; drop it from the token.
            tokens.append(joined.rstrip("_.-"))
    return tokens


def get_last_evaluation(prefix):
logger.debug(f"Retriving last evaluation for {prefix} prefix.")

Expand Down Expand Up @@ -265,6 +337,63 @@ def get_evaluation_builds(evaluation_id):
return result


def get_maintainer(maintainer):
    """Normalize a maintainer value into a flat list of maintainer dicts.

    Accepted shapes:
      * ``str``  - used as the name; ``email`` and ``github`` are ``None``
      * ``dict`` - its ``name`` / ``email`` / ``github`` keys are copied
        (missing keys become ``None``, other keys are dropped)
      * ``list`` - every item is normalized recursively and the results
        are concatenated

    :param maintainer: a string, a dict, or a (possibly nested) list of those
    :return: list of dicts with exactly the keys ``name``, ``email``, ``github``
    :raises SystemExit: (exit code 1) after logging an error when the value
        is of any other type
    """
    if isinstance(maintainer, str):
        return [dict(name=maintainer, email=None, github=None)]

    if isinstance(maintainer, dict):
        return [
            dict(
                name=maintainer.get("name"),
                email=maintainer.get("email"),
                github=maintainer.get("github"),
            )
        ]

    if isinstance(maintainer, list):
        maintainers = []
        for item in maintainer:
            maintainers.extend(get_maintainer(item))
        return maintainers

    # Imported locally so the error path does not depend on a module-level
    # `import sys` being present.
    import sys

    logger.error(f"maintainer can not be recognized from: {maintainer}")
    sys.exit(1)


# For some package sets the set's prefix is included in the package's pname.
# Kept as a tuple so it can be passed directly to str.startswith().
_ATTR_SET_PREFIXES = (
    # Packages
    "emscripten",
    "lua",
    "php",
    "pure",
    "python",
    "lisp",
    "perl",
    "ruby",
    # Plugins
    "elasticsearch",
    "graylog",
    "tmuxplugin",
    "vimplugin",
)


def remove_attr_set(name):
    """Strip a package-set prefix from a package name (pname).

    If ``name`` starts with one of the known package-set prefixes, everything
    up to and including the first ``-`` is removed
    (``"python3.7-requests"`` -> ``"requests"``).  Otherwise, a leading
    ``"node_"`` is removed.  Any other name is returned unchanged.

    A name that starts with a known prefix but has nothing after the first
    dash (for example the interpreter package ``"perl"`` itself) is returned
    unchanged rather than being reduced to an empty string.

    :param name: the package pname
    :return: the pname without its package-set prefix
    """
    # TODO: is this correct
    if name.startswith(_ATTR_SET_PREFIXES):
        remainder = name.partition("-")[2]
        if remainder:
            name = remainder

    # node does things a bit different
    elif name.startswith("node_"):
        name = name[len("node_"):]

    return name


def get_packages(evaluation, evaluation_builds):
logger.debug(
f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision"
Expand All @@ -281,6 +410,7 @@ def get_packages(evaluation, evaluation_builds):

def gen():
for attr_name, data in packages:

position = data["meta"].get("position")
if position and position.startswith("/nix/store"):
position = position[44:]
Expand All @@ -300,16 +430,7 @@ def get_packages(evaluation, evaluation_builds):
else:
licenses = []

maintainers = [
type(maintainer) == str
and dict(name=maintainer, email=None, github=None)
or dict(
name=maintainer.get("name"),
email=maintainer.get("email"),
github=maintainer.get("github"),
)
for maintainer in data["meta"].get("maintainers", [])
]
maintainers = get_maintainer(data["meta"].get("maintainers", []))

platforms = [
type(platform) == str and platform or None
Expand All @@ -319,9 +440,9 @@ def get_packages(evaluation, evaluation_builds):
attr_set = None
if "." in attr_name:
attr_set = attr_name.split(".")[0]
if not attr_set.endswith("Packages") and not attr_set.endswith(
"Plugins"
):
if not attr_set.endswith("Packages") and \
not attr_set.endswith("Plugins") and \
not attr_set.endswith("Extensions"):
attr_set = None

hydra = None
Expand Down Expand Up @@ -349,8 +470,9 @@ def get_packages(evaluation, evaluation_builds):
type="package",
package_hydra=hydra,
package_attr_name=attr_name,
package_attr_name_query=list(split_query(attr_name)),
package_attr_set=attr_set,
package_pname=data["pname"],
package_pname=remove_attr_set(data["pname"]),
package_pversion=data["version"],
package_description=data["meta"].get("description"),
package_longDescription=data["meta"].get("longDescription", ""),
Expand Down Expand Up @@ -405,7 +527,7 @@ def get_options(evaluation):
# we first check if there are some xml elements before using pypandoc
# since pypandoc calls are quite slow
root = xml.etree.ElementTree.fromstring(xml_description)
if len(root.find('para').getchildren()) > 0:
if len(list(root.find('para'))) > 0:
description = pypandoc.convert_text(
xml_description,
"html",
Expand Down
38 changes: 34 additions & 4 deletions scripts/packages-config.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,38 @@
# Enable recursion into attribute sets that nix-env normally doesn't look into
# so that we can get a more complete picture of the available packages for the
# purposes of the index.
packageOverrides = super: {
haskellPackages = super.recurseIntoAttrs super.haskellPackages;
rPackages = super.recurseIntoAttrs super.rPackages;
};
packageOverrides = super:
let
recurseIntoAttrs = sets:
super.lib.genAttrs
(builtins.filter (set: builtins.hasAttr set super) sets)
(set: super.recurseIntoAttrs (builtins.getAttr set super));
in recurseIntoAttrs [
"roundcubePlugins"
"emscriptenfastcompPackages"
"fdbPackages"
"nodePackages_latest"
"nodePackages"
"platformioPackages"
"haskellPackages"
"idrisPackages"
"sconsPackages"
"gns3Packages"
"quicklispPackagesClisp"
"quicklispPackagesSBCL"
"rPackages"
"apacheHttpdPackages_2_4"
"zabbix44"
"zabbix40"
"zabbix30"
"fusePackages"
"nvidiaPackages"
"sourceHanPackages"
"atomPackages"
"emacs25Packages"
"emacs26Packages"
"steamPackages"
"ut2004Packages"
"zeroadPackages"
];
}
Loading

1 comment on commit c420d05

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.