<h2> Testing search visualization </h2>

<h3> Search Pipeline </h3>

In [None]:
# imports
from pathlib import Path
from sciencesearch.nlp.hyper import Hyper, algorithms_from_results
from sciencesearch.nlp.sweep import Sweep
from sciencesearch.nlp.models import Rake, Yake, KPMiner, Ensemble
from sciencesearch.nlp.train import train_hyper, load_hyper, run_hyper
from sciencesearch.nlp.search import Searcher
from operator import attrgetter
# logging
import logging

# Raise the root logger's threshold to ERROR so that pke warnings are silenced.
logging.getLogger().setLevel(logging.ERROR)

# The package's own logger is allowed to report warnings as well.
slog = logging.getLogger("sciencesearch")
slog.setLevel(logging.WARNING)
from sciencesearch.nlp.visualize_kws import JsonView
from pathlib import Path

In [2]:
# Input texts live in "data/jft", a sibling of the current working directory.
textdir = Path.cwd().parent.joinpath("data", "jft")

# NOTE(review): neither value is referenced in the visible cells; presumably
# tuning knobs for algorithm selection — confirm or remove.
epsilon, max_alg = 0.1, 5

In [3]:
# Collector for the per-algorithm parameter sweeps; each sweep cell below
# registers itself here through `hyperparameter.add_sweep(...)`.
hyperparameter = Hyper()

## Set up parameter sweeps
The `Sweep` class from the `sciencesearch.nlp.sweep` module is used to configure the algorithm and range of parameters to use in the hyperparameter tuning.
The list of possible parameters is shown with the `.print_params` method of each algorithm class. Note that these include a set of parameters shared across all the algorithms, for which there are reasonable defaults.

In [4]:
# Show the parameters Yake accepts (common ones plus Yake-specific ones).
Yake.print_params()

sweep = Sweep(alg=Yake)
# Window size: lb=1, ub=3, step=1.
sweep.set_param_range("ws", lb=1, ub=3, step=1)
# Discrete candidates. 'jaro' is also a listed dedup_method but is not part of this sweep.
for param_name, candidates in (
    ("dedup", [0.8, 0.9, 0.95]),
    ("dedup_method", ["leve", "seqm"]),
):
    sweep.set_param_discrete(param_name, candidates)
hyperparameter.add_sweep(sweep)

Common:
  - Stopwords stopwords: Stopwords. Default is None
  - bool stemming: Whether to do stemming. Default is False
  - int num_keywords: How many keywords to extract. Default is 10
  - list keyword_sort: sort orderings: occ (number of occurrences), score, or a dict with weights for each of these keys, e.g., {'occ': 0.75, 'score': 0.25}, and additionally a flag 'i' for ignoring keyword case. Default is []
Yake:
  - int ws: YAKE window size. Default is 2
  - float dedup: Deduplication limit for YAKE. Default is 0.9
  - str dedup_method: method ('leve', 'seqm' or 'jaro'). Default is leve
  - int ngram: Maximum ngram size. Default is 2


In [5]:
# Show the parameters Rake accepts (common ones plus Rake-specific ones).
Rake.print_params()

sweep = Sweep(alg=Rake)
# (parameter, lower bound, upper bound); every range uses a step of 1.
# "min_len" has lb == ub == 1.
for param_name, low, high in (
    ("min_len", 1, 1),
    ("max_len", 1, 3),
    ("min_kw_occ", 1, 10),
):
    sweep.set_param_range(param_name, lb=low, ub=high, step=1)
sweep.set_param_discrete("include_repeated_phrases", [False, True])
hyperparameter.add_sweep(sweep)

Common:
  - Stopwords stopwords: Stopwords. Default is None
  - bool stemming: Whether to do stemming. Default is False
  - int num_keywords: How many keywords to extract. Default is 10
  - list keyword_sort: sort orderings: occ (number of occurrences), score, or a dict with weights for each of these keys, e.g., {'occ': 0.75, 'score': 0.25}, and additionally a flag 'i' for ignoring keyword case. Default is []
Rake:
  - int min_len: Minimum ngram size. Default is 1
  - int max_len: Maximum ngram size. Default is 3
  - int min_kw_len: Minimum keyword length. Applied as post-processing filter.. Default is 3
  - int min_kw_occ: Mimumum number of occurences of keyword in text string.Applied as post-processing filter.. Default is 4
  - Any ranking_metric: ranking parameter for rake algorithm. Default is Metric.DEGREE_TO_FREQUENCY_RATIO
  - bool include_repeated_phrases: boolean for determining whether multiple of the same keywords are output by rake. Default is True


In [7]:
# Show the parameters KPMiner accepts (common ones plus KPMiner-specific ones).
KPMiner.print_params()
sweep = Sweep(alg=KPMiner)
# Only `lasf` is swept for KPMiner.
sweep.set_param_range("lasf", lb=1, ub=3, step=1)
# The additional sweeps below are disabled because they make the run very slow.
#sweep.set_param_range("cutoff", lb=200, ub=1300, nsteps=5)
#sweep.set_param_range("alpha", lb=3.0, ub=4.0, step=0.2)
#sweep.set_param_range("sigma", lb=2.6, ub=3.2, step=0.2)
hyperparameter.add_sweep(sweep)

Common:
  - Stopwords stopwords: Stopwords. Default is None
  - bool stemming: Whether to do stemming. Default is False
  - int num_keywords: How many keywords to extract. Default is 10
  - list keyword_sort: sort orderings: occ (number of occurrences), score, or a dict with weights for each of these keys, e.g., {'occ': 0.75, 'score': 0.25}, and additionally a flag 'i' for ignoring keyword case. Default is []
KPMiner:
  - int lasf: Last allowable seen frequency. Default is 3
  - int cutoff: Cutoff threshold for number of words after which if a phrase appears for the first time it is ignored. Default is 400
  - float alpha: Weight-adjustment parameter 1 for boosting factor.See original paper for definition. Default is 2.3
  - float sigma: Weight-adjustment parameter 2 for boosting factor.See original paper for definition. Default is 3.0
  - object doc_freq_info: Document frequency counts. Default (None) uses the semeval2010 countsprovided in 'df-semeval2010.tsv.gz'. Default is None


## Train and run models
In this example, we pick the 'best' result for each algorithm by training on two files with some user-provided keywords.
Then we extract keywords from a third file using the trained model.

We save the results of the hyperparameter training in a serialized Python "pickle" file so we don't need to repeat the training.
We could run the same hyperparameters on multiple files without retraining with `run_hyper()`.

In [8]:
# NOTE(review): `s` is not used by any of the visible cells — confirm whether
# this default-constructed Searcher is still needed.
s = Searcher()
# Searcher built from the JSON config file; it is handed to JsonView in the next cell.
demo = Searcher.from_config("seach_vis_config.json")

     

In [9]:
# JsonView over the configured Searcher; the commented-out `save_*` calls in the
# following cells are methods of this object.
json_viewer = JsonView(demo)

In [None]:
# Disabled export step. Presumably it produces the JSON consumed below —
# TODO confirm where the file is written before re-enabling.
# json_viewer.save_predicted_keywords('predicted_keywords.json')

# Single-set visualization of the predicted keywords.
JsonView.visualize_from_config(
    config_file="seach_vis_config.json",
    is_singleset=True,
    json_file="./results/predicted_keywords.json",
    save_file_prefix="predicted_keywords",
)

In [11]:
# Disabled export step. Presumably it produces the JSON consumed below —
# TODO confirm where the file is written before re-enabling.
# json_viewer.save_all_keyword_sets('keywords_all_sets.json')

# Multi-set visualization (is_singleset=False) of all keyword sets.
JsonView.visualize_from_config(
    config_file="seach_vis_config.json",
    is_singleset=False,
    json_file="./results/keywords_all_sets.json",
    save_file_prefix="keywords_all_sets",
)

{'training': ['Aya', 'daimyo', 'Lady Aya', 'moon', 'maidens', 'garden', 'Lord,Lord of Ako'], 'tuned': ['moon', 'Lady Aya', 'maidens', 'Aya', 'garden gallery', 'aya', 'sixteen years', 'lady aya', 'flowers', 'lord', 'Lady', 'lady', 'daimyo', 'garden', 'sweet', 'young Lord', 'ako']}
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI
MULTI

<h3> Extra </h3>

In [None]:

import os  # NOTE(review): move to the imports cell at the top of the notebook

# Scratch cell: print the working directory and its ancestors to check where a
# data directory would resolve. The disabled defaults below are kept for reference.
# DATA_DIR = __this_dir.parent.parent / "data"  # could be off
# DEFAULTS = {"stopwords_file": DATA_DIR / "stopwords_en.txt"}

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
parent_parent_dir = os.path.dirname(parent_dir)
# Candidate data location, built from the grandparent directory.
# NOTE(review): this path is only a candidate and may not exist — verify.
examples_data_dir = os.path.join(parent_parent_dir, "examples", "data")

# Each label names the path it prints; previously the last three lines were
# all labelled "Parent directory".
print(f"Current directory: {current_dir}")
print(f"Parent directory: {parent_dir}")
print(f"Grandparent directory: {parent_parent_dir}")
print(f"Examples data directory: {examples_data_dir}")

Current directory: /Users/sufikaur/Documents/metadata-project/sufi/sciencesearch/examples
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi/sciencesearch
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi/examples/data
