<h2> Testing search visualization <h2>

<h3> Search Pipeline <h3>

In [1]:
# imports
from pathlib import Path
import pickle
from sciencesearch.nlp.hyper import Hyper, algorithms_from_results
from sciencesearch.nlp.sweep import Sweep
from sciencesearch.nlp.models import Rake, Yake, KPMiner, Ensemble
from sciencesearch.nlp.train import train_hyper, load_hyper, run_hyper
from sciencesearch.nlp.search import Searcher
from operator import attrgetter
# logging
import logging
logging.root.setLevel(logging.ERROR)  # silence pke warnings
slog = logging.getLogger("sciencesearch")
slog.setLevel(logging.WARNING)
from sciencesearch.nlp.visualize_kws import SingleSet_Visualizer, ResultsJson, HTMLBuilder
import os
from pathlib import Path
import json

In [2]:
textdir = Path.cwd().parent / "data" / "jft"

epsilon = 0.1
max_alg = 5

In [3]:
hyperparameter = Hyper()

## Set up parameter sweeps
The `Sweep` class from the `sciencesearch.nlp.sweep` module is used to configure the algorithm and range of parameters to use in the hyperparameter tuning.
The list of possible parameters is shown with the `.print_params` method of each algorithm class. Note that these include a set of parameters shared across all the algorithms, for which there are reasonable defaults.

In [4]:
Yake.print_params()
sweep = Sweep(alg=Yake)
sweep.set_param_range("ws", lb=1, ub=3, step=1)
sweep.set_param_discrete("dedup", [0.8, 0.9, 0.95])
sweep.set_param_discrete("dedup_method", ["leve", "seqm"]) # jaro
hyperparameter.add_sweep(sweep)

Common:
  - Stopwords stopwords: Stopwords. Default is None
  - bool stemming: Whether to do stemming. Default is False
  - int num_keywords: How many keywords to extract. Default is 10
  - list keyword_sort: sort orderings: occ (number of occurrences), score, or a dict with weights for each of these keys, e.g., {'occ': 0.75, 'score': 0.25}, and additionally a flag 'i' for ignoring keyword case. Default is []
Yake:
  - int ws: YAKE window size. Default is 2
  - float dedup: Deduplication limit for YAKE. Default is 0.9
  - str dedup_method: method ('leve', 'seqm' or 'jaro'). Default is leve
  - int ngram: Maximum ngram size. Default is 2


In [5]:
Rake.print_params()
sweep = Sweep(alg=Rake)
sweep.set_param_range("min_len", lb=1, ub=1, step=1)
sweep.set_param_range("max_len", lb=1, ub=3, step=1)
sweep.set_param_range("min_kw_occ", lb=1, ub=10, step=1)
sweep.set_param_discrete("include_repeated_phrases", [False, True])
hyperparameter.add_sweep(sweep)

Common:
  - Stopwords stopwords: Stopwords. Default is None
  - bool stemming: Whether to do stemming. Default is False
  - int num_keywords: How many keywords to extract. Default is 10
  - list keyword_sort: sort orderings: occ (number of occurrences), score, or a dict with weights for each of these keys, e.g., {'occ': 0.75, 'score': 0.25}, and additionally a flag 'i' for ignoring keyword case. Default is []
Rake:
  - int min_len: Minimum ngram size. Default is 1
  - int max_len: Maximum ngram size. Default is 3
  - int min_kw_len: Minimum keyword length. Applied as post-processing filter.. Default is 3
  - int min_kw_occ: Mimumum number of occurences of keyword in text string.Applied as post-processing filter.. Default is 4
  - Any ranking_metric: ranking parameter for rake algorithm. Default is Metric.DEGREE_TO_FREQUENCY_RATIO
  - bool include_repeated_phrases: boolean for determining whether multiple of the same keywords are output by rake. Default is True


In [6]:
# KPMiner.print_params()
# sweep = Sweep(alg=KPMiner)
# sweep.set_param_range("lasf", lb=1, ub=3, step=1)
# # zomg this takes forever..
# #sweep.set_param_range("cutoff", lb=200, ub=1300, nsteps=5)
# #sweep.set_param_range("alpha", lb=3.0, ub=4.0, step=0.2)
# #sweep.set_param_range("sigma", lb=2.6, ub=3.2, step=0.2)
# hyperparameter.add_sweep(sweep)

## Train and run models
In this example, we pick the 'best' result for each algorithm by training on two files with some user-provided keywords.
Then we extract keywords from a third file using the trained model.

We save the results of the hyperparameter training in a serialize Python "pickle" file so we don't need to repeat the training.
We could run the same hyperparameters on multiple files without retraining with `run_hyper()`

In [7]:
import json

     

In [8]:
conf_file = json.load(open("seach_vis_config.json"))


In [9]:

s = Searcher()



In [10]:
demo = Searcher.from_config("seach_vis_config.json")


UnboundLocalError: local variable 'file_keywords' referenced before assignment

In [None]:
type(demo)


sciencesearch.nlp.search.Searcher

In [None]:
rj = ResultsJson()

In [None]:
rj.save_results(demo, 'predicted_keywords.json')

km {'file2.txt': ['Aya', 'maidens', 'young Lord', 'lord', 'garden gallery', 'aya', 'ako', 'Lady', 'Lady Aya', 'garden', 'lady aya', 'lady', 'sweet', 'sixteen years', 'daimyo', 'moon', 'flowers'], 'file3.txt': ['Tomodata', 'tomodata', 'Lord', 'quest', 'horse', 'willow trees', 'samurai', 'Long-haired maiden', 'willow', 'rode', 'noto', 'maiden', 'sweet', 'green', 'lord', 'Noto', 'Green Willow', 'love', 'daimyo'], 'file1.txt': ['loved', 'brothers', 'sing', 'stone', 'hunter', 'grey', 'hands', 'hand', 'gods', 'dreamer', 'wandered', 'brother', 'sound', 'shrine', 'night'], 'file4.txt': ['Mistress Tassel', 'father', 'man', 'SUSA', 'Wise', 'wise man', 'flowers', 'china', 'Heaven', 'Man', 'Wise Man', 'High Heaven', 'young', 'cried', 'jofuku', 'young man', 'stood', 'gods', 'head', 'wise'], 'file5.txt': ['PROJECT GUTENBERG', 'bright', 'tori', 'wife', 'Také Tori', 'Gutenberg Literary', 'také', 'jewel', 'PROJECT', 'big green', 'GUTENBERG', 'bamboo', 'cried', 'electronic works', 'work', 'také tori', '

In [None]:
# f = open('predicted_keywords.json',)

with open('predicted_keywords.json') as json_data:
    d = json.load(json_data)
print(d)


#  def __init__(self, keywords: list[str], text: str, filename: str, class_name: str = "keyword"):
#         super().__init__(text=text, filename=filename)





{'file2.txt': ['Aya', 'maidens', 'young Lord', 'lord', 'garden gallery', 'aya', 'ako', 'Lady', 'Lady Aya', 'garden', 'lady aya', 'lady', 'sweet', 'sixteen years', 'daimyo', 'moon', 'flowers'], 'file3.txt': ['Tomodata', 'tomodata', 'Lord', 'quest', 'horse', 'willow trees', 'samurai', 'Long-haired maiden', 'willow', 'rode', 'noto', 'maiden', 'sweet', 'green', 'lord', 'Noto', 'Green Willow', 'love', 'daimyo'], 'file1.txt': ['loved', 'brothers', 'sing', 'stone', 'hunter', 'grey', 'hands', 'hand', 'gods', 'dreamer', 'wandered', 'brother', 'sound', 'shrine', 'night'], 'file4.txt': ['Mistress Tassel', 'father', 'man', 'SUSA', 'Wise', 'wise man', 'flowers', 'china', 'Heaven', 'Man', 'Wise Man', 'High Heaven', 'young', 'cried', 'jofuku', 'young man', 'stood', 'gods', 'head', 'wise'], 'file5.txt': ['PROJECT GUTENBERG', 'bright', 'tori', 'wife', 'Také Tori', 'Gutenberg Literary', 'také', 'jewel', 'PROJECT', 'big green', 'GUTENBERG', 'bamboo', 'cried', 'electronic works', 'work', 'také tori', 'goo

In [None]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
parent_parent_dir = os.path.dirname(parent_dir)
file_dir = f"{parent_dir}/examples/data"
print(f"Parent directory: {parent_parent_dir}/examples/data")
""""

things needed
json of file names and keywords, 
directory start
filename_start 



"""
 
for textfilename, keywords in d.items():
    print(f'filename: {textfilename}, keywords {keywords}')
    filepath = f"{file_dir}/{textfilename}"
    file = textfilename[:textfilename.find('.')]
    print(filepath)
    sskw = SingleSet_Visualizer(keywords=keywords, txt_filepath = filepath)
    # html = sskw.highlight_tokens_html()
    htmlbuilder = HTMLBuilder(visualizer=sskw, filename=f"predicted_keywords_{file}", title=textfilename)
    htmlbuilder.get_highlighted_html()
    htmlbuilder.write_file_and_run()

Parent directory: /Users/sufikaur/Documents/metadata-project/sufi/examples/data
filename: file2.txt, keywords ['aya', 'lady', 'flowers', 'daimyo', 'Lady', 'lady aya', 'Lady Aya', 'Aya', 'sixteen years', 'garden', 'young Lord', 'garden gallery', 'moon', 'ako', 'lord', 'maidens', 'sweet']
/Users/sufikaur/Documents/metadata-project/sufi/sciencesearch/examples/data/file2.txt
<class 'str'>
Aya, sweet maid, was the only child of a _daimyo_ of the Province of
Omi. Mother had she none, and her father was a noble lord and a warrior.
He was at the Court of the Shogun, or he had weighty affairs at the
capital, or he went here and there with armies and overcame his enemies.
Aya saw little of him.
Long years she dwelt with her nurse and her maidens within the walls of
her father’s castle. High walls were they and well-guarded, and at their
foot was a deep moat which was rosy with lotus flowers all the seventh
month.
When the Lady Aya was some sixteen years old her father the _daimyo_
came home vict

In [None]:
import os


# DATA_DIR = __this_dir.parent.parent / "data"  # could be off
# DEFAULTS = {"stopwords_file": DATA_DIR / "stopwords_en.txt"}


current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
parent_parent_dir = os.path.dirname(parent_dir)


print(f"Current directory: {current_dir}")
print(f"Parent directory: {parent_dir}")
print(f"Parent directory: {parent_parent_dir}")
print(f"Parent directory: {parent_parent_dir}/examples/data")

# /Users/sufikaur/Documents/metadata-project/sufi/sciencesearch/data/stopwords_en.txt

Current directory: /Users/sufikaur/Documents/metadata-project/sufi/sciencesearch/examples
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi/sciencesearch
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi
Parent directory: /Users/sufikaur/Documents/metadata-project/sufi/examples/data


In [None]:
searcher = Searcher()
demo = Searcher.from_config("search_config.json")

file_kw = {
    "the_flute": ["flute", "bamboo", "jealous", "Kioto", "O’Yoné", "father", "stepmother"],
    "the_good_thunder": ["Rai-den", "Thunder", "Rai-Taro", "cloud", "lightning", "Lady Kwannon", "white cloud"]
}
pickle_file = Path("hyper.pkl")
if pickle_file.exists():
    hres = load_hyper(pickle_file)
else:
    hres = train_hyper(hyperparameter, file_kw, epsilon, "hyper.pkl", directory=textdir)
# keywords = run_hyper(hres, text_file=textdir / "momotaro")
# print("Keywords for 'momotaro':")
# print("\n".join(keywords))

!!!: search_config.json


FileNotFoundError: [Errno 2] No such file or directory: 'search_config.json'

In [None]:
sudo chown -R $(whoami) /opt/anaconda3/envs/scisearch/lib/python3.10/site-packages/data