In [1]:
from pathlib import Path
from pprint import pprint
from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import TfidfEmbedder
from docembedder.utils import SimulationSpecification
from docembedder.preprocessor import Preprocessor
from docembedder.classification import PatentClassification
from docembedder.analysis import _compute_cpc_cor
from multiprocessing import Pool
from docembedder.hyperopt.parallel import get_patent_data, get_patent_data_multi
from configparser import ConfigParser
from utils import dataframe_from_trials
import pickle
import numpy as np

In [2]:
# Parameters handed to SimulationSpecification for this run; see that class
# for the exact meaning of each field.
sim_spec = SimulationSpecification(
    year_start=1838,
    year_end=1950,
    window_size=25,
    debug_max_patents=200,
)

# Data locations are read from setup.ini instead of being hardcoded here.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means setup.ini was not found.
# Fail here with a clear message rather than with a later KeyError: 'DATA'.
config = ConfigParser()
if not config.read("setup.ini"):
    raise FileNotFoundError(
        "Could not read 'setup.ini' from the current working directory; "
        "it must provide a [DATA] section."
    )

# Pickle file that the hyperparameter search is loaded from and saved to.
hyper_fp = Path(config["DATA"]["hyper_dir"], "tfidf.pkl")

In [3]:
# Resume from a previously pickled search if one exists; otherwise start fresh.
if not hyper_fp.is_file():
    # No saved state yet: create a new ModelHyperopt from the configured paths.
    hype = ModelHyperopt(
        sim_spec=sim_spec,
        preprocessors=None,
        cpc_fp=Path(config["DATA"]["cpc_file"]),
        patent_dir=Path(config["DATA"]["patent_dir"]),
    )
else:
    # NOTE: unpickling can execute arbitrary code; only load a file you trust.
    with hyper_fp.open("rb") as pkl_file:
        hype = pickle.load(pkl_file)

In [4]:
# Run the hyperparameter search for TfidfEmbedder. The trials are later read
# back under the "tfidf" label, and pickle_fp points at the same file the
# previous cell loads from.
# NOTE(review): n_jobs presumably sets the number of worker processes — confirm.
hype.optimize(
    label="tfidf",
    model=TfidfEmbedder,
    max_evals=300,
    pickle_fp=hyper_fp,
    n_jobs=10,
)

100%|███████████████████████████████████████████| 129/129 [06:34<00:00,  3.06s/it]


100%|██████| 110/110 [41:13<00:00, 247.37s/trial, best loss: -0.15769446965765604]
100%|██████| 120/120 [40:54<00:00, 245.49s/trial, best loss: -0.15790126305358154]
100%|██████| 130/130 [41:26<00:00, 248.65s/trial, best loss: -0.15790126305358154]
100%|██████| 140/140 [29:05<00:00, 174.56s/trial, best loss: -0.15790126305358154]
100%|██████| 150/150 [36:57<00:00, 221.80s/trial, best loss: -0.15790126305358154]
100%|██████| 160/160 [40:13<00:00, 241.34s/trial, best loss: -0.15790126305358154]
 95%|█████▋| 161/170 [04:00<36:04, 240.50s/trial, best loss: -0.15790126305358154]

Process ForkPoolWorker-627:

Process ForkPoolWorker-625:

Process ForkPoolWorker-628:

Process ForkPoolWorker-621:

Process ForkPoolWorker-629:

Process ForkPoolWorker-622:

Process ForkPoolWorker-626:

Process ForkPoolWorker-623:

Process ForkPoolWorker-630:

Process ForkPoolWorker-624:

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

Traceback (most recent call last):

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()

Traceback (most recent call last):

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/multiprocessing/process.py", line 108,


KeyboardInterrupt

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 1209, in _count_vocab
    for feature in analyze(doc):

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 113, in _analyze
    doc = tokenizer(doc)

  File "/home/qubix/Documents/shared_work/patents/patent-breakthrough/docembedder/models/tfidf.py", line 19, in _tokenizer
    return [stemmer.stem(item) for item in tokens]

  File "/home/qubix/Documents/shared_work/patents/patent-breakthrough/docembedder/models/tfidf.py", line 19, in <listcomp>
    return [stemmer.stem(item) for item in tokens]

  File "/home/qubix/.pyenv/versions/3.10.4/lib/python3.10/site-packages/nltk/stem/snowball.py", line 1704, in stem
    if word.endswith(suffix):

KeyboardInterrupt



 95%|███▊| 161/170 [08:09<1:13:22, 489.14s/trial, best loss: -0.15790126305358154]



KeyboardInterrupt



In [5]:
# Tabulate the recorded "tfidf" trials and display the first 20 rows after
# sorting by loss in ascending order (lowest loss first).
(
    dataframe_from_trials(hype.trials["tfidf"], TfidfEmbedder)
    .sort_values("loss")
    .head(20)
)

Unnamed: 0,max_df,min_df,ngram_max,norm,stem,stop_words,sublinear_tf,loss
119,0.528471,9,1,l1,True,english,True,-0.157901
110,0.512683,7,1,l1,True,english,True,-0.157847
141,0.509183,8,1,l1,True,english,True,-0.157826
148,0.503484,8,1,l1,True,english,True,-0.157819
145,0.506128,8,1,l1,True,english,True,-0.157818
158,0.436369,9,1,l1,True,english,True,-0.157815
72,0.517617,5,1,l1,True,english,True,-0.157694
114,0.531151,7,1,l1,True,english,True,-0.157678
138,0.350798,3,1,l1,False,english,True,-0.157667
147,0.483636,8,1,l1,True,english,True,-0.157655
