In [1]:
from pathlib import Path
from pprint import pprint
from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import TfidfEmbedder
from docembedder.utils import SimulationSpecification
from docembedder.preprocessor import Preprocessor
from docembedder.classification import PatentClassification
from docembedder.analysis import _compute_cpc_cor
from multiprocessing import Pool
from docembedder.hyperopt.parallel import get_patent_data_multi
from configparser import ConfigParser
from utils import dataframe_from_trials
import pickle
import numpy as np

In [2]:
# Simulation settings shared by every hyperparameter search in this notebook.
# NOTE(review): debug_max_patents=200 looks like a debug-size limit — confirm it
# is intended for the real search.
sim_spec = SimulationSpecification(
    year_start=1838,
    year_end=1950,
    window_size=25,
    debug_max_patents=200,
)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list instead of discarding it, so a
# missing setup.ini fails here with a clear message rather than later as an
# opaque KeyError on config["DATA"].
config = ConfigParser()
if not config.read("setup.ini"):
    raise FileNotFoundError(
        "Could not read 'setup.ini' from the current working directory; it must "
        "provide a [DATA] section with 'hyper_dir', 'cpc_file' and 'patent_dir'."
    )

# Pickle file used to store the trials of the TF-IDF model search.
hyper_fp = Path(config["DATA"]["hyper_dir"], "tfidf_trials.pkl")

In [3]:
# Start without trials and only replace that default when a pickle from an
# earlier run is present at hyper_fp.
# NOTE(review): pickle.load can execute arbitrary code; hyper_fp must only ever
# point at a file this project wrote itself.
trials = None
if hyper_fp.is_file():
    with hyper_fp.open("rb") as handle:
        trials = pickle.load(handle)

# General parameters for the TF-IDF model search; data locations come from the
# [DATA] section of the config.
hype = ModelHyperopt(
    sim_spec=sim_spec,
    preprocessors=None,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
    trials=trials,
)

In [4]:
# Hyperparameter search for the TF-IDF embedder. pickle_fp is the same path the
# existing trials were loaded from.
# NOTE(review): presumably the trials are written back to pickle_fp so that a
# later run can resume — confirm against ModelHyperopt.optimize_model.
hype.optimize_model(
    label="tfidf",
    model=TfidfEmbedder,
    max_evals=160,
    n_jobs=10,
    pickle_fp=hyper_fp,
)

In [13]:
# Table of all TF-IDF trials, ordered by ascending loss (lowest loss first).
result_df = dataframe_from_trials(hype.trials["tfidf"], TfidfEmbedder).sort_values("loss")

# Turn the top-ranked row into keyword arguments for the embedder: for every
# column except "loss", take the single value stored for that row.
best_model_param = {
    column: list(by_index.values())[0]
    for column, by_index in result_df.head(1).to_dict().items()
    if column != "loss"
}
best_model = TfidfEmbedder(**best_model_param)

# Display the ten lowest-loss trials.
result_df.head(10)

Unnamed: 0,max_df,min_df,ngram_max,norm,stem,stop_words,sublinear_tf,loss
119,0.528471,9,1,l1,True,english,True,-0.157901
110,0.512683,7,1,l1,True,english,True,-0.157847
141,0.509183,8,1,l1,True,english,True,-0.157826
148,0.503484,8,1,l1,True,english,True,-0.157819
145,0.506128,8,1,l1,True,english,True,-0.157818
158,0.436369,9,1,l1,True,english,True,-0.157815
72,0.517617,5,1,l1,True,english,True,-0.157694
114,0.531151,7,1,l1,True,english,True,-0.157678
138,0.350798,3,1,l1,False,english,True,-0.157667
147,0.483636,8,1,l1,True,english,True,-0.157655


In [8]:
# Pickle file used to store the trials of the preprocessor search.
hyper_prep_fp = Path(config["DATA"]["hyper_dir"], "prep_trials.pkl")

# Start without trials and only replace that default when a pickle from an
# earlier run is present at hyper_prep_fp.
# NOTE(review): pickle.load can execute arbitrary code; hyper_prep_fp must only
# ever point at a file this project wrote itself.
trials = None
if hyper_prep_fp.is_file():
    with hyper_prep_fp.open("rb") as handle:
        trials = pickle.load(handle)

# General parameters for the preprocessor search; data locations come from the
# [DATA] section of the config.
hype_prep = ModelHyperopt(
    sim_spec=sim_spec,
    preprocessors=None,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
    trials=trials,
)

In [15]:
# Search over Preprocessor settings while the embedding model stays fixed to
# best_model, the TfidfEmbedder built from the lowest-loss TF-IDF trial.
# NOTE(review): presumably the trials are written back to pickle_fp so that a
# later run can resume — confirm against ModelHyperopt.optimize_preprocessor.
hype_prep.optimize_preprocessor(
    label="prep-tfidf",
    model=best_model,
    preprocessor=Preprocessor,
    max_evals=15,
    n_jobs=10,
    pickle_fp=hyper_prep_fp,
)

 87%|█████▏| 13/15 [1:11:56<20:33, 616.63s/trial, best loss: -0.15993703229057157]


In [18]:
# Table of all preprocessor trials, ordered by ascending loss (lowest loss first).
result_df = dataframe_from_trials(hype_prep.trials["prep-tfidf"], Preprocessor).sort_values("loss")

# Settings of the top-ranked trial: for every column except "loss", take the
# single value stored for that row.
best_prep_param = {
    column: list(by_index.values())[0]
    for column, by_index in result_df.head(1).to_dict().items()
    if column != "loss"
}

# Display the full ranking of preprocessor trials.
result_df

Unnamed: 0,keep_caps,keep_start_section,remove_non_alpha,loss
1,True,True,True,-0.159937
3,True,True,True,-0.159937
7,False,False,False,-0.157901
11,False,False,False,-0.157901
12,True,False,False,-0.157901
10,True,True,False,-0.157654
0,True,False,True,-0.156188
2,True,False,True,-0.156188
4,True,False,True,-0.156188
5,False,False,True,-0.156188
