In [1]:
from pathlib import Path
from pprint import pprint
from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import CountVecEmbedder
from docembedder.utils import SimulationSpecification
from docembedder.preprocessor import Preprocessor
from docembedder.preprocessor.oldprep import OldPreprocessor
from configparser import ConfigParser

In [2]:
# Simulation settings handed to every hyperopt object created in this notebook.
# NOTE(review): debug_max_patents=200 looks like a debugging cap on how many
# patents are used — confirm before treating the reported losses as final.
sim_spec = SimulationSpecification(
    year_start=1838, year_end=1950,
    window_size=25,
    debug_max_patents=200,
)
# Load local settings (data and trial directories) from setup.ini in the
# current working directory.
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it parsed; an empty result therefore means setup.ini was not loaded.
# Fail here with a clear message instead of a later KeyError on config["DATA"].
if not config.read("setup.ini"):
    raise FileNotFoundError(
        "Could not read 'setup.ini' from the current working directory."
    )
# File used to store the hyperopt trials of the CountVecEmbedder model search.
hyper_fp = Path(config["DATA"]["hyper_dir"], "count_trials.pkl")

In [3]:
# Hyperopt driver for the embedding model. Data locations are read from the
# [DATA] section of the config; `hyper_fp` is passed as the trials file.
hype = ModelHyperopt(
    sim_spec=sim_spec,
    trials=hyper_fp,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
)

In [4]:
# Run the hyperparameter search for CountVecEmbedder under the label "count".
hype.optimize(
    label="count",
    model=CountVecEmbedder,
    max_evals=2,
    n_jobs=10,
)

In [5]:
# Keep whatever hype.best_model() returns for the "count" label; it is passed
# to the preprocessor searches later in the notebook.
best_model = hype.best_model("count", CountVecEmbedder)
# Last expression of the cell, so the trials table for "count" is displayed.
# In the recorded output the rows appear ordered by ascending loss.
hype.dataframe("count", CountVecEmbedder)

Unnamed: 0,method,loss
1,prop,-0.078011
0,sigmoid,-0.046946


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from docembedder.hyperopt.utils import PreprocessorHyperopt

# Trials file for the preprocessor search, stored in the same hyper_dir as the
# model trials.
hyper_prep_fp = Path(config["DATA"]["hyper_dir"], "prep_count.pkl")

# Same simulation spec and data locations as the model search.
hype_prep = PreprocessorHyperopt(
    sim_spec=sim_spec,
    trials=hyper_prep_fp,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
)

In [7]:
# Evaluate the Preprocessor class with the selected model; "normal" is the key
# used to look the trials up again via hype_prep.dataframe("normal").
hype_prep.optimize(
    "normal",
    best_model,
    Preprocessor,
    n_jobs=8,
)

100%|███████████████████████████████████████████| 8/8 [00:00<00:00, 196224.75it/s]


In [8]:
# Display the trials recorded under "normal". In the recorded output each row
# is one combination of keep_caps / keep_start_section / remove_non_alpha with
# its loss, listed by ascending loss.
hype_prep.dataframe("normal")

Unnamed: 0,keep_caps,keep_start_section,remove_non_alpha,loss
6,False,False,True,-0.0812
4,False,True,True,-0.079518
2,True,False,True,-0.07892
0,True,True,True,-0.078146
5,False,True,False,-0.078138
7,False,False,False,-0.078011
1,True,True,False,-0.077095
3,True,False,False,-0.076738


In [9]:
# Same evaluation for OldPreprocessor, stored under the key "old".
hype_prep.optimize(
    "old",
    best_model,
    OldPreprocessor,
    n_jobs=8,
)

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 30840.47it/s]


In [10]:
# Display the trials recorded under "old"; the recorded output holds a single
# row with only a loss column.
hype_prep.dataframe("old")

Unnamed: 0,loss
0,-0.07692


In [11]:
# Second model search: same simulation spec and data locations, but an
# OldPreprocessor instance is passed as the preprocessor and the trials go to
# their own file.
hyper_fp_2 = Path(config["DATA"]["hyper_dir"], "count_trials_2.pkl")

hype_2 = ModelHyperopt(
    sim_spec=sim_spec,
    trials=hyper_fp_2,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
    preprocessor=OldPreprocessor(),
)

In [12]:
# Run the CountVecEmbedder search again on the OldPreprocessor-based setup,
# under the label "count-old".
hype_2.optimize(
    label="count-old",
    model=CountVecEmbedder,
    max_evals=2,
    n_jobs=9,
)

In [13]:
# Display at most the first ten rows of the "count-old" trials table.
(
    hype_2
    .dataframe("count-old", CountVecEmbedder)
    .head(10)
)

Unnamed: 0,method,loss
1,sigmoid,-0.126032
0,prop,-0.07692
