In [16]:
import os
import pickle
from configparser import ConfigParser
from pathlib import Path
from pprint import pprint

from hyperopt import space_eval

from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.hyperopt.utils import PreprocessorHyperopt
from docembedder.models import D2VEmbedder
from docembedder.preprocessor import Preprocessor
from docembedder.utils import SimulationSpecification

In [17]:
# Simulation specification shared by every hyperopt run in this notebook.
# NOTE(review): debug_max_patents=200 is still set here; presumably it caps the
# number of patents for debugging - confirm it is intended for the reported results.
sim_spec_kwargs = {
    "year_start": 1838,
    "year_end": 1950,
    "window_size": 25,
    "debug_max_patents": 200,
}
sim_spec = SimulationSpecification(**sim_spec_kwargs)

In [18]:
# Read the machine-local settings file. The notebook reads the keys
# hyper_dir, cpc_file and patent_dir from its [DATA] section.
CONFIG_FILE = "setup.ini"
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not read.
# Fail here with a clear message instead of a later KeyError on config["DATA"].
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(
        f"Could not open config file {CONFIG_FILE!r} (working directory: {Path.cwd()})"
    )

# Pickle file path passed as `trials` to the doc2vec hyperopt wrapper.
hyper_fp = Path(config["DATA"]["hyper_dir"], "doc2vec_trials.pkl")

# Hyperopt wrapper for the doc2vec model; data locations come from the
# [DATA] section of the local config.
data_cfg = config["DATA"]
hype = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_fp,
)

In [None]:
# Run the doc2vec hyperparameter search (max_evals=200, n_jobs=10).
hype.optimize(
    label="doc2vec",
    model=D2VEmbedder,
    max_evals=200,
    n_jobs=10,
)

In [20]:
# Fetch the best doc2vec model from the trials, then show the first 20 rows
# of the trials table.
best_model = hype.best_model("doc2vec", D2VEmbedder)
hype.dataframe("doc2vec", D2VEmbedder).head(20)

Unnamed: 0,epoch,min_count,vector_size,loss
22,9,7,101,-0.131289
66,8,8,101,-0.130676
107,8,12,100,-0.130575
112,9,12,104,-0.13054
146,9,15,103,-0.130532
123,8,10,112,-0.130504
65,8,8,101,-0.130472
51,9,6,101,-0.130465
25,8,8,104,-0.130421
119,9,15,100,-0.130331


In [21]:
# Re-create the doc2vec hyperopt wrapper on the same trials file (hyper_fp).
# NOTE(review): presumably this lets the notebook pick up stored trials without
# re-running the optimization cell - confirm against ModelHyperopt.
hype = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(config["DATA"]["cpc_file"]),
    patent_dir=Path(config["DATA"]["patent_dir"]),
    trials=hyper_fp,
)

# model instance with best settings from hyperopt (w/o preprocessor)
best_model = hype.best_model("doc2vec", D2VEmbedder)

In [35]:
# Optimize the preprocessor settings, evaluated with the best doc2vec model
# found so far (best_model). Trials go to their own pickle file.
data_cfg = config["DATA"]
hyper_prep_fp = Path(data_cfg["hyper_dir"], "prep_doc2vec.pkl")

hype_prep = PreprocessorHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_prep_fp,
)
hype_prep.optimize("normal", best_model, Preprocessor, n_jobs=8)

# Keep the best preprocessor for the second model search, and display the
# table of all preprocessor trials.
best_preproc = hype_prep.best_preprocessor("normal", Preprocessor)
hype_prep.dataframe("normal")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 111848.11it/s]


Unnamed: 0,keep_caps,keep_start_section,remove_non_alpha,loss
5,False,True,False,-0.131346
1,True,True,False,-0.130648
3,True,False,False,-0.130548
7,False,False,False,-0.13013
0,True,True,True,-0.130027
4,False,True,True,-0.128767
2,True,False,True,-0.125252
6,False,False,True,-0.125059


In [None]:
# Second doc2vec search, this time with the best preprocessor (best_preproc)
# plugged in; results are stored in a separate trials file.
data_cfg = config["DATA"]
hyper_fp_2 = Path(data_cfg["hyper_dir"], "doc2vec_trials_2.pkl")

hype_2 = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    preprocessor=best_preproc,
    trials=hyper_fp_2,
)
hype_2.optimize(
    label="doc2vec",
    model=D2VEmbedder,
    max_evals=200,
    n_jobs=8,
)

In [34]:
# First 10 rows of the trials table from the second doc2vec search
# (the one configured with preprocessor=best_preproc).
trials_2 = hype_2.dataframe("doc2vec", D2VEmbedder)
trials_2.head(10)

Unnamed: 0,epoch,min_count,vector_size,loss
130,8,13,100,-0.131676
105,8,14,100,-0.131456
140,8,13,101,-0.131439
92,8,12,100,-0.131383
76,8,11,100,-0.13134
146,8,12,100,-0.131318
81,9,13,100,-0.131312
119,8,10,109,-0.131238
148,8,13,110,-0.130999
145,8,12,115,-0.130856


In [None]:
"""
Optimal settings for Doc2vec:
- preprocessor:
    keep_caps: False
    keep_start_section: True             
    remove_non_alpha: False
- Doc2vec model:
    epoch: 8
    min_count: 13
    vector_size: 100
"""