In [7]:
import os
import pickle
from pathlib import Path
from pprint import pprint
from configparser import ConfigParser
from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import BERTEmbedder
from hyperopt import space_eval
from docembedder.utils import SimulationSpecification

In [3]:
# Simulation specification shared by every hyperopt run in this notebook.
# NOTE(review): debug_max_patents presumably caps the number of patents used,
# which would make the losses shown below debug-scale results — confirm.
sim_settings = {
    "year_start": 1838,
    "year_end": 1950,
    "window_size": 25,
    "debug_max_patents": 200,
}
sim_spec = SimulationSpecification(**sim_settings)

In [4]:
# Read the local configuration; its DATA section supplies every path used below.
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing setup.ini fails
# here with a clear message instead of later with an opaque KeyError: 'DATA'.
loaded_files = config.read("setup.ini")
if not loaded_files:
    raise FileNotFoundError(
        "Could not read 'setup.ini'; run the notebook from the directory that contains it."
    )
data_cfg = config["DATA"]
hyper_fp = Path(data_cfg["hyper_dir"], "bert_trials.pkl")

# Hyperopt helper for the embedding model. No preprocessor is passed here;
# hyper_fp is handed over as the trials argument.
hype = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_fp,
)

In [None]:
# Hyperparameter search for the BERT embedder. The "bert" label is the key
# that the following cells use to look the results up again.
hype.optimize(
    label="bert",
    model=BERTEmbedder,
    max_evals=200,
    n_jobs=10,
)

In [6]:
# Fetch the best model reported for the "bert" label, then show the first
# 20 rows of the corresponding trials table.
best_model = hype.best_model("bert", BERTEmbedder)
bert_trials = hype.dataframe("bert", BERTEmbedder)
bert_trials.head(20)

Unnamed: 0,pretrained_model,loss
79,AI-Growth-Lab/PatentSBERTa,-0.128801
112,AI-Growth-Lab/PatentSBERTa,-0.128801
111,AI-Growth-Lab/PatentSBERTa,-0.128801
109,AI-Growth-Lab/PatentSBERTa,-0.128801
108,AI-Growth-Lab/PatentSBERTa,-0.128801
107,AI-Growth-Lab/PatentSBERTa,-0.128801
105,AI-Growth-Lab/PatentSBERTa,-0.128801
103,AI-Growth-Lab/PatentSBERTa,-0.128801
102,AI-Growth-Lab/PatentSBERTa,-0.128801
101,AI-Growth-Lab/PatentSBERTa,-0.128801


In [8]:
from docembedder.hyperopt.utils import PreprocessorHyperopt
from docembedder.preprocessor import Preprocessor

# Re-create the model hyperopt helper, pointing at the same trials file (hyper_fp).
data_cfg = config["DATA"]
hype = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_fp,
)

# Model instance with the best settings from hyperopt (without a preprocessor).
best_model = hype.best_model("bert", BERTEmbedder)

In [9]:
# Optimize the preprocessor settings while using the best BERT model found above.
data_cfg = config["DATA"]
hyper_prep_fp = Path(data_cfg["hyper_dir"], "prep_bert.pkl")

hype_prep = PreprocessorHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_prep_fp,
)
hype_prep.optimize("normal", best_model, Preprocessor, n_jobs=8)

# Preprocessor with the best settings, followed by the table of tried settings.
best_preproc = hype_prep.best_preprocessor("normal", Preprocessor)
hype_prep.dataframe("normal")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 172074.01it/s]


Unnamed: 0,keep_caps,keep_start_section,remove_non_alpha,loss
0,True,True,True,-0.133687
4,False,True,True,-0.133687
1,True,True,False,-0.130646
5,False,True,False,-0.130646
3,True,False,False,-0.128801
7,False,False,False,-0.128801
2,True,False,True,-0.124209
6,False,False,True,-0.124209


In [None]:
# Second model optimization round, this time with the best preprocessor plugged in.
# It uses its own trials file so the first round's results stay untouched.
data_cfg = config["DATA"]
hyper_fp_2 = Path(data_cfg["hyper_dir"], "bert_trials_2.pkl")

hype_2 = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    preprocessor=best_preproc,
    trials=hyper_fp_2,
)
hype_2.optimize(
    label="bert",
    model=BERTEmbedder,
    max_evals=200,
    n_jobs=8,
)

In [13]:
# First 10 rows of the trials table from the second model optimization round.
bert_trials_2 = hype_2.dataframe("bert", BERTEmbedder)
bert_trials_2.head(10)

Unnamed: 0,pretrained_model,loss
89,AI-Growth-Lab/PatentSBERTa,-0.133687
57,AI-Growth-Lab/PatentSBERTa,-0.133687
56,AI-Growth-Lab/PatentSBERTa,-0.133687
55,AI-Growth-Lab/PatentSBERTa,-0.133687
87,AI-Growth-Lab/PatentSBERTa,-0.133687
53,AI-Growth-Lab/PatentSBERTa,-0.133687
51,AI-Growth-Lab/PatentSBERTa,-0.133687
59,AI-Growth-Lab/PatentSBERTa,-0.133687
50,AI-Growth-Lab/PatentSBERTa,-0.133687
48,AI-Growth-Lab/PatentSBERTa,-0.133687


In [None]:
"""
Optimal settings for BERT:
- preprocessor:
    keep_caps: True (no effect on loss; False gives identical losses in the table above)
    keep_start_section: True
    remove_non_alpha: True
- BERT model:
    pretrained_model: AI-Growth-Lab/PatentSBERTa
"""