In [None]:
from pathlib import Path
from pprint import pprint

from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import TfidfEmbedder, D2VEmbedder, CountVecEmbedder, BPembEmbedder, BERTEmbedder

from hyperopt import space_eval

In [None]:
# General settings shared by every optimization run in this notebook.
# NOTE(review): both paths are absolute and machine-specific; adjust them to the local data layout.
cpc_file = Path("/data/breakthrough-patents/year_index/GPCPCs.txt")
patent_root = Path("/data/breakthrough-patents/test/")

# presumably debug_max_patents limits how many patents are used so that trial runs stay
# short -- TODO confirm against ModelHyperopt
hype = ModelHyperopt(
    year_start=1900,
    year_end=1904,
    window_size=2,
    n_jobs=1,
    preprocessors=None,
    cpc_fp=cpc_file,
    patent_dir=patent_root,
    debug_max_patents=1000,
)

In [None]:
# Optimize every embedder in turn (this can take quite a while!).
# Registry layout: label -> (embedder class, number of evaluations to run for it).
# NOTE(review): 'bpemp' looks like a typo for 'bpemb'; it is kept unchanged because the
# label is passed to hype.optimize and the same key is used to read results from hype.best.
models = {
    'tfidf': (TfidfEmbedder, 10),
    'd2v': (D2VEmbedder, 10),
    'countvec': (CountVecEmbedder, 10),
    'bpemp': (BPembEmbedder, 10),
    'bert': (BERTEmbedder, 10),
}

for label, (model, max_evals) in models.items():
    # Build the objective and the search space for this embedder before starting the search.
    objective = hype.get_objective_func(label=label, model=model)
    search_space = model.hyper_space()
    hype.optimize(
        label=label,
        objective_function=objective,
        space=search_space,
        max_evals=max_evals,
    )

In [None]:
# raw results: the best assignment found per model label, followed by the recorded trials
pprint(hype.best)
# NOTE(review): hype.best is indexed per label below, while hype.trials is accessed as a
# single object here -- confirm that this covers the trials of every optimized model.
for trial in hype.trials.trials:
    pprint(trial)

# print readable evaluation results
# Iterate over the `models` registry defined in the optimization cell instead of repeating
# one line per embedder, so the report cannot drift out of sync with what was optimized.
# space_eval resolves each stored best assignment against that embedder's search space.
for best_label, (embedder_cls, _n_evals) in models.items():
    pprint(space_eval(embedder_cls.hyper_space(), hype.best[best_label]))