In [1]:
from pathlib import Path
from pprint import pprint
from docembedder.hyperopt.utils import ModelHyperopt
from docembedder.models import TfidfEmbedder
from docembedder.utils import SimulationSpecification
from docembedder.preprocessor import Preprocessor
from docembedder.preprocessor.oldprep import OldPreprocessor
from docembedder.classification import PatentClassification
from docembedder.analysis import _compute_cpc_cor
from multiprocessing import Pool
from docembedder.hyperopt.parallel import get_patent_data_multi
from configparser import ConfigParser
import pickle
import numpy as np

In [2]:
# Simulation settings shared by every hyperopt object created in this notebook.
sim_spec = SimulationSpecification(
    year_start=1838,
    year_end=1950,
    window_size=25,
    debug_max_patents=200,
)

# ConfigParser.read() silently skips files it cannot open and only reports the
# ones it parsed. Without this check a missing setup.ini would surface on the
# next line as an opaque KeyError for the "DATA" section.
config = ConfigParser()
loaded_files = config.read("setup.ini")
if not loaded_files:
    raise FileNotFoundError(
        "Could not read 'setup.ini' from the current working directory."
    )

# Path handed to ModelHyperopt as `trials` for the first TF-IDF model search.
hyper_fp = Path(config["DATA"]["hyper_dir"], "tfidf_trials.pkl")

In [3]:
# Model hyperparameter search object: the CPC file and patent directory are
# read from the [DATA] section of the config, and `hyper_fp` is passed as the
# trials location.
data_cfg = config["DATA"]
hype = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_fp,
)

In [4]:
# Run the TfidfEmbedder search under the "tfidf" label
# (max_evals=160, n_jobs=10).
hype.optimize(
    label="tfidf",
    model=TfidfEmbedder,
    max_evals=160,
    n_jobs=10,
)

In [5]:
# Keep the best "tfidf" model; later cells pass it to the preprocessor search.
best_model = hype.best_model("tfidf", TfidfEmbedder)
# Display the trial table for the "tfidf" run (the recorded output lists the
# sampled parameters per trial, ordered from lowest to highest loss).
hype.dataframe("tfidf", TfidfEmbedder)

Unnamed: 0,max_df,min_df,ngram_max,norm,stem,stop_words,sublinear_tf,loss
119,0.528471,9,1,l1,True,english,True,-0.157901
110,0.512683,7,1,l1,True,english,True,-0.157847
141,0.509183,8,1,l1,True,english,True,-0.157826
148,0.503484,8,1,l1,True,english,True,-0.157819
145,0.506128,8,1,l1,True,english,True,-0.157818
...,...,...,...,...,...,...,...,...
40,0.537952,1,2,,False,,True,-0.079427
99,0.603661,5,1,,True,english,True,-0.078720
82,0.715661,3,1,,True,english,True,-0.077535
8,0.944509,3,3,,False,,False,-0.076444


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from docembedder.hyperopt.utils import PreprocessorHyperopt

# Preprocessor search object, using the same data locations as the model
# search and its own trials file ("prep_tfidf.pkl").
data_cfg = config["DATA"]
hyper_prep_fp = Path(data_cfg["hyper_dir"], "prep_tfidf.pkl")
hype_prep = PreprocessorHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    trials=hyper_prep_fp,
)

In [7]:
# Search the Preprocessor settings with the best TF-IDF model, stored under
# the "normal" label (the recorded run covers 8 trials).
hype_prep.optimize(
    "normal",
    best_model,
    Preprocessor,
    n_jobs=8,
)

100%|███████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 157532.54it/s]


In [8]:
# Display the trial table of the "normal" preprocessor run: one row per
# combination of keep_caps / keep_start_section / remove_non_alpha with its loss.
hype_prep.dataframe("normal")

Unnamed: 0,keep_caps,keep_start_section,remove_non_alpha,loss
0,True,True,True,-0.159937
4,False,True,True,-0.159937
3,True,False,False,-0.157901
7,False,False,False,-0.157901
1,True,True,False,-0.157654
5,False,True,False,-0.157654
2,True,False,True,-0.156188
6,False,False,True,-0.156188


In [9]:
# Evaluate OldPreprocessor with the same best TF-IDF model, under the "old"
# label (the recorded run consists of a single trial).
hype_prep.optimize(
    "old",
    best_model,
    OldPreprocessor,
    n_jobs=8,
)

100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 31536.12it/s]


In [10]:
# Display the trial table of the "old" run (a single row holding its loss).
hype_prep.dataframe("old")

Unnamed: 0,loss
0,-0.160009


In [11]:
# Second model search: same data locations, but with the best preprocessor
# from the "normal" run and a separate trials file ("tfidf_trials_2.pkl").
data_cfg = config["DATA"]
hyper_fp_2 = Path(data_cfg["hyper_dir"], "tfidf_trials_2.pkl")

hype_2 = ModelHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
    preprocessor=hype_prep.best_preprocessor("normal", Preprocessor),
    trials=hyper_fp_2,
)

In [12]:
# Run the TfidfEmbedder search again under the "tfidf2" label, now on top of
# the chosen preprocessor (max_evals=160, n_jobs=9).
hype_2.optimize(
    label="tfidf2",
    model=TfidfEmbedder,
    max_evals=160,
    n_jobs=9,
)

In [13]:
# Display the first 10 rows of the "tfidf2" trial table (the recorded output
# starts at the lowest loss).
hype_2.dataframe("tfidf2", TfidfEmbedder).head(10)

Unnamed: 0,max_df,min_df,ngram_max,norm,stem,stop_words,sublinear_tf,loss
69,0.665461,6,1,l1,False,english,True,-0.160155
68,0.666169,6,1,l1,False,english,True,-0.160149
155,0.760838,6,1,l1,False,english,True,-0.160139
70,0.658734,6,1,l1,False,english,True,-0.160129
148,0.659343,6,1,l1,False,english,True,-0.160128
73,0.662721,6,1,l1,False,english,True,-0.160115
57,0.75399,6,1,l1,False,english,True,-0.160107
145,0.673232,6,1,l1,False,english,True,-0.160094
42,0.684051,7,1,l1,False,english,True,-0.160089
95,0.671353,6,1,l1,False,english,True,-0.160078


In [14]:
# Fresh preprocessor search object for re-checking the preprocessor settings
# against the "tfidf2" model.
# NOTE(review): unlike the earlier hyperopt objects in this notebook, no
# `trials` argument is passed here. Presumably the results of the searches run
# on this object (each recorded at over an hour) are not kept on disk; confirm
# that this is intended.
data_cfg = config["DATA"]
new_hype = PreprocessorHyperopt(
    sim_spec=sim_spec,
    cpc_fp=Path(data_cfg["cpc_file"]),
    patent_dir=Path(data_cfg["patent_dir"]),
)

In [15]:
# Preprocessor search with the best "tfidf2" model, passing a lexicon file
# along with the search; results go under the "normal" label.
# NOTE(review): the lexicon path is a hard-coded relative path, whereas the
# other data locations come from setup.ini.
new_hype.optimize(
    "normal",
    hype_2.best_model("tfidf2", TfidfEmbedder),
    Preprocessor,
    n_jobs=8,
    lexicon_path="../data/unigram_freq.csv",
)

100%|████████████████████████████████████████████████████████████████| 8/8 [1:09:43<00:00, 522.98s/it]


In [23]:
# Display only the loss column of the "normal" run (the run that was given a
# lexicon_path).
new_hype.dataframe("normal").loss

0   -0.160205
4   -0.160205
3   -0.156103
7   -0.156103
1   -0.155361
5   -0.155361
2   -0.152430
6   -0.152430
Name: loss, dtype: float64

In [17]:
# Same preprocessor search as the "normal" run but without a lexicon_path,
# stored under the separate "normal-no-dict" label for comparison.
new_hype.optimize(
    "normal-no-dict",
    hype_2.best_model("tfidf2", TfidfEmbedder),
    Preprocessor,
    n_jobs=8,
)

100%|████████████████████████████████████████████████████████████████| 8/8 [1:06:47<00:00, 500.99s/it]


In [22]:
# Display only the loss column of the "normal-no-dict" run, for comparison
# with the losses of the "normal" run.
new_hype.dataframe("normal-no-dict").loss

0   -0.160155
4   -0.160155
3   -0.156219
7   -0.156219
1   -0.155412
5   -0.155412
2   -0.152397
6   -0.152397
Name: loss, dtype: float64