In [1]:
import logging
logging.getLogger().setLevel(logging.ERROR)
from pathlib import Path
from docembedder.utils import SimulationSpecification
from docembedder.utils import run_models
from docembedder.models import CountVecEmbedder
from docembedder.models import TfidfEmbedder
from docembedder.models import D2VEmbedder
from docembedder.models import BERTEmbedder
from docembedder import DataModel
from docembedder.analysis import DocAnalysis
from configparser import ConfigParser

In [2]:
# Simulation specification: the year range to cover and the sliding-window
# settings, bundled into a SimulationSpecification that is handed to run_models.
# NOTE(review): if window_size is expressed in years, 21 is larger than the
# 1838-1850 span configured here — confirm this is intended.
sim_spec = SimulationSpecification(
    year_start=1838, year_end=1850,
    window_size=21, window_shift=1,
)

In [3]:
# Read machine-local paths from setup.ini (resolved relative to the current
# working directory). The [DATA] section must provide patent_dir, output_dir
# and cpc_file.
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means setup.ini was not found.
# Fail here with a clear message instead of a later KeyError on config["DATA"].
if not config.read("setup.ini"):
    raise FileNotFoundError(
        "Could not read 'setup.ini' from the current working directory; "
        "create it with a [DATA] section defining patent_dir, output_dir and cpc_file."
    )
patent_dir = Path(config["DATA"]["patent_dir"])
# The output file is always named patents.h5 inside the configured output_dir.
output_fp = Path(config["DATA"]["output_dir"], "patents.h5")
cpc_fp = Path(config["DATA"]["cpc_file"])


### Calculate embeddings
Embeddings are calculated using four different methods:
- CountVec
- Tfidf
- Doc2vec
- BERT

The input parameters for each model were selected through hyperparameter optimization.

In [4]:
# Mapping from model name to a configured embedder instance; the whole mapping
# is passed to run_models.
models = {
    "countvec": CountVecEmbedder(method="sigmoid"),
    "tfidf": TfidfEmbedder(
        ngram_max=1,
        stop_words="english",
        stem=True,
        norm="l1",
        sublinear_tf=True,
        min_df=6,
        max_df=0.665461,
    ),
    "doc2vec": D2VEmbedder(epoch=9, min_count=7, vector_size=101),
    "BERT": BERTEmbedder(pretrained_model="AI-Growth-Lab/PatentSBERTa"),
}

In [8]:
%%time
# Run every embedder in `models` through run_models, passing the simulation
# specification plus the patent directory, output file and CPC file paths that
# were read from setup.ini. %%time reports how long the whole cell takes.
# NOTE(review): the meaning of the leading None argument is not visible in this
# notebook — confirm against the run_models signature.
run_models(None, models, sim_spec, patent_dir, output_fp, cpc_fp)

In [9]:
# Calculate impacts and novelties
%%time
with DataModel(output_fp, read_only=False) as data:
    analysis = DocAnalysis(data)
    for window, model in data.iterate_window_models():
        impact = analysis.patent_impacts(window, model)
        novelty = analysis.patent_novelties(window, model)