In [27]:
import logging
logging.getLogger().setLevel(logging.ERROR)
from pathlib import Path
from docembedder.utils import SimulationSpecification
from docembedder.utils import run_models
from docembedder.models import TfidfEmbedder
from docembedder.models import D2VEmbedder
from docembedder.models import BERTEmbedder
from docembedder import DataModel
from docembedder.analysis import DocAnalysis

### Set data path

In [28]:
# All locations are relative to the notebook's working directory and are
# derived from data_dir, so the data root only has to be changed in one place.
data_dir = Path("data")
cpc_fp = data_dir / "GPCPCs.txt"
patent_dir = data_dir / "unprocessed"

# Results go to <data_dir>/results. Create the folder up front (idempotent) so
# that writing output_fp cannot fail on a fresh checkout just because the
# directory is missing.
output_dir = data_dir / "results"
output_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): the file is named "tfidf.h5" but is passed to run_models
# together with all configured models — consider a more generic name.
output_fp = output_dir / "tfidf.h5"

### Calculate embeddings
Embeddings are calculated using three different methods:
- TFIDF
- Doc2vec
- BERT

The input parameters for each model were selected through hyperparameter optimization.

In [29]:
# Embedders to run, keyed by a short label. One constructor argument per line
# keeps the tuned values easy to scan and to diff.
models = {
    "tfidf": TfidfEmbedder(
        ngram_max=1,
        stop_words="english",
        stem=True,
        norm="l1",
        sublinear_tf=True,
        min_df=6,
        max_df=0.665461,
    ),
    "doc2vec": D2VEmbedder(
        epoch=9,
        min_count=7,
        vector_size=101,
    ),
    "BERT": BERTEmbedder(
        pretrained_model="AI-Growth-Lab/PatentSBERTa",
    ),
}

In [31]:
%%time
sim_spec = SimulationSpecification(1946, 1948, window_size=4)
run_models(None, models, sim_spec, patent_dir, output_fp, cpc_fp)

### Calculate impacts and novelties

In [14]:
%%time
# Open the file at output_fp in writable mode (read_only=False) and compute
# patent impacts and novelties for every (window, model) pair it yields.
with DataModel(output_fp, read_only=False) as data:
    analysis = DocAnalysis(data)
    for window, model in data.iterate_window_models():
        # NOTE(review): `impact` and `novelty` are overwritten on every
        # iteration and not used in this cell, so only the last pair's values
        # remain afterwards. Presumably DocAnalysis stores each result in the
        # writable data file — confirm before relying on these variables.
        impact = analysis.patent_impacts(window, model)
        novelty = analysis.patent_novelties(window, model)