In [1]:
# This notebook illustrates the complete analysis process from preparing input files
# to calculating impact and novelty scores.

In [2]:
# 1. Preparing input files

# In its raw format, the input file contains the text of one patent file per line.
# Each line starts with a path pointing to that patent's original text 
# file ("/Volumes/External/txt/0000000-0100000/US1009.txt"), followed by the patent text. 

# The included input-file contains a small subset of the patent data, for test purposes only.
# The year file contains the year of publication of each patent.
# The CPC-file (Cooperative Patent Classification) contains patent classification codes.
# These codes are used to calculate benchmark similarities.

# Note: the included data files (raw_input.txt, year.csv, GPCPCs.txt) only contain a small
# subset of the original data, for example purposes only.

In [3]:
from pathlib import Path

# Input data shipped with the notebook.
input_file = Path("data", "raw_input.txt")  # raw patent texts, one patent per line
year_file = Path("data", "year.csv")        # year of publication per patent
cpc_fp = Path("data", "GPCPCs.txt")         # CPC classification codes

# Locations used by the later steps of the notebook.
patent_dir = Path("patents")                  # per-year .xz patent files
output_fp = Path("output") / "patents.h5"     # passed to the embedding runs as output file
results_fp = Path("results")                  # impact/novelty CSV files

In [None]:
from docembedder.preprocessor.parser import compress_raw

# The compressor function transforms the patents to a more manageable format,
# sorts them by year of publication, and compresses the files.

# Create the target directory *before* listing it: Path.iterdir() raises
# FileNotFoundError when the directory does not exist yet (e.g. on a fresh checkout).
patent_dir.mkdir(parents=True, exist_ok=True)

# Only compress when no .xz files are present, so re-running the cell is cheap.
has_xz_files = any(path.suffix == ".xz" for path in patent_dir.iterdir())
if not has_xz_files:
    print("Compressing raw files")
    compress_raw(input_file, year_file, patent_dir)
else:
    print(f"xz-files already present in '{patent_dir}'")

# You now have XZ-compressed files containing patents per year. Each file contains
# a list of JSON-objects, each JSON-object has the following key/values:

# - patent: patent's ID
# - file: path of original text file (not actually used)
# - contents: patent text
# - year: year of publication

In [1]:
# 2. Calculating embeddings

# Each model has its own preprocessor with various parameters. Most models also have
# configurable hyperparameters. The values for these parameters have been optimised
# using the original dataset, resulting in the values used in the compute_embeddings()-function below.

# To recalibrate preprocessor and model parameters, run each model's hyperopt-script. See the 
# hyperopt-notebooks (hyperopt/) for more details.

from docembedder.simspec import SimulationSpecification
from docembedder.models import TfidfEmbedder
from docembedder.preprocessor.preprocessor import Preprocessor
from docembedder.models.doc2vec import D2VEmbedder
from docembedder.models import CountVecEmbedder
from docembedder.models import BERTEmbedder

from docembedder.utils import run_models
from docembedder.pretrained_run import pretrained_run_models
import datetime

import polars as pl

def check_files(sim_spec, directory=None):
    """Verify that a compressed patent file exists for every simulated year.

    Args:
        sim_spec: Object exposing ``year_start`` and ``year_end``. Every year in
            ``range(year_start, year_end)`` is checked, so ``year_end`` itself
            is not included.
        directory: Directory expected to contain the ``<year>.xz`` files.
            Defaults to the notebook-level ``patent_dir``.

    Raises:
        ValueError: If the file for any year in the range is missing.
    """
    target_dir = patent_dir if directory is None else Path(directory)
    for year in range(sim_spec.year_start, sim_spec.year_end):
        if not (target_dir / f"{year}.xz").is_file():
            # The trailing space matters: adjacent string literals are joined
            # verbatim, and without it the message read "put it inthe right ...".
            raise ValueError(f"Please download patent file {year}.xz and put it in "
                             f"the right directory ({target_dir})")

def compute_embeddings_cv(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the count-vectorizer model through ``run_models``.

    Args:
        patent_dir: Patent directory, forwarded to ``run_models``.
        output_fp: Output file path, forwarded to ``run_models``.
        cpc_fp: CPC classification file path, forwarded to ``run_models``.
        sim_spec: Simulation specification (years and window settings).
        n_jobs: Number of concurrent jobs, forwarded to ``run_models``.
    """
    embedders = {"countvec": CountVecEmbedder(method='sigmoid')}
    preprocessors = {
        "prep-countvec": Preprocessor(
            keep_caps=False,
            keep_start_section=False,
            remove_non_alpha=True,
        ),
    }

    # NOTE: check_files(sim_spec) inspects the notebook-level patent directory,
    # not the `patent_dir` argument received here.
    check_files(sim_spec)
    run_models(
        preprocessors, embedders, sim_spec, patent_dir, output_fp, cpc_fp,
        n_jobs=n_jobs,
    )
    print('Calculated countvec emdeddings')

    
def compute_embeddings_tfidf(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the TF-IDF model through ``run_models``.

    Args:
        patent_dir: Patent directory, forwarded to ``run_models``.
        output_fp: Output file path, forwarded to ``run_models``.
        cpc_fp: CPC classification file path, forwarded to ``run_models``.
        sim_spec: Simulation specification (years and window settings).
        n_jobs: Number of concurrent jobs, forwarded to ``run_models``.
    """
    embedders = {
        "tfidf": TfidfEmbedder(
            ngram_max=1,
            stop_words='english',
            stem=True,
            norm='l1',
            sublinear_tf=True,
            min_df=6,
            max_df=0.665461,
        ),
    }
    preprocessors = {
        "prep-tfidf": Preprocessor(
            keep_caps=True,
            keep_start_section=True,
            remove_non_alpha=True,
        ),
    }

    # NOTE: check_files(sim_spec) inspects the notebook-level patent directory,
    # not the `patent_dir` argument received here.
    check_files(sim_spec)
    run_models(
        preprocessors, embedders, sim_spec, patent_dir, output_fp, cpc_fp,
        n_jobs=n_jobs,
    )
    print('Calculated tfidf emdeddings')

    
def compute_embeddings_doc2vec(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the Doc2Vec model through ``run_models``.

    Args:
        patent_dir: Patent directory, forwarded to ``run_models``.
        output_fp: Output file path, forwarded to ``run_models``.
        cpc_fp: CPC classification file path, forwarded to ``run_models``.
        sim_spec: Simulation specification (years and window settings).
        n_jobs: Number of concurrent jobs, forwarded to ``run_models``.
    """
    embedders = {"doc2vec": D2VEmbedder(epoch=9, min_count=7, vector_size=101)}
    preprocessors = {
        "prep-doc2vec": Preprocessor(
            keep_caps=False,
            keep_start_section=True,
            remove_non_alpha=False,
        ),
    }

    # NOTE: check_files(sim_spec) inspects the notebook-level patent directory,
    # not the `patent_dir` argument received here.
    check_files(sim_spec)
    run_models(
        preprocessors, embedders, sim_spec, patent_dir, output_fp, cpc_fp,
        n_jobs=n_jobs,
    )
    print('Calculated doc2vec emdeddings')

def compute_embeddings_bert(patent_dir, output_fp, cpc_fp, sim_spec, n_jobs):
    """Run the pretrained BERT model (PatentSBERTa) through ``pretrained_run_models``.

    Args:
        patent_dir: Patent directory, forwarded to ``pretrained_run_models``.
        output_fp: Output file path, forwarded to ``pretrained_run_models``.
        cpc_fp: CPC classification file path, forwarded to ``pretrained_run_models``.
        sim_spec: Simulation specification (years and window settings).
        n_jobs: Accepted so the signature matches the other ``compute_embeddings_*``
            functions; it is not forwarded to ``pretrained_run_models``.
    """
    embedders = {
        "bert": BERTEmbedder(pretrained_model='AI-Growth-Lab/PatentSBERTa'),
    }
    preprocessors = {
        "prep-bert": Preprocessor(
            keep_caps=True,
            keep_start_section=True,
            remove_non_alpha=True,
        ),
    }

    # NOTE: check_files(sim_spec) inspects the notebook-level patent directory,
    # not the `patent_dir` argument received here.
    check_files(sim_spec)
    pretrained_run_models(
        preprocessors, embedders, sim_spec, patent_dir, output_fp, cpc_fp
    )
    print('Calculated BERT emdeddings')

In [None]:
# Specification for the embedding calculations.
# year_end is exclusive: the last year that is processed is year_end - 1.
# For a quick test run, pass debug_max_patents=100 (or another small number)
# to restrict the number of patents per year.
sim_spec = SimulationSpecification(year_start=1870, year_end=1911, window_size=11, window_shift=1)

# Number of concurrent jobs to run. A higher number means faster processing,
# but be aware that each job utilises one CPU core.
jobs = 2

# Calculate embeddings using all four models: Countvec, TfIdf, Doc2Vec, BERT (PatentSBERTa).
# Be aware: depending on the amount of patents and the window size, this will take
# quite some time and can require a (very) large amount of memory.
# (Warnings from the Countvec calculations can be ignored.)
args = dict(
    patent_dir=patent_dir,
    output_fp=output_fp,
    cpc_fp=cpc_fp,
    sim_spec=sim_spec,
    n_jobs=jobs,
)
compute_embeddings_cv(**args)
compute_embeddings_tfidf(**args)
compute_embeddings_doc2vec(**args)
compute_embeddings_bert(**args)

In [None]:
# 3. Calculating impact and novelty scores

from docembedder.analysis import DocAnalysis
from docembedder.datamodel import DataModel
from collections import defaultdict

import pandas as pd

def compute_impacts(embedding_fp, output_dir, exponents=(1.0, 2.0, 3.0), n_jobs=8):
    """Compute impact and novelty scores per model and write them to CSV files.

    For every (window, model) pair in the embeddings file the scores are computed
    for each exponent and accumulated per model. One file named
    ``impact-<name>.csv`` is written per model, where ``<name>`` is the part of
    the model identifier after its last ``-``. Rows are sorted by ``patent_ids``.

    Args:
        embedding_fp: Path of the embeddings file opened with ``DataModel``.
        output_dir: Directory for the CSV files; created if it does not exist.
        exponents: Exponents passed to ``impact_novelty_results``. Each one
            yields an ``impact-<exponent>`` and a ``novelty-<exponent>`` column.
        n_jobs: Number of parallel jobs passed to ``impact_novelty_results``.
    """
    exponents = list(exponents)
    output_dir = Path(output_dir)

    # model name -> column name -> values accumulated over all windows
    impact_novel = defaultdict(lambda: defaultdict(list))

    with DataModel(embedding_fp, read_only=False) as data:
        analysis = DocAnalysis(data)

        for window, model in data.iterate_window_models():
            results = analysis.impact_novelty_results(
                window, model, exponents, cache=False, n_jobs=n_jobs)

            for expon, res in results.items():
                # Record the patent IDs once per window (for the first exponent
                # only), so the ID column has the same length as the score
                # columns. Presumably the IDs are the same for every exponent.
                if expon == exponents[0]:
                    impact_novel[model]["patent_ids"].extend(res["patent_ids"])
                impact_novel[model][f"impact-{expon}"].extend(res["impact"])
                impact_novel[model][f"novelty-{expon}"].extend(res["novelty"])

    output_dir.mkdir(exist_ok=True, parents=True)

    # Use a distinct loop variable: the original rebound `data`, shadowing the
    # (already closed) DataModel, and never used it.
    for model, columns in impact_novel.items():
        classifier_name = model.split("-")[-1]
        impact_fp = Path(output_dir, f"impact-{classifier_name}.csv")
        pd.DataFrame(columns).sort_values("patent_ids").to_csv(impact_fp, index=False)

In [None]:
# Compute novelty and impact scores from the embeddings file (output_fp) and
# write one CSV file per model to the results directory (results_fp).
compute_impacts(embedding_fp=output_fp, output_dir=results_fp)

In [None]:
# Absolute paths of everything found in the results directory.
[str(result_path.absolute()) for result_path in results_fp.iterdir()]