In [1]:
import logging
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from tqdm import tqdm

from docembedder import BERTEmbedder, TfidfEmbedder
from docembedder.classification import PatentClassification
from docembedder.preprocessor.preprocessor import Preprocessor

In [2]:
# Input JSONL file holding the unprocessed patent texts.
# NOTE(review): the file name contains U+F026, a private-use code point —
# verify that it matches the name of the file on disk.
raw_patent_file = "../data/unprocessed/0000000-0100000\uf026GP_all_text_4.jsonl"
fp = Path(raw_patent_file)

In [3]:
# Preprocessor used for the raw patent file; logging.ERROR is passed as its log level
# (presumably to silence messages below ERROR — confirm in Preprocessor).
prep = Preprocessor(log_level=logging.ERROR)

In [4]:
%%time

# Run the preprocessor over the raw file. It returns the patent records together
# with a summary of how many entries were processed or skipped.
patents, prep_stats = prep.preprocess_file(fp)

CPU times: user 44.1 s, sys: 0 ns, total: 44.1 s
Wall time: 44.1 s


In [5]:
# Display the preprocessing summary (processed vs. skipped counts).
prep_stats

{'processed': 14639, 'skipped_empty': 361, 'skipped_no_year': 0}

In [6]:
# The two embedding models compared in this notebook, both built with their default arguments.
model = BERTEmbedder()
tfidf_model = TfidfEmbedder()

In [7]:
%%time

# Number of patents used in the comparison. The pairwise steps further down grow
# quadratically with this value, so keep it small.
N_DOCUMENTS = 1000

# Slice first, then extract the text, so only the selected patents are touched
# (assumes `patents` supports slicing like a list — TODO confirm).
documents = [patent["contents"] for patent in patents[:N_DOCUMENTS]]
embeddings = model.transform(documents)

CPU times: user 49 s, sys: 549 ms, total: 49.6 s
Wall time: 43.7 s


In [8]:
%%time
# Fit the TF-IDF model on the same documents it is then asked to embed.
tfidf_model.fit(documents)
tfidf_embeddings = tfidf_model.transform(documents)

CPU times: user 457 ms, sys: 32 µs, total: 457 ms
Wall time: 457 ms


In [9]:
# Pairwise dot products between all document embeddings, one matrix per model.
# NOTE(review): a dot product equals cosine similarity only for unit-length
# vectors — confirm that both embedders normalise their output.
cross_cor = np.dot(embeddings, embeddings.T)
cross_cor_tfidf = tfidf_embeddings.dot(tfidf_embeddings.T)

In [10]:
%%time

pc = PatentClassification("../data/GPCPCs.txt")
n_documents = len(documents)
class_matrix = np.zeros((n_documents, n_documents))

# Look up every patent ID once instead of once per pair. `documents` was built
# from the leading entries of `patents`, so index i refers to the same patent in both.
patent_ids = [patents[i_patent]["patent"] for i_patent in range(n_documents)]

# Fill only the strict upper triangle (each unordered pair once); the diagonal
# and the lower triangle keep their initial zeros.
for i_patent in range(n_documents):
    patent_id_i = patent_ids[i_patent]
    for j_patent in range(i_patent + 1, n_documents):
        class_matrix[i_patent, j_patent] = pc.get_similarity(patent_id_i, patent_ids[j_patent])

CPU times: user 1min 42s, sys: 513 ms, total: 1min 43s
Wall time: 1min 42s


In [11]:
# Indices of the strict upper triangle (k=1 excludes the diagonal), i.e. every
# unordered document pair exactly once. Computed once and shared by all three matrices.
pair_indices = np.triu_indices(len(documents), k=1)

class_cor = class_matrix[pair_indices]
model_cor = cross_cor[pair_indices]
# The indexed TF-IDF product is converted to an ndarray and flattened to 1-D
# (presumably because it comes from a scipy sparse matrix — confirm).
tfidf_cor = np.asarray(cross_cor_tfidf[pair_indices]).flatten()

In [12]:
# Spearman rank correlation between the classification-based pair similarities
# and each model's pair similarities (BERT first, TF-IDF second).
bert_vs_class = stats.spearmanr(class_cor, model_cor)
tfidf_vs_class = stats.spearmanr(class_cor, tfidf_cor)
bert_vs_class, tfidf_vs_class

(SpearmanrResult(correlation=0.03405715501931589, pvalue=4.3621724684332553e-128),
 SpearmanrResult(correlation=0.11892645848988091, pvalue=0.0))