In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import datetime

In [42]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve
from src.utils import convert_date

# Root directory for the notebook's input/output files. Later cells build paths by
# plain string concatenation (e.g. data_folder + 'result/final_results.parquet'),
# so the value must end with a path separator.
data_folder = '/Path/To/Data/Folder/'

In [None]:
import nltk
# Fetch the NLTK resources needed further down: the English stop-word list and the
# tokenizer models behind `word_tokenize`.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' models, so both are requested; on an installation that does not know one
# of the identifiers, `nltk.download` just reports the failure and returns False.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [44]:
# Placeholder: replace None with a pandas DataFrame of labelled patent/paper pairs.
# The cells below read the columns listed in `features`, a "match" label column and
# the text columns 'patent_title', 'patent_abstract', 'work_title', 'work_abstract'.
matches = None # Load your labeled match data here

## Training

In [None]:
# Columns of `matches` used as classifier inputs; the order here is the order of
# the fitted coefficients.
features = [
    "inventor_score",
    "doi_overlap_score",
    "overlap_score_titleabstract_mean",
    "semantic_score_titleabstract_sbert_mean",
]

In [59]:
# Feature matrix (one column per entry of `features`) and label vector as NumPy arrays.
x = matches.loc[:, features].to_numpy()
y = matches.loc[:, "match"].to_numpy()

In [None]:
from sklearn.metrics import precision_recall_curve, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import numpy as np
import matplotlib.pyplot as plt

# Five-fold stratified cross-validation of the logistic-regression matcher.
#
# For every fold the precision / recall / F1 curves are sampled on a shared
# threshold grid so they can be averaged across folds; accuracy is computed from
# the default `predict` output.

# Per-fold curves (one array per fold) and per-fold accuracies
precisions = []
recalls = []
f1_scores = []
accuracies = []

# 501 evenly spaced points give a grid step of 0.002, so the thresholds reported
# below (0.5 and 0.6) are actual grid points.
common_thresholds = np.linspace(0, 1, 501)

# Fixed seed: with shuffle=True and no random_state the folds (and therefore every
# number printed below) would change on each run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_indices, test_indices in cv.split(x, y):
    x_train, x_test = x[train_indices], x[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    # Fit the scaler on the training fold only so no test information leaks in
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    y_pred_proba = lr.predict_proba(x_test)[:, 1]
    y_pred = lr.predict(x_test)

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

    # F1 is 0/0 where precision and recall are both zero; report 0 there rather
    # than letting a NaN propagate into the fold averages. The array is called
    # `f1_curve` so it does not overwrite sklearn's `f1_score` function.
    pr_sum = precision + recall
    f1_curve = np.divide(
        2 * precision * recall,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=pr_sum > 0,
    )

    # precision[i] / recall[i] describe the decision rule "score >= thresholds[i]"
    # and the extra final element (precision 1, recall 0) is the "predict nothing"
    # end point. The curves are step functions of the threshold, so for a grid
    # value t the exact score is the one at the first threshold >= t (or the end
    # point when t exceeds every score) - no interpolation needed.
    curve_idx = np.searchsorted(thresholds, common_thresholds, side="left")
    precisions.append(precision[curve_idx])
    recalls.append(recall[curve_idx])
    f1_scores.append(f1_curve[curve_idx])

    # Add accuracy score
    accuracies.append(accuracy_score(y_test, y_pred))

# Convert lists to arrays (shape: n_folds x n_thresholds) for ease of calculation
precisions = np.array(precisions)
recalls = np.array(recalls)
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)

# Mean and standard deviation across folds
precision_mean = np.mean(precisions, axis=0)
recall_mean = np.mean(recalls, axis=0)
f1_scores_mean = np.mean(f1_scores, axis=0)
accuracy_mean = np.mean(accuracies)

precision_std = np.std(precisions, axis=0)
recall_std = np.std(recalls, axis=0)
f1_scores_std = np.std(f1_scores, axis=0)
accuracy_std = np.std(accuracies)

# Print average accuracy and its standard deviation
print("Five-fold cross-validation results:")
print(f'Average Accuracy: {accuracy_mean:.2f} (+/- {accuracy_std:.2f})')

# Print precision, recall and F1 score at the reported thresholds. The grid index
# is looked up from the threshold value instead of being hard-coded.
for report_threshold in (0.5, 0.6):
    grid_idx = int(np.argmin(np.abs(common_thresholds - report_threshold)))
    print(f"\nScores at threshold {report_threshold}:")
    print(f'Precision: {precision_mean[grid_idx]:.4f} +/- {precision_std[grid_idx]:.4f}')
    print(f'Recall: {recall_mean[grid_idx]:.4f} +/- {recall_std[grid_idx]:.4f}')
    print(f'F1 Score: {f1_scores_mean[grid_idx]:.4f} +/- {f1_scores_std[grid_idx]:.4f}')

In [None]:
# Show the fitted weights next to the feature names they belong to.
# NOTE(review): `lr` here is whichever model the cross-validation loop fitted last,
# i.e. it was trained on a single fold's training split - confirm that is intended.
print("These are the coefficients", features, sep="\n")
(lr.coef_, lr.intercept_)

## Comparison to other methods

In [67]:
from typing import List
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class Lissoni_2013_Model:
    """Baseline matcher based on binary bag-of-words cosine similarity.

    `fit` learns a vocabulary from all training documents and stores the cosine
    similarity of every (patent, paper) training pair. `predict` flags a new pair
    as a match when its similarity reaches a percentile of those training scores.
    """

    def __init__(self):
        self.vectorizer = CountVectorizer(binary=True, stop_words='english')
        # Similarity cut-off used by the most recent `predict` call (0 until then).
        self.threshold = 0
        # Pairwise training similarities; None until `fit` has been called.
        self.train_scores = None
        # threshold_percentile -> cut-off, so repeated `predict` calls with the same
        # percentile do not recompute the percentile of the training scores.
        self._threshold_cache = {}

    def fit(self, patent_documents: List[str], paper_documents: List[str]):
        """Fit the vocabulary and score every aligned training pair.

        Args:
            patent_documents: Patent texts; entry i is paired with paper entry i.
            paper_documents: Paper texts, same length as `patent_documents`.

        Raises:
            ValueError: If the two lists differ in length (pairs would otherwise be
                silently dropped).
        """
        if len(patent_documents) != len(paper_documents):
            raise ValueError(
                "patent_documents and paper_documents must have the same length, got "
                f"{len(patent_documents)} and {len(paper_documents)}"
            )

        all_docs = list(patent_documents) + list(paper_documents)
        self.vectorizer.fit(all_docs)

        # Keep the document-term matrices sparse: densifying them allocates
        # n_documents x vocabulary_size cells, and cosine_similarity accepts sparse input.
        patent_vectors = self.vectorizer.transform(patent_documents)
        paper_vectors = self.vectorizer.transform(paper_documents)
        self.train_scores = [
            cosine_similarity(patent_vectors[row], paper_vectors[row])[0][0]
            for row in range(patent_vectors.shape[0])
        ]
        # Cut-offs derived from a previous fit are no longer valid.
        self._threshold_cache = {}

    def predict(self, patent_doc, paper_doc, threshold_percentile=0.1):
        """Return whether a (patent, paper) pair is similar enough to count as a match.

        Args:
            patent_doc: Patent text.
            paper_doc: Paper text.
            threshold_percentile: Fraction in [0, 1]; the cut-off is the
                (100 - 100 * threshold_percentile)-th percentile of the training
                scores, so 0.1 means "at least as similar as the top 10% of training pairs".

        Returns:
            Truthy value: similarity >= cut-off.

        Raises:
            RuntimeError: If called before `fit`.
        """
        if self.train_scores is None:
            raise RuntimeError("Lissoni_2013_Model.predict called before fit")

        threshold = self._threshold_cache.get(threshold_percentile)
        if threshold is None:
            threshold = np.percentile(self.train_scores, 100 - threshold_percentile * 100)
            self._threshold_cache[threshold_percentile] = threshold
        self.threshold = threshold

        patent_vector = self.vectorizer.transform([patent_doc])
        paper_vector = self.vectorizer.transform([paper_doc])
        similarity = cosine_similarity(patent_vector, paper_vector)[0][0]
        return similarity >= threshold


In [68]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

class Magerman_2015_Model:
    """Baseline matcher based on shared stemmed words between two documents.

    A pair is a match when the number of distinct shared stems, divided by the
    token count of the shorter document and by the token count of the longer
    document, reaches the respective threshold.
    """

    def __init__(self) -> None:
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def _stemmed_tokens(self, document):
        """Tokenize `document`, drop stop words and stem the remaining tokens."""
        # NOTE(review): the stop-word test runs on the raw token before stemming, so
        # it is case-sensitive - presumably capitalised stop words slip through; confirm.
        return [
            self.stemmer.stem(token)
            for token in word_tokenize(document)
            if token not in self.stop_words
        ]

    def predict(self, patent_doc, paper_doc, overlap_min_threshold=0.6, overlap_max_threshold=0.3):
        """Return True when the two documents share enough stemmed words.

        Args:
            patent_doc: Patent text.
            paper_doc: Paper text.
            overlap_min_threshold: Minimum overlap relative to the shorter document.
            overlap_max_threshold: Minimum overlap relative to the longer document.

        Returns:
            bool: True if both overlap ratios reach their thresholds. A pair in which
            either document has no tokens left after stop-word removal is never a
            match (previously this raised ZeroDivisionError).
        """
        patent_tokens = self._stemmed_tokens(patent_doc)
        paper_tokens = self._stemmed_tokens(paper_doc)

        # Nothing to compare against: avoid dividing by a zero token count.
        if not patent_tokens or not paper_tokens:
            return False

        # Number of distinct stems present in both documents
        common_words = len(set(patent_tokens) & set(paper_tokens))

        shorter_len, longer_len = sorted((len(patent_tokens), len(paper_tokens)))
        # Overlap over the number of words in the doc with the smaller number of words
        overlap_minimum = common_words / shorter_len
        # Overlap over the number of words in the doc with the larger number of words
        overlap_maximum = common_words / longer_len
        return overlap_minimum >= overlap_min_threshold and overlap_maximum >= overlap_max_threshold



In [69]:
# Calculating precision, recall and F1 score for the different models
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Instantiate the two text-overlap baselines.
lissoni_model, magerman_model = Lissoni_2013_Model(), Magerman_2015_Model()

# One fixed 80/20 split over row positions of `matches`.
train_indices, test_indices = train_test_split(
    range(len(matches)),
    test_size=0.2,
    random_state=42,
)

# Text columns of the training rows
train_rows = matches.iloc[train_indices]
train_patent_titles = train_rows['patent_title'].to_list()
train_patent_abstracts = train_rows['patent_abstract'].to_list()
train_paper_titles = train_rows['work_title'].to_list()
train_paper_abstracts = train_rows['work_abstract'].to_list()

# A "document" is the title followed by the abstract, separated by one space.
train_patent_documents = [
    patent_title + " " + patent_abstract
    for patent_title, patent_abstract in zip(train_patent_titles, train_patent_abstracts)
]
train_paper_documents = [
    paper_title + " " + paper_abstract
    for paper_title, paper_abstract in zip(train_paper_titles, train_paper_abstracts)
]

# Only the Lissoni baseline has a training step.
lissoni_model.fit(train_patent_documents, train_paper_documents)

In [None]:
# Compare the two baselines (default thresholds) with the logistic regression on
# the held-out 20% of `matches`.
test_rows = matches.iloc[test_indices]
test_labels = matches['match'].iloc[test_indices]

# Title + abstract documents for every held-out pair
test_patent_documents = [
    title + " " + abstract
    for title, abstract in zip(test_rows['patent_title'], test_rows['patent_abstract'])
]
test_paper_documents = [
    title + " " + abstract
    for title, abstract in zip(test_rows['work_title'], test_rows['work_abstract'])
]

lissoni_preds = []
magerman_preds = []
for patent_document, paper_document in zip(test_patent_documents, test_paper_documents):
    lissoni_preds.append(lissoni_model.predict(patent_document, paper_document))
    magerman_preds.append(magerman_model.predict(patent_document, paper_document))

# Logistic regression on standardised features; the scaler is fitted on the
# training rows only.
scaler = StandardScaler()
x_train = scaler.fit_transform(matches[features].iloc[train_indices].to_numpy())
x_test = scaler.transform(matches[features].iloc[test_indices].to_numpy())
linear_model = LogisticRegression()
linear_model.fit(x_train, matches["match"].iloc[train_indices].to_numpy())
model_preds = linear_model.predict(x_test).tolist()

assert len(lissoni_preds) == len(magerman_preds) == len(model_preds) == len(test_indices)

print("Results of one 80%train 20%test split with comparison against Lissoni and Magerman models")

for heading, preds in (
    ("\nLissoni Model", lissoni_preds),
    ("\nMagerman Model", magerman_preds),
    ("\nLinear Model", model_preds),
):
    print(heading)
    # Share of held-out pairs the model labels as a match
    print("Prediction percent: ", sum(preds) / len(preds))
    print("Precision, Recall, F1 Score: ", precision_recall_fscore_support(test_labels, preds, average='binary'))


In [None]:
# Same comparison as above, but with the baseline thresholds loosened so that both
# baselines flag roughly as many pairs as the logistic regression does.
test_rows = matches.iloc[test_indices]
test_labels = matches['match'].iloc[test_indices]

# Title + abstract documents for every held-out pair
test_patent_documents = [
    title + " " + abstract
    for title, abstract in zip(test_rows['patent_title'], test_rows['patent_abstract'])
]
test_paper_documents = [
    title + " " + abstract
    for title, abstract in zip(test_rows['work_title'], test_rows['work_abstract'])
]

lissoni_preds = []
magerman_preds = []
for patent_document, paper_document in zip(test_patent_documents, test_paper_documents):
    lissoni_preds.append(
        lissoni_model.predict(patent_document, paper_document, threshold_percentile=0.45)
    )
    magerman_preds.append(
        magerman_model.predict(
            patent_document,
            paper_document,
            overlap_min_threshold=0.11,
            overlap_max_threshold=0.06,
        )
    )

# Logistic regression on standardised features; the scaler is fitted on the
# training rows only.
scaler = StandardScaler()
x_train = scaler.fit_transform(matches[features].iloc[train_indices].to_numpy())
x_test = scaler.transform(matches[features].iloc[test_indices].to_numpy())
linear_model = LogisticRegression()
linear_model.fit(x_train, matches["match"].iloc[train_indices].to_numpy())
model_preds = linear_model.predict(x_test).tolist()

assert len(lissoni_preds) == len(magerman_preds) == len(model_preds) == len(test_indices)

print("Results of one 80%train 20%test split with comparison against Lissoni and Magerman models")
print("Here, we adjusted the Lissoni and Magerman thresholds to match the prediction rate of the linear model.")

reports = (
    ("Lissoni Model", lissoni_preds),
    ("Magerman Model", magerman_preds),
    ("Linear Model", model_preds),
)
for position, (heading, preds) in enumerate(reports):
    if position > 0:
        # Blank line between consecutive model reports
        print()
    print(heading)
    # Share of held-out pairs the model labels as a match
    print("Prediction percent: ", sum(preds) / len(preds))
    print("Precision, Recall, F1 Score: ", precision_recall_fscore_support(test_labels, preds, average='binary'))

## P values

In [None]:
import numpy as np
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression

print("Order is: intercept, coef1 ,coef2, coef3, coef4")

def logit_p_value(model, x):
    """Two-sided Wald p-values for a fitted binary logistic-regression model.

    The covariance of the estimates is taken as the inverse of
    X^T diag(p * (1 - p)) X, where X is `x` with a leading column of ones and p
    are the predicted positive-class probabilities.

    Args:
        model: Fitted classifier exposing `predict_proba`, `coef_` (shape
            (1, n_features)) and `intercept_` (shape (1,)).
        x: 2-D array-like of shape (n_samples, n_features) the model was fitted on.

    Returns:
        1-D array of p-values ordered as: intercept, then one per feature.

    Raises:
        numpy.linalg.LinAlgError: If the information matrix is singular.

    Note:
        NOTE(review): these are the standard errors of an unpenalised maximum
        likelihood fit. If `model` was trained with regularisation the p-values
        are only approximate - confirm the estimator settings.
    """
    probabilities = model.predict_proba(x)
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    print(f'Coefficients: {coefs}')

    # Design matrix with an intercept column, as a plain ndarray (np.matrix is
    # discouraged by NumPy and not needed here).
    design = np.insert(np.asarray(x, dtype=float), 0, 1, axis=1)

    # Per-sample weight p * (1 - p); the weighted cross-product below equals the
    # sum over samples of w_i * x_i x_i^T without a Python-level loop.
    weights = probabilities[:, 1] * probabilities[:, 0]
    information = design.T @ (design * weights[:, np.newaxis])

    vcov = np.linalg.inv(information)
    se = np.sqrt(np.diag(vcov))
    z_scores = coefs / se
    # Survival function instead of 1 - cdf: the subtraction rounds to exactly 0
    # for large |z|, which would report a p-value of 0.
    return 2 * norm.sf(np.abs(z_scores))

# Fit the model whose coefficients are tested on the 80/20 training rows.
# Features and labels are both taken from `matches` via `train_indices` here:
# the notebook-level `x_train` comes from the 80/20 split while `y_train` is left
# over from the last cross-validation fold, so pairing those two would fit the
# model on misaligned rows and labels.
p_value_scaler = StandardScaler()
p_value_x = p_value_scaler.fit_transform(matches[features].iloc[train_indices].to_numpy())
p_value_y = matches["match"].iloc[train_indices].to_numpy()

model = LogisticRegression().fit(p_value_x, p_value_y)
values = logit_p_value(model, p_value_x)
for value in values:
    print('{:f}'.format(value))

In [None]:
# Candidate patent/paper pairs to classify, read from the results folder.
potential_matches_path = f"{data_folder}result/final_results.parquet"
potential_matches = pd.read_parquet(potential_matches_path)

## Predict all potential matches

In [None]:
# Only candidates with every feature present can be scored.
all_potential_matches = potential_matches.dropna(subset=features)

# Fit a dedicated scaler/classifier pair for the final predictions. The
# notebook-level `lr` and `scaler` cannot be combined: `lr` is the model from the
# last cross-validation fold, whereas `scaler` has since been refitted on the
# 80/20 training split, so the inputs would be standardised differently from what
# the model was trained on. The final model is trained on all labelled pairs.
final_scaler = StandardScaler()
labelled_x = final_scaler.fit_transform(matches[features].to_numpy())
final_model = LogisticRegression().fit(labelled_x, matches["match"].to_numpy())

all_pred_probas = final_model.predict_proba(
    final_scaler.transform(all_potential_matches[features].to_numpy())
)

# Threshold for positive matches
threshold = 0.5

# Column 1 of predict_proba is the probability of the positive ("match") class
all_preds = np.where(all_pred_probas[:, 1] > threshold, 1, 0)
print(all_preds.shape)
pd.Series(all_preds).value_counts()

In [None]:
# Attach the predicted class to each candidate pair. `assign` returns a new frame,
# which avoids writing into the result of `dropna` (SettingWithCopyWarning) and
# keeps the cell safe to re-run.
all_potential_matches = all_potential_matches.assign(model_classification=all_preds)

todays_date = datetime.datetime.now().date().strftime("%Y-%m-%d")

# f-string so the date is actually substituted; without the prefix the file was
# written with a literal "{todays_date}" in its name.
export_path = f"{data_folder}result/excels/final_results_model_classification_{todays_date}.xlsx"
all_potential_matches[["patent_id", "work_id", "model_classification"]] \
    .to_excel(export_path, index=False)
