In [1]:
import pandas as pd
import json
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import numpy as np
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from utils import load_json_data, get_mapping_dict, create_corpus, get_true_and_predicted, mean_average_precision

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Raw JSON inputs: the citing patents (train and test splits), the non-citing
# patent collection, and the training citation mapping.
json_citing_train = load_json_data(
    "./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json"
)
json_citing_test = load_json_data(
    "./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json"
)
json_nonciting = load_json_data(
    "./datasets/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json"
)
json_citing_to_cited = load_json_data(
    "./datasets/Citation_JSONs/Citation_Train.json"
)

# DataFrame views of the loaded records (the test split is kept as raw JSON).
citing_dataset_df, nonciting_dataset_df, mapping_dataset_df = (
    pd.DataFrame(records)
    for records in (json_citing_train, json_nonciting, json_citing_to_cited)
)

# Lookup built from the citation table; later cells index it by citing patent id.
mapping_dict = get_mapping_dict(mapping_dataset_df)

In [3]:
# =============================
# DATASET CLASS FOR PATENT PAIRS
# =============================

class PatentPairDataset(Dataset):
    """
    PyTorch Dataset over labelled patent text pairs.

    Each element of ``pairs`` is a tuple ``(citing_text, cited_text, label)``.
    Indexing tokenizes the two texts together as a single sequence pair and
    returns the tokenizer's output fields plus a ``labels`` tensor.

    Args:
        pairs: Indexable, sized collection of ``(citing_text, cited_text, label)``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) accepting
            two texts plus the keyword arguments used in ``__getitem__`` and
            returning a mapping of field name -> tensor.
        max_length: Length every encoded pair is truncated / padded to.
    """
    def __init__(self, pairs, tokenizer, max_length=512):
        self.pairs = pairs  # List of tuples: (citing_text, cited_text, label)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """
        Tokenize the pair at ``idx``.

        Returns:
            dict: every field produced by the tokenizer with its leading
            batch dimension removed, plus ``'labels'`` as a ``torch.long``
            scalar tensor.
        """
        citing_text, cited_text, label = self.pairs[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            citing_text,
            cited_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        # Squeeze the batch dimension so the DataLoader can re-batch samples.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [4]:
# -----------------------------
# 2. Create Corpora
# -----------------------------
# The second argument selects the text field; 'title' is used here and can be
# swapped for another field (abstract, claims, etc.).
citing_corpus = create_corpus(json_citing_train, 'title')
nonciting_corpus = create_corpus(json_nonciting, 'title')

# Map each non-citing document id to its text for direct lookup.
nonciting_lookup = dict(
    (entry['id'], entry['text']) for entry in nonciting_corpus
)

Number of documents without title: 0
Number of documents without title: 0


In [None]:
# Rebuild both corpora from the 'fulltext' field (this replaces any corpora
# created earlier from another field such as 'title').
citing_corpus = create_corpus(json_citing_train, 'fulltext')
nonciting_corpus = create_corpus(json_nonciting, 'fulltext')

# Map each non-citing document id to its text for direct lookup.
nonciting_lookup = dict(
    (entry['id'], entry['text']) for entry in nonciting_corpus
)

# -----------------------------
# 3. Build Training Pairs
# -----------------------------
# Seed Python's RNG so the negative sampling and the shuffle below produce the
# same pairs every time this cell is run.
PAIR_SAMPLING_SEED = 42
random.seed(PAIR_SAMPLING_SEED)

# Positive pairs (label 1): one per citation listed in mapping_dict whose cited
# id has text available in nonciting_lookup.
positive_pairs = []
for citing_doc in citing_corpus:
    citing_id = citing_doc['id']
    for cited_id in mapping_dict.get(citing_id, []):
        if cited_id in nonciting_lookup:
            positive_pairs.append((citing_doc['text'], nonciting_lookup[cited_id], 1))

# Negative pairs (label 0): for every citing document that has citations, draw
# one non-citing document that is NOT among its true citations.
negative_pairs = []
all_nonciting_ids = list(nonciting_lookup.keys())
for citing_doc in citing_corpus:
    citing_id = citing_doc['id']
    true_cited_ids = mapping_dict.get(citing_id, [])
    if not true_cited_ids:
        continue
    excluded_ids = set(true_cited_ids)
    # Skip when every candidate is a true citation (or the pool is empty);
    # this also guarantees that the rejection loop below terminates.
    excluded_in_pool = sum(1 for cid in excluded_ids if cid in nonciting_lookup)
    if excluded_in_pool >= len(all_nonciting_ids):
        continue
    # Rejection sampling: uniform over the valid negatives, without rebuilding
    # a set difference over the whole pool for every citing document.
    neg_id = random.choice(all_nonciting_ids)
    while neg_id in excluded_ids:
        neg_id = random.choice(all_nonciting_ids)
    negative_pairs.append((citing_doc['text'], nonciting_lookup[neg_id], 0))

# Combine positive and negative pairs and shuffle
all_pairs = positive_pairs + negative_pairs
random.shuffle(all_pairs)
print("Total training pairs:", len(all_pairs))

# -----------------------------
# 4. Initialize Tokenizer and Model
# -----------------------------
# Tokenizer and two-label sequence classifier are both loaded from the
# domain-specific anferico/bert-for-patents checkpoint.
checkpoint_name = "anferico/bert-for-patents"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)


# -----------------------------
# 5. Create Dataset and Trainer
# -----------------------------
# Every pair is truncated / padded to 256 tokens by PatentPairDataset.
dataset = PatentPairDataset(all_pairs, tokenizer, max_length=256)

# Training hyper-parameters, collected in one place before being handed to
# TrainingArguments.
training_config = dict(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)
training_args = TrainingArguments(**training_config)

# No evaluation dataset is supplied; the Trainer only receives training data.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# -----------------------------
# 6. Train the Model
# -----------------------------
trainer.train()