# Early baseline for claim & cited-paragraph pair classification on PatentMatch

Open this notebook in Google Colab.  

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/XnibyH/PatentMatch-Experiment/blob/main/notebooks/colab_notebook.ipynb)

## Colab Section

In [None]:
# download the reposotory
!git clone https://github.com/XnibyH/PatentMatch-Experiment.git
%cd PatentMatch-Experiment

# install project requirements
!pip install -U -r requirements.txt --quiet

Run the cell below and upload the dataset files `train.parquet` and `test.parquet`, which can be downloaded from [`Google Drive`](https://drive.google.com/drive/folders/1bReauP_LtdzBFpCk82RL3N8hvufGSr8r?usp=drive_link).

In [None]:
from google.colab import files

# upload data
# The working directory is switched to data/ around the upload and restored afterwards
# (presumably so that the uploaded files are stored in data/, where later cells read them - confirm).
%cd data
uploaded_files = files.upload()
%cd ..

Create the `.env` file: replace the placeholder values in the cell below with your own MLflow credentials, then run the cell.

In [None]:
%%writefile .env
MLFLOW_TRACKING_URI= 'https://mlflow.example-server.com/'  # provide a valid mlflow server address,
MLFLOW_TRACKING_USERNAME= 'User_Name'  # your user name,
MLFLOW_TRACKING_PASSWORD= 'P455VV0RD'  # password,
MLFLOW_EXPERIMENT_NAME = 'User_Name_PatentMatchBaseline'  # and update the experiment name,
MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING = 'True'  # set 'True' to log system metrics: GPU utilization etc.

## Basic Imports

In [None]:
import os
from dotenv import load_dotenv


# Load the variables defined in the .env file into the process environment.
load_dotenv()


# Expose the MLflow settings as notebook-level names.
# A variable that is not set in the environment yields None.
MLFLOW_TRACKING_URI = os.environ.get('MLFLOW_TRACKING_URI')
MLFLOW_TRACKING_USERNAME = os.environ.get('MLFLOW_TRACKING_USERNAME')
MLFLOW_TRACKING_PASSWORD = os.environ.get('MLFLOW_TRACKING_PASSWORD')
MLFLOW_EXPERIMENT_NAME = os.environ.get('MLFLOW_EXPERIMENT_NAME')
MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING = os.environ.get('MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING')

## Dataset

In [None]:
import pandas as pd

### Train Data

In [None]:
# load training data
df = pd.read_parquet('data/train.parquet')

# assign explicit column names positionally; the first column (Unnamed: 0) becomes unnamed_col
df.columns = ['unnamed_col', 'index', 'claim_id', 'patent_application_id', 'cited_document_id', 'text', 'text_b', 'label', 'date', 'DIznQ_0']

# fix indexing
# replace NaNs in columns 'index' and 'unnamed_col' with 0, change float to int
index_col = df['index'].fillna(0).astype('int64')
unnamed_col = df['unnamed_col'].fillna(0).astype('int64')

# build a single index assuming that unnamed_col and index are corresponding to each other
# index == index, when index != 0
# index == unnamed_col, when index == 0
# (Series.where is the vectorised equivalent of a row-wise apply)
df = df.assign(index=index_col.where(index_col != 0, unnamed_col))

# drop column unnamed_col and rearrange columns order: merged index first
df = df[[
        'index',
        'text',
        'text_b',
        'label',
        'claim_id',
        'patent_application_id',
        'cited_document_id',
        'date',
        'DIznQ_0',
        ]
        ]

# drop rows with NaN in 'index', 'text', 'text_b' or 'label';
# NaNs are kept in the other columns ('claim_id', 'patent_application_id', 'cited_document_id', 'date', 'DIznQ_0')
# on purpose, to save as much training data as possible
df = df.dropna(subset=['index', 'text', 'text_b', 'label'])

# drop duplicated (text, text_b, label) triples, keeping the first occurrence;
# the index is not part of the duplicate key
df = df.drop_duplicates(subset=['text', 'text_b', 'label'], keep='first')

# label is deliberately NOT cast to int: float is required later in training

# measure length in chars for text and text_b
# (assign returns a new frame, so nothing is written into a derived slice)
df = df.assign(
    text_len=df['text'].str.len(),
    text_b_len=df['text_b'].str.len(),
)


In [None]:
# Train dataset summary: size, distinct ids and texts, length statistics (chars), class balance

claim_len = df['text_len']
paragraph_len = df['text_b_len']
negatives = df[df['label'] == 0]
positives = df[df['label'] == 1]

summary_lines = [
    f"Number of samples: {len(df)}",
    f"Distinct patent applications: {df['patent_application_id'].nunique()}",
    f"Distinct cited documents: {df['cited_document_id'].nunique()}",
    f"Distinct claim texts: {df['text'].nunique()}",
    f"Distinct cited paragraphs: {df['text_b'].nunique()}",
    f"Median claim length (chars): {claim_len.median()}",
    f"Median paragraph length (chars): {paragraph_len.median()}",
    f"Mean claim length (chars): {int(claim_len.mean())}",
    f"Mean paragraph length (chars): {int(paragraph_len.mean())}",
    f"Labels - 0, Non-novelty-destroying: {len(negatives)}",
    f"Labels - 1, Novelty-destroying: {len(positives)}",
]
print("\n".join(summary_lines))

In [None]:
# Save Train Data

# keep only the text pair and the label, indexed by 'index', and write the result to parquet
df_train = df[['index', 'text', 'text_b', 'label']].set_index('index', drop=True)
df_train.to_parquet('data/train_clean.parquet')

In [None]:
# preview the first rows of the cleaned train set
df_train.head()

### Test Data

In [None]:
# load test data
df = pd.read_parquet('data/test.parquet')

# drop rows with NaN in 'index', 'text', 'text_b' or 'label',
# then drop duplicated (text, text_b, label) triples
df = (
    df.dropna(subset=['index', 'text', 'text_b', 'label'])
      .drop_duplicates(subset=['text', 'text_b', 'label'])
)

# fix indexing: cast 'index' to int (vectorised astype instead of a per-element apply)
df = df.assign(index=df['index'].astype('int64'))

# label is not cast to int here

# rearrange columns order
df = df[[
        'index',
        'text',
        'text_b',
        'label',
        'claim_id',
        'patent_application_id',
        'cited_document_id',
        'date',
        'DIznQ_0',
        ]
        ]

# measure length in chars for text and text_b
# (assign returns a new frame, so nothing is written into a derived slice)
df = df.assign(
    text_len=df['text'].str.len(),
    text_b_len=df['text_b'].str.len(),
)

In [None]:
# Test dataset summary: size, distinct ids and texts, length statistics (chars), class balance

n_negative = len(df[df['label'] == 0])
n_positive = len(df[df['label'] == 1])

summary = [
    ("Number of samples", len(df)),
    ("Distinct patent applications", df['patent_application_id'].nunique()),
    ("Distinct cited documents", df['cited_document_id'].nunique()),
    ("Distinct claim texts", df['text'].nunique()),
    ("Distinct cited paragraphs", df['text_b'].nunique()),
    ("Median claim length (chars)", df['text_len'].median()),
    ("Median paragraph length (chars)", df['text_b_len'].median()),
    ("Mean claim length (chars)", int(df['text_len'].mean())),
    ("Mean paragraph length (chars)", int(df['text_b_len'].mean())),
    ("Labels - 0, Non-novelty-destroying", n_negative),
    ("Labels - 1, Novelty-destroying", n_positive),
]
for caption, value in summary:
    print(f"{caption}: {value}")

In [None]:
# Save Test Data

# keep only the text pair and the label, indexed by 'index', and write the result to parquet
df_test = df[['index', 'text', 'text_b', 'label']].set_index('index', drop=True)
df_test.to_parquet('data/test_clean.parquet')

In [None]:
# preview the first rows of the cleaned test set
df_test.head()

Check that no index value occurs more than once across the train and test sets.

In [None]:
# gather every index value of the Train set followed by those of the Test set
check_indexes_list = [*df_train.index.tolist(), *df_test.index.tolist()]

# a set keeps each value once, so a smaller set means that some index value is repeated
has_duplicates = len(set(check_indexes_list)) < len(check_indexes_list)
print('Found duplicated index!' if has_duplicates else "No duplicated index found.")

## Fine-Tuning Experiment

In [None]:
import mlflow
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from src.utils import timestamp
import torch
import numpy as np
from src.settings import (
    MLFLOW_EXPERIMENT_NAME,
    )


def logits_to_binary(logits, threshold: float=0.50):
    """
    Convert logits to probabilities using the sigmoid function and binarize on set threshold

    Args:
    logits (torch.Tensor, np.ndarray or sequence of numbers): Logits output from the model,
        one value per sample, i.e. shape (n,) or (n, 1).
    threshold (float): default 0.50. A probability lower than or equal to the threshold
        gives 0, anything else gives 1.

    Returns:
    list: binary predictions as plain Python ints, one per sample

    Raises:
    ValueError: if logits holds more than one value per sample.
    """
    if not isinstance(logits, torch.Tensor):
        # covers np.ndarray as well as plain Python sequences
        logits = torch.as_tensor(logits)
    if not torch.is_floating_point(logits):
        # the sigmoid is computed on floats; integer logits are cast first
        logits = logits.to(torch.get_default_dtype())

    probabilities = torch.sigmoid(logits)

    # exactly one probability per sample is expected: shape (n,) or (n, 1)
    n_samples = probabilities.shape[0] if probabilities.ndim > 0 else 1
    if probabilities.numel() != n_samples:
        raise ValueError(
            f"Expected one logit per sample, got logits of shape {tuple(probabilities.shape)}"
        )

    # Binarize the whole tensor at once using the threshold.
    # Written as "not (p <= threshold)" so that a NaN probability maps to 1,
    # exactly like the element-wise rule "0 if p <= threshold else 1".
    above_threshold = ~(probabilities <= threshold)
    binary_predictions = above_threshold.reshape(-1).to(torch.int64).tolist()

    return binary_predictions

# Metric objects already loaded, keyed by metric name. compute_metrics runs at every
# evaluation step, so each metric is loaded once and reused afterwards.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """
    Function to compute custom metrics: f1 score and matthews correlation.

    Args:
    eval_pred: pair (logits, labels) for the evaluated samples.

    Returns:
    dict: {"f1": ..., "mcc": ...} computed on the logits binarized by logits_to_binary
    (default threshold).
    """
    # Load metrics (cached after the first call)
    f1_metric = _get_metric("f1")
    mcc_metric = _get_metric("matthews_correlation")

    # eval predictions
    logits, labels = eval_pred

    # turn the raw logits into 0/1 predictions
    predictions = logits_to_binary(logits)

    f1 = f1_metric.compute(predictions=predictions, references=labels)
    mcc = mcc_metric.compute(predictions=predictions, references=labels)

    return {
        "f1": f1["f1"],
        "mcc": mcc["matthews_correlation"]
    }


In [None]:
#@title Select the Model for Fine-Tuning
# Models available for fine-tuning: display name -> identifiers of the model and of its tokenizer.
all_models = {
    'stsb-roberta-base': {'model': 'cross-encoder/stsb-roberta-base', 'tokenizer': 'cross-encoder/stsb-roberta-base'},
    'Legal-BERT': {'model': 'nlpaueb/legal-bert-base-uncased', 'tokenizer': 'nlpaueb/legal-bert-base-uncased'},
}

# The selected name must be one of the keys of `all_models`.
selection = 'stsb-roberta-base' #@param ['stsb-roberta-base', 'Legal-BERT']
selected_model = all_models[selection]


In [None]:
# select the MLflow experiment, then name the run after the model and the current timestamp
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

model_short_name = selected_model['model'].split('/')[1]
run_name = f"Training_{model_short_name}_{timestamp()}"
mlflow.set_tag(key='mlflow.runName', value=run_name)


In [None]:
#@title Load and Train/Validation Split the Dataset
# loading train and test datasets
dataset = datasets.load_dataset("parquet", data_files={"train": "data/train_clean.parquet", "test": "data/test_clean.parquet"})

# split train into train and validation sets 20%
# the fixed seed makes the split - and therefore the reported metrics - reproducible across runs
train_test_split = dataset['train'].train_test_split(test_size=0.20, seed=42)

# the held-out part is returned under the key 'test'; rename it to 'validation'
train_test_split['validation'] = train_test_split.pop('test')

# full dataset: train, validation, test
dataset = datasets.DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['validation'],
    'test': dataset['test']
})


In [None]:
#@title Tokenize the Dataset
# tokenizer of the selected model
tokenizer = AutoTokenizer.from_pretrained(selected_model['tokenizer'])

def preprocess_function(batch):
    """Tokenize a batch of (text, text_b) pairs and carry the labels along.

    Every pair is padded to, and truncated at, `tokenizer.model_max_length`.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=tokenizer.model_max_length,  # None == tokenizer.model_max_length
        return_tensors="pt",
    )
    encoded = tokenizer(batch['text'], batch['text_b'], **tokenizer_options)
    encoded['label'] = batch['label']
    return encoded

# preprocess the data  # TODO check truncated for possible data loss in training
tokenized_dataset = dataset.map(preprocess_function, batched=True)


In [None]:
#@title Configure a Model
# a single output is used for the selected model - cross-encoder support only 1 label
num_labels = 1

# instantiate the sequence-classification model from the selected checkpoint
checkpoint = selected_model['model']
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)


In [None]:
#@title Set Training Arguments and Initialize Trainer
# results and logs are stored in a folder named after the model
model_short_name = selected_model['model'].split('/')[1]
results_dir = f"./fine_tuning_results/{model_short_name}"

training_args = TrainingArguments(
    # where things are written
    output_dir=results_dir,
    logging_dir=f"{results_dir}/logs",  # directory for storing logs
    save_total_limit=5,  # limit the total amount of checkpoints, delete the older checkpoints
    # optimisation
    num_train_epochs=5,
    per_device_train_batch_size=16,  # RTX 3090: 32
    per_device_eval_batch_size=32,  # RTX 3090: 128
    learning_rate=2e-5,  # learning rate
    warmup_steps=20,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    # logging and evaluation every 50 steps
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
)

# init trainer: trains on the train split, evaluates on the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)


In [None]:
#@title Start Training
# The MLflow run is closed in every case - with a status that reflects the outcome.
# Otherwise a failed or interrupted training would leave the run active.
run_status = "FAILED"
try:
    # Train the model
    trainer.train()

    # Evaluate the model on the trainer's eval dataset and keep the resulting metrics
    eval_metrics = trainer.evaluate()
    run_status = "FINISHED"
finally:
    # end experiment
    mlflow.end_run(status=run_status)

# show the final evaluation metrics as the cell output
eval_metrics

In [None]:
#@title Save the Model
# the fine-tuned model is stored under ./saved_models/<model name>_FT
model_short_name = selected_model['model'].split('/')[1]
trainer.save_model(f"./saved_models/{model_short_name}_FT")


## Model Evaluation

In [None]:
import mlflow
from sklearn.metrics import matthews_corrcoef, f1_score
from src.utils import timestamp
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers.pipelines.text_classification import ClassificationFunction
import torch

from src.settings import (
    MLFLOW_EXPERIMENT_NAME,
    )


# Pick the device index: 0 when a GPU is available, -1 otherwise
if torch.cuda.is_available():
    device = 0
else:
    device = -1


In [None]:
#@title Load the Test Dataset
dataset_path = 'data/test_clean.parquet'
df_test = pd.read_parquet(dataset_path)

# X: one (text, text_b) tuple per sample
claim_texts = df_test['text'].tolist()
cited_paragraphs = df_test['text_b'].tolist()
sentence_pairs = list(zip(claim_texts, cited_paragraphs))
# the same pairs as a list of dicts for transformer's pipeline
sentence_pairs_lods = [
    {"text": text, "text_pair": text_pair}
    for text, text_pair in sentence_pairs
]

# y_true
labels_true = df_test['label'].tolist()

In [None]:
#@title Select the Model for Evaluation
#@markdown Make sure to fine-tune and save the base model before selecting ***_FT** models
# Models available for evaluation: display name -> identifiers/paths of the model and of its tokenizer.
all_models = {
    'stsb-roberta-base': {'model': 'cross-encoder/stsb-roberta-base', 'tokenizer': 'cross-encoder/stsb-roberta-base'},
    'Legal-BERT': {'model': 'nlpaueb/legal-bert-base-uncased', 'tokenizer': 'nlpaueb/legal-bert-base-uncased'},
    # fine-tuned models below: the model is read from the local saved_models/ folder,
    # the tokenizer entry is the one of the corresponding base model
    'stsb-roberta-base_FT': {'model': 'saved_models/stsb-roberta-base_FT', 'tokenizer': 'cross-encoder/stsb-roberta-base'},
    'Legal-BERT_FT': {'model': 'saved_models/legal-bert-base-uncased_FT', 'tokenizer': 'nlpaueb/legal-bert-base-uncased'},
}

# The selected name must be one of the keys of `all_models`.
selection = 'stsb-roberta-base_FT' #@param ['stsb-roberta-base', 'Legal-BERT', 'stsb-roberta-base_FT', 'Legal-BERT_FT']
selected_model = all_models[selection]

In [None]:
#@title Load the Model and Tokenizer

# a single output is used for the selected model - cross-encoder support only 1 label
num_labels = 1

# load the model and its tokenizer from the selected entries
model_source = selected_model['model']
tokenizer_source = selected_model['tokenizer']
model = AutoModelForSequenceClassification.from_pretrained(model_source, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)


In [None]:
#@title Start Evaluation
# init mlflow experiment: use the existing one, or create it when it does not exist yet
# (get_experiment_by_name returns None for an unknown name, e.g. when this section
# is run before any training run)
experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if experiment is None:
    experiment = mlflow.get_experiment(mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME))

# name of the device to log; torch.cuda.get_device_name raises on a CPU-only machine,
# so fall back to 'cpu', in line with the CPU fallback of `device`
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(torch.cuda.current_device())
else:
    device_name = 'cpu'

# run experiment
with mlflow.start_run(experiment_id=experiment.experiment_id, log_system_metrics=True) as run:
    # set run name
    mlflow.set_tag(key='mlflow.runName',
                    value=f"Test_{selected_model['model'].split('/')[1]}_{timestamp()}")

    # log parameters
    # NOTE(review): 'AutoModel Parameters' and 'Tokenizer' log whole objects; MLflow presumably
    # stores their string representation, which can be very long - confirm it is not rejected or truncated.
    mlflow.log_params({
        'PyTorch Device': device_name,
        'Model': selected_model['model'],
        'Dataset': dataset_path,
        'AutoModel Parameters': model,
        'Tokenizer': tokenizer,
    })

    # run pipeline for model predictions
    pipe = pipeline("text-classification",
                    model = model,
                    tokenizer = tokenizer,
                    padding = True,
                    truncation = True,
                    # max_length = 512,
                    device = device,
                    function_to_apply = ClassificationFunction.SIGMOID,
                    top_k=1,  # return only top 1 predicted label with score
                    )

    predictions = pipe(sentence_pairs_lods)

    # binarize the scores: a score lower than or equal to the threshold gives label 0, otherwise 1
    threshold = 0.50
    mlflow.log_metric("Threshold", threshold)
    labels_pred = [0 if x[0]['score'] <= threshold else 1 for x in predictions]

    f1_score_value = f1_score(y_true=labels_true, y_pred=labels_pred, pos_label=1, average='binary')
    mlflow.log_metric("F1 Score", f1_score_value)

    matthews_corrcoef_value = matthews_corrcoef(y_true=labels_true, y_pred=labels_pred)
    mlflow.log_metric("Matthews Correlation Coefficient", matthews_corrcoef_value)

    print(f"F1 Score: {f1_score_value}\nMatthews Correlation Coefficient: {matthews_corrcoef_value}")

# no explicit mlflow.end_run() here: the run is ended when the `with` block exits


## Finally: check the metrics on the MLflow server!