In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import login

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file (if one is found) into the process
# environment so the Hugging Face token does not have to be typed or hardcoded.
load_dotenv()

# Hugging Face login. Use the token from the HF_TOKEN environment variable when
# it is set, so the notebook can run unattended (Restart Kernel -> Run All).
# When HF_TOKEN is unset or empty, `token=None` keeps the original behaviour:
# login() falls back to its interactive prompt.
login(token=os.getenv("HF_TOKEN") or None)

### Load Data

In [None]:
# Read the training CSV into a DataFrame and show it as the cell output.
train_csv_path = "../data/transformed/cleaned_train_data.csv"
train_df = pd.read_csv(train_csv_path)
train_df

In [None]:
# Read the test CSV into a DataFrame and show it as the cell output.
test_csv_path = "../data/transformed/cleaned_test_data.csv"
test_df = pd.read_csv(test_csv_path)
test_df

Next, we need to define the instruction (prompt) that will be built for every row.

In [None]:
# Define the prompt template. `{text_b}` is filled from the `text_b` column and
# `{text}` from the `text` column of each row.
prompt_template = (
    "You are a skillful patent examiner with over 20 years of experience. "
    "You are reviewing patent prior art. Examine if the following chunk of a patent description constitutes prior art for the given claim.\n"
    "DESCRIPTION: {text_b}\n"
    "CLAIM: {text}"
)


def build_prompts(df, template=prompt_template):
    """Render one prompt string per row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `text_b` column and a `text` column.
    template : str
        Format string with `{text_b}` and `{text}` placeholders.

    Returns
    -------
    list of str
        One rendered prompt per row, in row order (empty list for an empty frame).
    """
    # Iterating the two columns directly avoids the per-row Series that
    # `DataFrame.apply(axis=1)` builds, and also behaves sensibly on an empty
    # frame (where `apply` would return a DataFrame instead of a Series).
    return [
        template.format(text_b=description, text=claim)
        for description, claim in zip(df['text_b'], df['text'])
    ]


# Create the prompt column in both datasets (same helper, so train and test
# can never drift apart).
train_df['prompt'] = build_prompts(train_df)
test_df['prompt'] = build_prompts(test_df)

# Persist the frames, now including the prompt column, without the row index.
train_df.to_csv('../data/transformed/train_with_prompt.csv', index=False)
test_df.to_csv('../data/transformed/test_with_prompt.csv', index=False)

In [None]:
# Do not truncate long cell values when pandas renders them, so the full
# prompt text is visible below.
pd.set_option('display.max_colwidth', None)

# Take the first row of each frame to verify the prompt column.
train_row = train_df.iloc[0]
test_row = test_df.iloc[0]

# Print a heading followed by the row, for the training data and then the test data.
for heading, first_row in (
    ("First row of the training data with prompt:", train_row),
    ("First row of the test data with prompt:", test_row),
):
    print(heading)
    print(first_row)

Let's split the training set so that we have a small validation set during training. Since we have little data, I will hold out only 10%. I will also make the split stratified so that the validation set has the same label distribution as the training data.

In [None]:
# Hold out 10% of the training data as a validation set, stratified on `label`,
# with a fixed seed so the split is reproducible.
train_split_df, val_split_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
    stratify=train_df['label'],
)

# Check the distribution of the target variable in both splits
# (relative frequencies, so the two splits are directly comparable).
for heading, split_df in (
    ("\nDistribution of target variable in training split:", train_split_df),
    ("\nDistribution of target variable in validation split:", val_split_df),
):
    print(heading)
    print(split_df['label'].value_counts(normalize=True))

Now I want to convert the data to a Hugging Face Dataset and store it in my Hugging Face Hub account.

In [None]:
# Convert Pandas DataFrames to Hugging Face Datasets.
# preserve_index=False: the train/validation frames keep the original row labels
# from `train_df` after the split, and `Dataset.from_pandas` would otherwise
# store that non-Range index as an extra `__index_level_0__` column. Dropping
# the index keeps all three splits on the same schema.
train_dataset = Dataset.from_pandas(train_split_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_split_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Merge the datasets into a DatasetDict keyed by split name
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Last expression of the cell: show the split names, columns and row counts.
dataset_dict

In [None]:
# Upload all splits of the DatasetDict to the Hugging Face Hub repository
# named "patentmatch_exp".
dataset_dict.push_to_hub(repo_id="patentmatch_exp")

In [None]:
# NOTE(review): this looks like a leftover sanity-check cell rather than part of
# the data pipeline — confirm and consider removing it before sharing.
print("hello")