### Fine-Tuning Regression Model Notebook

This notebook fine-tunes a pretrained text classification model, re-purposed with a single-output regression head, to predict the AI-generated code score (a value between 0 and 1).

- **Optional**: Since the fine-tuned model has already been uploaded to my Hugging Face hub (username: `wasabibish`), running this notebook is not mandatory.

In [1]:
import pandas as pd

import datasets
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer , TrainingArguments, Trainer, DataCollatorWithPadding

from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
# silence all Python warnings for the rest of the session
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above — consider narrowing the filter
warnings.filterwarnings('ignore')

## Data preparation

In [2]:
# load the raw dataset; the next cell reads its 'question', 'human_content' and 'plagiarism_score' columns
data = pd.read_csv('data.csv')

In [3]:
# assemble the training frame in one step: a 'text' prompt and its 'label' score per row
training_data = pd.DataFrame({
    # each prompt is the question followed by the corresponding answer
    'text': 'Question :\n' + data['question'] + '\nAnswer :\n' + data['human_content'],
    # regression target
    'label': data['plagiarism_score'],
})

In [4]:
# create a dataset object from the dataframe
# NOTE(review): this rebinds `training_data` from a DataFrame to a Dataset, so re-running this
# cell without first re-running the previous one hands a Dataset to from_pandas — consider a new name
training_data = datasets.Dataset.from_pandas(training_data)

# Train

In [5]:
# Hugging Face Hub checkpoint used as the starting point for fine-tuning
model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

In [None]:
# load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1 requests a single-output head (presumably so the model is trained as a regressor — confirm
# against the transformers docs); ignore_mismatched_sizes=True lets loading proceed even though the
# checkpoint's existing head has a different shape
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, ignore_mismatched_sizes=True)

In [7]:
# function to tokenize the text and return the input_ids and attention_mask
# Only truncation is applied here. Padding is left to the DataCollatorWithPadding that is passed
# to the Trainer, which pads every batch to its own longest sequence. Padding inside a batched
# `map` call would only pad to the longest sequence of each map chunk (not to one common length)
# and would store the padding tokens in the dataset for no benefit.
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Parameters:
    -----------
    examples : dict
        A batch from the dataset; its "text" entry holds the prompt strings.

    Returns:
    --------
    The tokenizer output for the batch, truncated to the model's maximum length
    and left unpadded.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# tokenize the whole dataset; batched=True hands preprocess_function several examples per call
tokenized_datasets = training_data.map(preprocess_function, batched=True)

In [9]:
# split data into training and validation sets
# (10% is held out and accessed below under the "test" key; the fixed seed keeps the split reproducible)
# NOTE(review): the name is rebound from a Dataset to the split result, so this cell is not safe to re-run on its own
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [10]:
# collator that pads the examples of each batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training arguments

In [11]:
# define training arguments
# `eval_steps` only takes effect when the evaluation strategy is "steps"; with the default
# strategy ("no") the validation split is never evaluated during training.
# The argument was renamed from `evaluation_strategy` to `eval_strategy` in newer transformers
# releases, so use whichever field name the installed TrainingArguments dataclass declares.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # checkpoints and logs are written here
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_steps=50,                   # evaluate on the validation split every 50 optimizer steps
    **{_eval_strategy_arg: "steps"},
)

In [12]:
# create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # the held-out split produced by train_test_split is used for evaluation
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# train the model (fine-tunes `model` in place using the arguments defined above)
trainer.train()

## Evaluation

In [None]:
# evaluate the model on the test set (the held-out split passed to the Trainer as eval_dataset)
trainer.evaluate()

In [None]:
# get predictions on the test set
# the returned object is read below through its attributes (e.g. `label_ids`)
predictions = trainer.predict(tokenized_datasets["test"])

In [132]:
# get real labels (the ground-truth 'label' scores of the held-out split)
true_classes = tokenized_datasets["test"]['label']

In [None]:
# calculate mean absolute error and mean squared error as metrics
# `predictions.label_ids` holds the ground-truth labels, so comparing it with `true_classes`
# would always report an error of 0. The model outputs are in `predictions.predictions`;
# the single-output head yields one value per example, so flatten it to a 1-D array.
predicted_scores = predictions.predictions.reshape(-1)
mean_absolute_error(true_classes, predicted_scores), mean_squared_error(true_classes, predicted_scores)

In [None]:
# push the fine-tuned model to the hub under the repository name 'plagiarism-detection'
# NOTE(review): only `model` is pushed here; the tokenizer is not uploaded by this cell

model.push_to_hub('plagiarism-detection')

# Inference

In [237]:
def inferece(question, answer=None):
    """
    Function to make predictions on new data

    Parameters:
    -----------
    question : str
        The question text (coding problem). When `answer` is None this string is
        passed to the tokenizer unchanged, so it may already be a fully formatted
        question/answer prompt.
    answer : str, optional
        The answer text (solution, given code). When provided, the prompt is built
        in the same 'Question : ... Answer : ...' layout used for the training data.

    Returns:
    --------
    float
        The predicted plagiarism score
    """
    if answer is None:
        text = question
    else:
        text = 'Question :\n' + question + '\nAnswer :\n' + answer
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer creates CPU tensors, while the Trainer may have moved the model to an
    # accelerator; put the inputs on the model's device to avoid a device-mismatch error.
    inputs = inputs.to(model.device)
    # Make sure dropout is disabled so the prediction is deterministic.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    # A single text and a single-output head give exactly one value.
    return logits.item()

In [None]:
# score the first held-out example; its 'text' field already contains the full
# question/answer prompt, so it is passed as `question` with no separate answer
inferece(tokenized_datasets["test"][0]['text'])