In [5]:
import os
# Show the current working directory before the next cell changes it.
%pwd

'/home/petar/Documents/python_projects/petar-milivojevic-mlhU-machine-learning-new-mP1l/research'

In [6]:
os.chdir('../')
%pwd

'/home/petar/Documents/python_projects/petar-milivojevic-mlhU-machine-learning-new-mP1l'

In [10]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import json

# Excel sheet that supplies the `coding_problem_id`, `llm_answer_id` and
# `plagiarism_score` columns read by the sample-building cell.
data_path = 'artifacts/data_ingestion/CodeAid Source Codes Labeling.xlsx'
# Root folder under which the training code creates one `fold_<k>` directory per fold.
results_path = 'artifacts/train_results'
# Folder holding one sub-directory per coding problem id (question JSON plus source files).
path = 'artifacts/data_ingestion/dataset-source-codes'
data_df = pd.read_excel(data_path)

In [11]:
# Build one text sample per spreadsheet row, laid out as
#   ' Question: <question> Candidate code: <candidate source> AI Code: <LLM answer source>'
# and keep the matching plagiarism score as the label.
labels = list(data_df['plagiarism_score'])
input_array = []

for _, row in data_df.iterrows():
    problem_id = row['coding_problem_id']
    problem_dir = os.path.join(path, problem_id)

    with open(os.path.join(problem_dir, problem_id + '.json'), 'r', encoding='utf-8') as f:
        problem = json.load(f)

    text = ' Question: ' + problem['question'] + ' Candidate code: '

    # List the folder once per row. Sorting makes the "first match" picked below
    # reproducible, because os.listdir returns entries in arbitrary order.
    file_names = sorted(os.listdir(problem_dir))

    # Candidate solution: first file named '<problem_id>.<something>' that is not the JSON.
    candidate_name = next(
        (name for name in file_names if problem_id + '.' in name and 'json' not in name),
        None,
    )
    if candidate_name is not None:
        with open(os.path.join(problem_dir, candidate_name), 'r', encoding='utf-8') as f:
            text += f.read() + ' AI Code: '

    # LLM answer: first file whose name contains the answer id.
    answer_name = next(
        (name for name in file_names if row['llm_answer_id'] in name),
        None,
    )
    if answer_name is not None:
        with open(os.path.join(problem_dir, answer_name), 'r', encoding='utf-8') as f:
            text += f.read()

    # As before, a row with a missing file still yields a (shorter) sample.
    input_array.append(text)

train_array, test_array, train_labels_array, test_labels_array = train_test_split(
    input_array,
    labels,
    test_size=0.14,
    random_state=42
)

In [12]:
# Dump the training split (sample text and its label) to 'train.csv' in the
# current working directory.
data = pd.DataFrame({'sample': train_array, 'label': train_labels_array})
data.to_csv(path_or_buf='train.csv')

In [4]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm.notebook import tqdm
from huggingface_hub import login

import os

# Never keep access tokens in the notebook: read the Hugging Face token from the
# HF_TOKEN environment variable instead. When it is unset, `token=None` lets
# `login` fall back to its interactive prompt.
# NOTE(review): the token that used to be hard-coded on this line is exposed in
# the file history and should be revoked.
login(token=os.environ.get('HF_TOKEN'))

In [5]:
class CustomBERTModel(nn.Module):
    """BERT encoder followed by a two-layer head that emits one raw logit per sample.

    The first-token vector of the last hidden layer is passed through
    Linear -> ReLU -> Dropout -> Linear. No sigmoid is applied here.

    Args:
        bert_model_name: Checkpoint name handed to ``from_pretrained``.
        hidden_size: Input width of the first linear layer.
        intermediate_dim: Width between the two linear layers.
        dropout_prob: Dropout probability applied after the activation.
    """

    def __init__(
            self,
            bert_model_name="bert-base-uncased",
            hidden_size=768,
            intermediate_dim=256,
            dropout_prob=0.3
        ):
        super().__init__()

        # Attribute names double as state_dict key prefixes, so they stay as they are.
        self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=1)
        self.fc1 = nn.Linear(hidden_size, intermediate_dim)
        self.fc2 = nn.Linear(intermediate_dim, 1)
        self.dropout = nn.Dropout(dropout_prob)
        self.activation = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the given token ids and attention mask."""
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # Last hidden layer, first position of every sequence ([CLS]-like token).
        last_layer = encoder_out.hidden_states[-1]
        first_token = last_layer[:, 0, :]

        projected = self.activation(self.fc1(first_token))
        return self.fc2(self.dropout(projected))

In [6]:
class AICodeDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per index on demand.

    Args:
        input_data: Indexable collection of text samples.
        labels: Indexable collection of labels; its length defines the dataset length.
        tokenizer: Callable invoked with the text plus ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, input_data, labels, tokenizer, max_len=512):
        self.input_data, self.labels = input_data, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``labels`` tensor for ``idx``."""
        text, target = self.input_data[idx], self.labels[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop the leading dimension the tokenizer output carries.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(target, dtype=torch.float)
        return item

In [7]:
def plot_metric(metric_path, range, train_matric, val_matric, train_label, val_label, fold, threshold):
    """Plot a training and a validation curve over epochs and save the figure as PNG.

    The metric name used for the y-axis, title and file name is the last word of
    ``train_label`` (e.g. 'Train Loss' -> 'Loss'), so accuracy plots are no
    longer labelled as loss.

    Args:
        metric_path: Directory the PNG is written to.
        range: X values (epoch numbers). The name shadows the builtin inside this
            function only; it is kept so keyword callers keep working.
        train_matric: Y values of the training curve.
        val_matric: Y values of the validation curve.
        train_label: Legend label of the training curve.
        val_label: Legend label of the validation curve.
        fold: Zero-based fold index (shown and saved as ``fold + 1``).
        threshold: Decision threshold, used only in the file name.
    """
    metric_name = str.split(train_label)[-1]

    fig = plt.figure(figsize=(12, 6))
    plt.plot(range, train_matric, label=train_label)
    plt.plot(range, val_matric, label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title(f'Train/validation {metric_name} for Fold {fold + 1}')
    plt.legend()
    plt.savefig(os.path.join(metric_path, f'Train_Val_{metric_name}_{fold + 1}_thr_{threshold}.png'))
    # The figure lives on disk now; close it so open figures do not pile up.
    plt.close(fig)

def plot_confusion_matrix(conf_matrix, cm_path, fold, threshold, title):
    """Draw a confusion matrix as an annotated heatmap on its own figure and save it as PNG.

    A dedicated figure is created for every call. Drawing on the implicit
    current axes put each heatmap (and an extra colour bar) on top of whatever
    figure happened to be active.

    Args:
        conf_matrix: Integer matrix to draw (cells are formatted with ``"d"``).
        cm_path: Directory the PNG is written to.
        fold: Zero-based fold index (saved as ``fold + 1``).
        threshold: Decision threshold, used only in the file name.
        title: Figure title.
    """
    fig, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.savefig(os.path.join(cm_path, f'cm_fold_{fold + 1}_thr_{threshold}.png'))
    # Saved to disk; release the figure.
    plt.close(fig)

def plot_roc_curve(labels, predictions, roc_curve_path, fold, threshold):
    """Compute the ROC curve for ``predictions`` against ``labels`` and save it as PNG.

    Args:
        labels: Ground-truth binary labels.
        predictions: Scores handed to ``roc_curve``; pass probabilities rather
            than thresholded 0/1 values to obtain more than a single operating point.
        roc_curve_path: Directory the PNG is written to.
        fold: Zero-based fold index (saved as ``fold + 1``).
        threshold: Decision threshold, used only in the file name.
    """
    fpr, tpr, _ = roc_curve(labels, predictions)
    fig = plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.savefig(os.path.join(roc_curve_path, f'roc_curve_fold_{fold + 1}_thr_{threshold}.png'))
    # This runs once per saved epoch; close the figure so they do not accumulate.
    plt.close(fig)

In [8]:
def validation(model, val_loader, criterion, device):
    """Run ``model`` over ``val_loader`` without gradients and collect loss and predictions.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword arguments.
        val_loader: Iterable of batches with "input_ids", "attention_mask" and "labels".
        criterion: Loss applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss per batch, sigmoid predictions, labels)``; the last two
        are flat Python lists.
    """
    model.eval()

    loss_sum = 0
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader):
            token_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            # Add a trailing dimension so the targets line up with the model output.
            targets = batch["labels"].unsqueeze(1).to(device)

            logits = model(input_ids=token_ids, attention_mask=mask)
            loss_sum += criterion(logits, targets).item()

            probabilities = torch.sigmoid(logits).cpu().detach().numpy().flatten()
            collected_preds.extend(probabilities)
            collected_labels.extend(targets.cpu().numpy().flatten())

    mean_loss = loss_sum / len(val_loader)
    return mean_loss, collected_preds, collected_labels

In [19]:
def train(train_array, train_labels_array, tokenizer, num_folds, threshold, batch_size, lr, num_epochs):
    """Fine-tune a fresh ``CustomBERTModel`` on every fold of a K-fold split.

    For each fold the function trains for ``num_epochs`` epochs and validates
    after every epoch. Whenever the validation loss improves it stores, under
    ``<results_path>/fold_<fold>/``:

    * ``model/``: the model ``state_dict`` and the tokenizer files;
    * ``figures/``: classification report (JSON), AUC score (JSON),
      confusion matrix and ROC curve (PNG).

    After the last epoch the loss and accuracy curves of the fold are saved as well.

    Args:
        train_array: Text samples.
        train_labels_array: Scores aligned with ``train_array``; they are the
            regression-style targets for the loss and are binarised with
            ``threshold`` for the classification metrics.
        tokenizer: Tokenizer handed to ``AICodeDataset`` and saved with the model.
        num_folds: Number of cross-validation folds.
        threshold: Cut-off applied to both predictions and labels for the metrics.
        batch_size: Batch size of both data loaders.
        lr: Learning rate of the optimizer.
        num_epochs: Number of epochs per fold.

    Returns:
        None. All results are written to disk.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Device: ', device)

    dataset = AICodeDataset(train_array, train_labels_array, tokenizer)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in tqdm(enumerate(kf.split(dataset)), total=num_folds):
        print(f'Fold {fold + 1}/{num_folds}')

        # Output folders are created up front so that they exist (and
        # `figures_path` is defined) even when no epoch is run.
        fold_path = os.path.join(results_path, f'fold_{fold}')
        model_path = os.path.join(fold_path, 'model')
        figures_path = os.path.join(fold_path, 'figures')
        os.makedirs(model_path, exist_ok=True)
        os.makedirs(figures_path, exist_ok=True)

        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        # Evaluation does not depend on sample order, so no shuffling is needed.
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

        model = CustomBERTModel().to(device)
        optimizer = AdamW(model.parameters(), lr=lr)
        criterion = nn.BCEWithLogitsLoss()

        train_losses = []
        val_losses = []
        train_accuracies = []
        val_accuracies = []
        best_val_loss = float('inf')
        epochs_range = range(1, num_epochs + 1)

        for epoch in tqdm(range(num_epochs)):
            # validation() leaves the model in eval mode; switch back so dropout
            # is active again in every training epoch.
            model.train()

            total_loss = 0
            train_preds = []
            train_labels = []
            correct_so_far = 0

            for batch in tqdm(train_loader):
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].unsqueeze(1).to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                # flatten() (not squeeze()) keeps a 1-D array for a batch of size 1.
                batch_preds = torch.sigmoid(outputs).detach().cpu().numpy().flatten()
                batch_labels = labels.cpu().numpy().flatten()
                train_preds.extend(batch_preds)
                train_labels.extend(batch_labels)

                # Running accuracy over the epoch so far, updated incrementally
                # instead of re-scoring every earlier prediction on each batch.
                correct_so_far += sum(
                    int((pred >= threshold) == (label >= threshold))
                    for pred, label in zip(batch_preds, batch_labels)
                )
                accuracy = correct_so_far / len(train_preds)

                print("train_loss:%.4f - train_accuracy:%.4f" % (loss.item(), accuracy), flush=True)

            train_epoch_loss = total_loss / len(train_loader)
            train_losses.append(train_epoch_loss)

            val_epoch_loss, val_preds, val_labels = validation(model, val_loader, criterion, device)
            val_losses.append(val_epoch_loss)

            binary_train_preds = [1 if pred >= threshold else 0 for pred in train_preds]
            binary_train_labels = [1 if label >= threshold else 0 for label in train_labels]

            train_accuracy = accuracy_score(binary_train_labels, binary_train_preds)
            train_accuracies.append(train_accuracy)

            binary_val_preds = [1 if pred >= threshold else 0 for pred in val_preds]
            binary_val_labels = [1 if label >= threshold else 0 for label in val_labels]

            val_accuracy = accuracy_score(binary_val_labels, binary_val_preds)
            val_accuracies.append(val_accuracy)

            # accuracy_score returns a fraction; the '%' format converts it to a percentage.
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Loss: {train_epoch_loss:.4f}, '
                  f'Validation Loss: {val_epoch_loss:.4f}, '
                  f'Train Accuracy: {train_accuracy:.2%}, '
                  f'Validation Accuracy: {val_accuracy:.2%}')

            if val_epoch_loss < best_val_loss:
                # Remember the new best value; otherwise every epoch would
                # overwrite the checkpoint and the "best" model would be the last one.
                best_val_loss = val_epoch_loss

                torch.save(model.state_dict(), os.path.join(model_path, f'model_{fold}.pth'))
                tokenizer.save_pretrained(model_path)

                report = classification_report(binary_val_labels, binary_val_preds, zero_division=0, output_dict=True)

                with open(os.path.join(figures_path, f'metrics_report_fold_{fold}_thr_{threshold}.json'), "w") as f:
                    json.dump(report, f, indent=4)

                conf_matrix = confusion_matrix(binary_val_labels, binary_val_preds)

                plot_confusion_matrix(
                    conf_matrix,
                    figures_path,
                    fold,
                    threshold,
                    f'Confusion Matrix for Fold {fold + 1} and Threshold {threshold}'
                )

                auc_score = roc_auc_score(binary_val_labels, val_preds)
                auc_score_dict = {'auc_score': auc_score}

                with open(os.path.join(figures_path, f'auc_score_fold_{fold}_thr_{threshold}.json'), "w") as f:
                    json.dump(auc_score_dict, f, indent=4)

                # The ROC curve needs the continuous scores; thresholded 0/1
                # predictions collapse it to a single operating point.
                plot_roc_curve(
                    binary_val_labels,
                    val_preds,
                    figures_path,
                    fold,
                    threshold
                )

        plot_metric(
            figures_path,
            epochs_range,
            train_losses,
            val_losses,
            'Train Loss',
            'Validation Loss',
            fold,
            threshold
        )

        plot_metric(
            figures_path,
            epochs_range,
            train_accuracies,
            val_accuracies,
            'Train Accuracies',
            'Validation Accuracies',
            fold,
            threshold
        )

In [None]:
# Tokenizer and hyper-parameters for the cross-validated fine-tuning run.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
num_folds, threshold = 5, 0.5
batch_size, lr, num_epochs = 16, 2e-5, 50

train(
    train_array=train_array,
    train_labels_array=train_labels_array,
    tokenizer=tokenizer,
    num_folds=num_folds,
    threshold=threshold,
    batch_size=batch_size,
    lr=lr,
    num_epochs=num_epochs,
)

In [21]:
# Evaluate the checkpoint saved for fold 0 on the held-out test split.
test_dsataset = AICodeDataset(test_array, test_labels_array, tokenizer)
test_loader = DataLoader(test_dsataset, batch_size=batch_size, shuffle=False)
criterion = nn.BCEWithLogitsLoss()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CustomBERTModel().to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds.
state_dict = torch.load(
    'artifacts/train_results/fold_0/model/model_0.pth',
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)

test_loss, test_preds, test_labels = validation(model, test_loader, criterion, device)

# Binarise scores and labels with the same threshold that was used during training.
binary_test_preds = [1 if pred >= threshold else 0 for pred in test_preds]
binary_test_labels = [1 if label >= threshold else 0 for label in test_labels]

test_accuracy = accuracy_score(binary_test_labels, binary_test_preds)

report = classification_report(binary_test_labels, binary_test_preds, zero_division=0, output_dict=True)

conf_matrix = confusion_matrix(binary_test_labels, binary_test_preds)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('artifacts/train_results/fold_0/model/model_0.pth'))


  0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
# Display the classification report dictionary computed for the test split.
report

{'0': {'precision': 0.9090909090909091,
  'recall': 1.0,
  'f1-score': 0.9523809523809523,
  'support': 40.0},
 '1': {'precision': 1.0,
  'recall': 0.6923076923076923,
  'f1-score': 0.8181818181818182,
  'support': 13.0},
 'accuracy': 0.9245283018867925,
 'macro avg': {'precision': 0.9545454545454546,
  'recall': 0.8461538461538461,
  'f1-score': 0.8852813852813852,
  'support': 53.0},
 'weighted avg': {'precision': 0.9313893653516294,
  'recall': 0.9245283018867925,
  'f1-score': 0.9194641836151269,
  'support': 53.0}}