In [None]:
!pip install transformers -U
!pip install accelerate -U
!pip install optuna -U
!pip install -U flash-attn --no-build-isolation #install flash attention 2

import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModel
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import nltk
import re
import optuna
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import time

# Load the dataset: mount Google Drive, then read one CSV per language.
from google.colab import drive
drive.mount('/content/drive')

java_file_path = '/content/drive/MyDrive/Wajid Ali/FYP/Dataset/java.csv'
python_file_path = '/content/drive/MyDrive/Wajid Ali/FYP/Dataset/python.csv'
pharo_file_path = '/content/drive/MyDrive/Wajid Ali/FYP/Dataset/pharo.csv'

java_df, python_df, pharo_df = map(
    pd.read_csv,
    (java_file_path, python_file_path, pharo_file_path),
)

# Data preparation: relabel the Java 'Expand' category as 'java_Expand'
# (presumably to keep it distinct from a same-named category in another
# language's data -- TODO confirm), then stack the three frames into one
# with a fresh 0..n-1 index.
java_df['category'] = java_df['category'].replace('Expand', 'java_Expand')
df = pd.concat((java_df, python_df, pharo_df), ignore_index=True)

# Preprocessing of the dataset
def preprocess_text(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Missing values are tolerated: ``None`` and float NaN (what pandas yields
    when a string column is concatenated with an empty cell) become the empty
    string instead of raising ``TypeError``. Any other non-string value is
    converted with ``str()`` before normalisation.

    Args:
        text: The raw text to normalise; normally a ``str``.

    Returns:
        The whitespace-normalised string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r"\s+", " ", text).strip()

# Model input text: class name and comment sentence joined by a single space,
# then whitespace-normalised by preprocess_text.
df['combo'] = (df['class'] + " " + df['comment_sentence']).apply(preprocess_text)

# Split the dataset using its 'partition' column: 1 selects training rows,
# 0 selects testing rows.
is_train_row = df['partition'] == 1
is_test_row = df['partition'] == 0
train_data = df[is_train_row]
test_data = df[is_test_row]

# Tokenization and model setup
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Trainer needs a model that returns a loss when it is given `labels`. The bare
# encoder loaded by AutoModel has no classification head and returns hidden
# states only, so load the sequence-classification variant with one output per
# distinct category in the data.
num_labels = int(df['category'].nunique())
# NOTE(review): the weights are loaded in float16 because FlashAttention-2 needs
# half precision; training then runs in pure fp16 -- confirm this is intended
# rather than bf16 / mixed precision.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=num_labels,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
).to(device)

class CommentDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        combos: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer labels, aligned with ``combos``.
        tokenizer: Object exposing ``encode_plus``; called on every item access.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, combos, labels, tokenizer, max_length=512):
        self.combos, self.labels = combos, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``combos``)."""
        return len(self.combos)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            A dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a ``labels`` tensor of dtype ``torch.long``.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(
            str(self.combos[idx]), **tokenizer_options
        )

        # The tokenizer output is flattened to 1-D so each sample carries no
        # leading batch axis.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Encode labels with ONE category -> integer mapping shared by both splits.
# Calling .astype('category') on each split separately derives the codes from
# that split's own set of categories, so the same label could receive different
# ids in train and test whenever a category is absent from one of them.
label_dtype = pd.CategoricalDtype(
    categories=sorted(df['category'].dropna().unique())
)
train_labels = train_data['category'].astype(label_dtype).cat.codes.to_numpy()
test_labels = test_data['category'].astype(label_dtype).cat.codes.to_numpy()

train_dataset = CommentDataset(train_data['combo'].to_numpy(), train_labels, tokenizer)
test_dataset = CommentDataset(test_data['combo'].to_numpy(), test_labels, tokenizer)

def compute_metrics(eval_pred):
    """Compute classification metrics from a Trainer evaluation result.

    This function is passed to ``Trainer`` below but was not defined anywhere
    before its first use, which would raise ``NameError``.

    Args:
        eval_pred: Object with ``predictions`` (per-class scores) and
            ``label_ids`` (true class ids), as supplied by ``Trainer``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    expected = eval_pred.label_ids
    # NOTE(review): weighted averaging is an assumption -- switch to 'macro' or
    # 'micro' if the evaluation protocol requires it.
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(expected, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


def objective(trial):
    """Optuna objective: train with sampled hyperparameters, return eval loss.

    Samples the learning rate (log scale, 1e-5..5e-5), the number of epochs
    (2..4) and the per-device training batch size (8, 16 or 32), trains on
    ``train_dataset`` and evaluates on ``test_dataset``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``eval_loss`` reported by ``trainer.evaluate()``.
    """
    import copy

    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])

    args = TrainingArguments(
        output_dir='./results',
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        # `evaluation_strategy` was renamed to `eval_strategy`; current
        # Transformers releases no longer accept the old keyword.
        eval_strategy='steps',
        logging_dir='./logs',
        logging_steps=10
    )

    trainer = Trainer(
        # Train a private copy: passing the shared `model` would let every
        # trial continue from the previous trial's weights, so trials would
        # not be comparable.
        model=copy.deepcopy(model),
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

# Run the hyperparameter search: three trials, minimising the evaluation loss
# returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)

# Report the best hyperparameters found.
best_trial = study.best_trial
for key, value in best_trial.params.items():
    print(f"{key}: {value}")

# Use best hyperparameters to configure training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_trial.params['num_train_epochs'],
    per_device_train_batch_size=best_trial.params['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy`; current
    # Transformers releases no longer accept the old keyword.
    eval_strategy='steps',
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=best_trial.params['learning_rate']
)

# NOTE(review): `model` is the module-level model object. If the search trials
# trained it in place, this final run starts from those weights rather than
# from the pretrained checkpoint -- confirm which starting point is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()