In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig, DebertaV2Config, DebertaV2ForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import os
import gc
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Checkpoint identifier passed to both the tokenizer and the model `from_pretrained` calls.
model_name= 'microsoft/deberta-v3-base'

In [None]:
def load_datasets(train_path,test_path,val_path):
  """Read the train, test and validation CSV files and wrap each as a Hugging Face ``Dataset``.

  Args:
    train_path: path of the training CSV.
    test_path: path of the test CSV.
    val_path: path of the validation CSV.

  Returns:
    A ``(train, test, val)`` tuple of datasets. Note the order: test comes before val.
  """
  # Read all three CSVs first, then convert them, keeping the train/test/val order throughout.
  frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path, val_path)]
  train_split, test_split, val_split = [Dataset.from_pandas(frame) for frame in frames]

  return train_split, test_split, val_split

In [None]:
# Tokenizer for the checkpoint named by `model_name`; the `tokenize` function reads this global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

NameError: name 'AutoTokenizer' is not defined

In [None]:
def tokenize(example):
  """Run the global ``tokenizer`` over ``example['review']``.

  Inputs are truncated and padded to a fixed length of 256.
  """
  encode_options = dict(truncation=True, padding="max_length", max_length=256)
  return tokenizer(example['review'], **encode_options)

In [None]:
def tokenize_datasets(train_dataset, val_dataset, test_dataset, tokenize_function):
    """Apply ``tokenize_function`` to each split through its ``map`` method in batched mode.

    Args:
        train_dataset: training split; must expose ``map(fn, batched=...)``.
        val_dataset: validation split.
        test_dataset: test split.
        tokenize_function: callable handed to ``map`` for every split.

    Returns:
        A ``(train, val, test)`` tuple holding the mapped splits.
    """
    splits = (train_dataset, val_dataset, test_dataset)
    # Splits are mapped in train, val, test order.
    mapped_train, mapped_val, mapped_test = [
        split.map(tokenize_function, batched=True) for split in splits
    ]
    return mapped_train, mapped_val, mapped_test

In [None]:
# Set CUDA_VISIBLE_DEVICES to '0' for this process.
# NOTE(review): presumably this needs to run before CUDA is first initialised to have any effect — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
# Report the GPU visibility setting and the torch / CUDA build versions.
# The environment lookup is printed explicitly: it is not the last expression of the
# cell, so a bare `os.environ.get(...)` would be evaluated and silently discarded.
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
print(torch.__version__)
print(torch.version.cuda)

In [None]:
# Report whether torch can see a CUDA device.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device count and current device index are only queried when CUDA is available.
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")

In [None]:
def clear_gpu_memory():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    Takes no arguments and returns ``None``.
    """
    gc.collect()              # collect unreachable Python objects first
    torch.cuda.empty_cache()  # then ask PyTorch to drop its cached CUDA memory


# Start from a clean slate before the model is loaded.
clear_gpu_memory()

In [None]:
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax of ``predictions`` over its last axis.
            (assumes ``predictions`` holds per-class scores — TODO confirm)

    Returns:
        dict with ``accuracy``, weighted ``f1`` / ``precision`` / ``recall``, and ``mcc``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result is not used here.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    prec, rec, f_score = weighted_scores[0], weighted_scores[1], weighted_scores[2]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = f_score
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['mcc'] = matthews_corrcoef(y_true, y_pred)
    return metrics

In [None]:
training_args = TrainingArguments(
    # --- run identity, outputs and logging ---
    run_name="model40_finetuning",
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="wandb",
    # --- epochs and batch sizes ---
    num_train_epochs=4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    # --- optimizer and learning-rate schedule ---
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- loss and numeric precision ---
    label_smoothing_factor=0.1,
    fp16=True,
    # --- evaluate and checkpoint once per epoch; reload the best checkpoint by eval_loss ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
def train_model(model, training_args, tokenized_train, tokenized_val, compute_metrics):
    """Build a ``Trainer`` with early stopping, run training, and return the trainer.

    Args:
        model: model to fine-tune.
        training_args: arguments object forwarded to ``Trainer`` as ``args``.
        tokenized_train: training dataset.
        tokenized_val: dataset used for evaluation during training.
        compute_metrics: metrics callback forwarded to ``Trainer``.

    Returns:
        The ``Trainer`` instance after ``train()`` has been called on it.
    """
    # Early stopping is configured with a patience of 2 and a threshold of 0.01.
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.01,
    )

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )
    fitted_trainer = Trainer(**trainer_kwargs)
    fitted_trainer.train()

    return fitted_trainer

In [None]:
def evaluate_model(trainer, dataset):
  """Evaluate ``trainer`` on ``dataset``, print the result, and return it.

  Args:
    trainer: object exposing ``evaluate(eval_dataset=...)``.
    dataset: dataset forwarded as ``eval_dataset``.

  Returns:
    Whatever ``trainer.evaluate`` returns.
  """
  metrics = trainer.evaluate(eval_dataset=dataset)
  print(metrics)
  return metrics

In [None]:
def _plot_loss_column(df, column, title, label, **line_kwargs):
    """Draw one loss curve (``column`` against ``step``) in its own figure.

    Prints a notice and draws nothing when ``column`` is missing from ``df``.
    """
    if column not in df.columns:
        print(f"No '{column}' entries in the trainer log history; skipping the '{title}' plot.")
        return

    # Log entries that lack this key become NaN rows in the frame; drop them.
    rows = df[df[column].notna()]
    plt.figure(figsize=(10, 6))
    plt.plot(rows['step'], rows[column], label=label, **line_kwargs)
    plt.xlabel('Training Steps')
    plt.ylabel('Loss')
    plt.title(title)
    plt.legend()
    plt.show()


def plot_trainer_logs(trainer):
    """
    Extracts training and validation loss from trainer log history and plots them.

    Draws one combined figure, then one figure per curve. A curve that was never
    logged is skipped with a printed notice instead of raising ``KeyError``; when
    neither loss was logged, nothing is drawn at all.

    Args:
        trainer: Hugging Face Trainer object (only ``trainer.state.log_history``,
            a list of dicts, is read)
    """
    logs = trainer.state.log_history

    # Keep only the entries that actually carry each loss key.
    train_entries = [entry for entry in logs if "loss" in entry]
    eval_entries = [entry for entry in logs if "eval_loss" in entry]

    if not train_entries and not eval_entries:
        print("No 'loss' or 'eval_loss' entries in the trainer log history; nothing to plot.")
        return

    # Combined training and validation loss
    plt.figure(figsize=(10, 6))
    plt.plot(
        [entry["step"] for entry in train_entries],
        [entry["loss"] for entry in train_entries],
        label="Training Loss",
        marker="o",
    )
    plt.plot(
        [entry["step"] for entry in eval_entries],
        [entry["eval_loss"] for entry in eval_entries],
        label="Validation Loss",
        marker="x",
    )
    plt.xlabel("Training Steps")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()
    plt.grid()
    plt.show()

    # Separate figures, built from the tabular view of the logs
    df = pd.DataFrame(logs)
    _plot_loss_column(df, 'loss', 'Training Loss', 'Training Loss')
    _plot_loss_column(df, 'eval_loss', 'Validation Loss', 'Validation Loss', color='orange')

In [None]:
# CSV locations for the three splits.
# NOTE(review): these are absolute paths under /content/drive; no cell in this notebook
# mounts a drive, so presumably that happens outside the notebook — confirm.
train_path='/content/drive/MyDrive/Datasets/Arts_Crafts_and_Sewing_train_1.csv'
test_path='/content/drive/MyDrive/Datasets/Arts_Crafts_and_Sewing_test.csv'
val_path='/content/drive/MyDrive/Datasets/Arts_Crafts_and_Sewing_val.csv'

In [None]:
# load_datasets returns the splits in (train, test, val) order.
train_dataset, test_dataset, val_dataset = load_datasets(train_path, test_path, val_path)

In [None]:
# tokenize_datasets returns the splits in (train, val, test) order.
tokenized_train, tokenized_val, tokenized_test = tokenize_datasets(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    tokenize_function=tokenize,
)

In [None]:
# Sequence-classification model loaded from the `model_name` checkpoint.
# NOTE(review): num_labels=3 assumes the label column holds exactly three classes — confirm against the CSVs.
model = DebertaV2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

In [None]:
# Fine-tune on the tokenized training split, evaluating on the tokenized validation split.
# (The training-set argument previously referenced a misspelled, undefined name.)
trainer = train_model(
    model=model,
    training_args=training_args,
    tokenized_train=tokenized_train,
    tokenized_val=tokenized_val,
    compute_metrics=compute_metrics,
)

In [None]:
# Visualise the loss curves recorded during the run above.
plot_trainer_logs(trainer)