In [None]:
# Install PyTorch (with torchvision/torchaudio), then Hugging Face transformers and scikit-learn.
!pip install torch torchvision torchaudio
!pip install transformers scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
import pandas as pd
from transformers import DistilBertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertForSequenceClassification
from sklearn.utils.class_weight import compute_class_weight

# Sanity-check the runtime: report whether CUDA is usable and how many devices are visible,
# then print the driver/GPU status table.
print("Is GPU available?", torch.cuda.is_available())
print("CUDA Device Count:", torch.cuda.device_count())
!nvidia-smi

Is GPU available? True
CUDA Device Count: 1
Sat Feb  8 10:02:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
    

In [None]:
# Mount Google Drive into the Colab filesystem so the project folder is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Switch the working directory to the project folder; the relative file names used
# later in the notebook (e.g. 'train.xlsx') resolve against it.
%cd gdrive/MyDrive/PW_DEEP

Mounted at /content/gdrive
/content/gdrive/MyDrive/PW_DEEP


Load data

In [None]:
# Read the train / validation / test splits from their Excel workbooks, in that order.
df_train, df_val, df_test = (
    pd.read_excel(workbook) for workbook in ('train.xlsx', 'val.xlsx', 'test.xlsx')
)

In [None]:
# For each split, pull out the 'text' column (model input) and the 'label' column (target).
X_train, y_train = df_train['text'].values, df_train['label'].values
X_val, y_val = df_val['text'].values, df_val['label'].values
X_test, y_test = df_test['text'].values, df_test['label'].values


Clean the dataset: replace non-string text entries with empty strings

In [None]:
def _blank_unless_str(value):
    """Return ``value`` unchanged when it is a ``str``; otherwise return ``""``."""
    return value if isinstance(value, str) else ""


# Replace every non-string entry of each split with an empty string.
X_train = list(map(_blank_unless_str, X_train))
X_val = list(map(_blank_unless_str, X_val))
X_test = list(map(_blank_unless_str, X_test))

tokenizer

In [None]:
# Identifier of the pretrained checkpoint the tokenizer is loaded from.
bert_model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)


def tokenize_texts(texts):
    """Tokenize a collection of texts into PyTorch tensors.

    The input is first materialised as a list, then tokenized with padding enabled,
    truncation enabled and a maximum length of 128 tokens.
    """
    batch = list(texts)
    return tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")


# Encode the three splits in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split) for split in (X_train, X_val, X_test)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

dataloader for pytorch

In [None]:
def _as_tensor_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into a single TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))


# One dataset per split
train_dataset = _as_tensor_dataset(train_encodings, y_train)
val_dataset = _as_tensor_dataset(val_encodings, y_val)
test_dataset = _as_tensor_dataset(test_encodings, y_test)

# Batches of 16 for every split; only the training loader is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

Load the DistilBERT model for sequence classification

In [None]:
# Instantiate DistilBERT from the pretrained checkpoint with a classification head sized for 3 labels.
model = DistilBertForSequenceClassification.from_pretrained(bert_model_name, num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Select the device (GPU if available) and move the model to it

In [None]:
# Prefer CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model's weights onto the selected device.
model.to(device)
print(f"Using device: {device}")

Using device: cuda


Compute class weights to compensate for the dataset's imbalance towards the positive label

In [None]:
import numpy as np

# 'balanced' weights for the three label ids, derived from the training labels.
label_ids = np.array([0, 1, 2])
balanced_weights = compute_class_weight(class_weight='balanced', classes=label_ids, y=y_train)

# Keep the weights as a float tensor on the selected device.
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
print(class_weights)


tensor([4.1363, 4.1798, 0.3970], device='cuda:0')


training setup

In [None]:
# Number of epochs requested from the training loop.
EPOCHS = 5
# AdamW over all model parameters, with learning rate 2e-5 and weight decay 0.01.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# Cross-entropy criterion that weights each class by the class_weights tensor computed earlier.
loss_fn = CrossEntropyLoss(weight=class_weights)

In [None]:
def _model_device(model):
    """Return the device holding the model's parameters (CPU for a parameter-free model)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def _restore_checkpoint(model, optimizer, checkpoint_path):
    """Load ``checkpoint_path`` into ``model``/``optimizer`` and return the raw checkpoint dict."""
    # map_location lets a checkpoint written on a GPU runtime be restored on a CPU-only one.
    checkpoint = torch.load(checkpoint_path, map_location=_model_device(model))
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from epoch {epoch}")
    return checkpoint


def _batch_loss(model, batch, device, loss_fn):
    """Run one forward pass on ``batch`` and return the scalar loss tensor.

    With ``loss_fn`` set, the loss is ``loss_fn(logits, labels)``; otherwise the loss the
    model computes itself from ``labels`` is used.
    """
    input_ids, attention_mask, labels = [b.to(device) for b in batch]
    if loss_fn is None:
        return model(input_ids, attention_mask=attention_mask, labels=labels).loss
    logits = model(input_ids, attention_mask=attention_mask).logits
    return loss_fn(logits, labels)


def save_best_checkpoint(model, optimizer, epoch, train_losses, val_losses, checkpoint_path="best_checkpoint.pth", best_val_loss=None):
    """Write the current training state to ``checkpoint_path``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Number of completed epochs recorded in the checkpoint.
        train_losses: List of logged training-loss averages so far.
        val_losses: List of logged validation-loss averages so far.
        checkpoint_path: Destination file.
        best_val_loss: Optional epoch-level validation loss that earned this checkpoint.
            When given it is stored under ``'best_val_loss'`` so a resumed run can compare
            new epochs against the same quantity.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses
    }
    if best_val_loss is not None:
        checkpoint['best_val_loss'] = best_val_loss
    torch.save(checkpoint, checkpoint_path)
    print(f"Best checkpoint saved at epoch {epoch}")


def load_checkpoint(model, optimizer, checkpoint_path="best_checkpoint.pth"):
    """Restore ``model`` and ``optimizer`` from ``checkpoint_path``.

    Returns:
        ``(epoch, train_losses, val_losses)`` as stored in the checkpoint.

    Raises:
        FileNotFoundError: if ``checkpoint_path`` does not exist.
    """
    checkpoint = _restore_checkpoint(model, optimizer, checkpoint_path)
    return checkpoint['epoch'], checkpoint['train_losses'], checkpoint['val_losses']


def train_and_validate(model, train_dataloader, val_dataloader, optimizer, epochs=3, checkpoint_path="best_checkpoint.pth", patience=3, loss_fn=None):
    """Fine-tune ``model`` with per-epoch validation, best-checkpointing and early stopping.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when labels are passed).
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` training batches.
        val_dataloader: Yields validation batches of the same form.
        optimizer: Optimizer stepping the model parameters.
        epochs: Total number of epochs; a resumed run continues from the checkpoint's epoch.
        checkpoint_path: File the best state is saved to and resumed from.
        patience: Number of consecutive non-improving epochs tolerated before stopping.
        loss_fn: Criterion called as ``loss_fn(logits, labels)`` for both training and
            validation. When omitted, a callable module-level ``loss_fn`` (such as the
            class-weighted criterion defined in this notebook) is used if one exists;
            otherwise the model's own unweighted loss is used.

    Returns:
        ``(train_losses, val_losses, epoch_train_losses, epoch_val_losses)``: the first two
        are averages logged every 200 training / 50 validation steps (including history
        restored from the checkpoint); the last two hold one average per epoch run in this call.
    """
    if loss_fn is None:
        # The notebook builds a class-weighted criterion at module level; without this
        # lookup it would never be applied and the class weights would be ignored.
        loss_fn = globals().get("loss_fn")
        if not callable(loss_fn):
            loss_fn = None

    # Batches are moved to wherever the model lives instead of relying on a global.
    run_device = _model_device(model)

    train_losses = []          # Averaged training losses per 200 steps
    val_losses = []            # Averaged validation losses per 50 steps
    epoch_train_losses = []    # Average training loss per epoch
    epoch_val_losses = []      # Average validation loss per epoch

    best_val_loss = float('inf')
    start_epoch = 0
    patience_counter = 0  # Epochs in a row without validation improvement

    # Resume from an existing checkpoint when there is one
    try:
        checkpoint = _restore_checkpoint(model, optimizer, checkpoint_path)
    except FileNotFoundError:
        print("No checkpoint found, starting training from scratch.")
    else:
        start_epoch = checkpoint['epoch']
        train_losses = checkpoint['train_losses']
        val_losses = checkpoint['val_losses']
        best_val_loss = checkpoint.get('best_val_loss')
        if best_val_loss is None:
            # Older checkpoints only hold the 50-step averages; their minimum is the best
            # available estimate (and may be lower than the true best epoch average).
            best_val_loss = min(val_losses, default=float('inf'))

    for epoch in range(start_epoch, epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()

        train_step_buffer = []  # Losses since the last 200-step log line
        epoch_train_buffer = [] # Every training loss of this epoch

        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            loss = _batch_loss(model, batch, run_device, loss_fn)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            loss_value = loss.item()
            train_step_buffer.append(loss_value)
            epoch_train_buffer.append(loss_value)

            # Log every 200 steps
            if (step + 1) % 200 == 0:
                avg_train_loss = sum(train_step_buffer) / len(train_step_buffer)
                train_losses.append(avg_train_loss)
                print(f"\tStep {step + 1} - Avg Training Loss: {avg_train_loss:.4f}")
                train_step_buffer.clear()

        # Leftover steps when the epoch length is not a multiple of 200
        if train_step_buffer:
            avg_train_loss = sum(train_step_buffer) / len(train_step_buffer)
            train_losses.append(avg_train_loss)
            print(f"\tFinal Steps - Avg Training Loss: {avg_train_loss:.4f}")

        epoch_train_loss = sum(epoch_train_buffer) / len(epoch_train_buffer)
        epoch_train_losses.append(epoch_train_loss)

        # Validation phase
        print("\t\tValidation phase")
        model.eval()
        val_step_buffer = []    # Losses since the last 50-step log line
        val_epoch_buffer = []   # Every validation loss of this epoch

        with torch.no_grad():
            for step, batch in enumerate(val_dataloader):
                val_loss = _batch_loss(model, batch, run_device, loss_fn).item()

                val_step_buffer.append(val_loss)
                val_epoch_buffer.append(val_loss)

                # Log every 50 validation steps
                if (step + 1) % 50 == 0:
                    avg_val_loss = sum(val_step_buffer) / len(val_step_buffer)
                    val_losses.append(avg_val_loss)
                    print(f"\t\tValidation Step {step + 1} - Avg Validation Loss: {avg_val_loss:.4f}")
                    val_step_buffer.clear()

            # Leftover validation steps
            if val_step_buffer:
                avg_val_loss = sum(val_step_buffer) / len(val_step_buffer)
                val_losses.append(avg_val_loss)
                print(f"\t\tFinal Validation Steps - Avg Validation Loss: {avg_val_loss:.4f}")

        epoch_val_loss = sum(val_epoch_buffer) / len(val_epoch_buffer) if val_epoch_buffer else 0
        epoch_val_losses.append(epoch_val_loss)

        print(f"---> End of Epoch {epoch + 1} - Training Loss: {epoch_train_loss:.4f} - Validation Loss: {epoch_val_loss:.4f}\n")

        # Keep only the state with the lowest epoch-level validation loss
        if epoch_val_loss < best_val_loss:
            print(f"Validation loss improved from {best_val_loss:.4f} to {epoch_val_loss:.4f}. Best model so far! Saving checkpoint...")
            best_val_loss = epoch_val_loss
            save_best_checkpoint(model, optimizer, epoch + 1, train_losses, val_losses, checkpoint_path, best_val_loss=best_val_loss)
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping once validation has not improved for `patience` epochs
        if patience_counter >= patience:
            print(f"Validation loss hasn't improved for {patience} epochs. Stopping early at epoch {epoch + 1}.")
            break

    return train_losses, val_losses, epoch_train_losses, epoch_val_losses

In [None]:
# Train for EPOCHS epochs (resuming from the default checkpoint file when one exists)
# and keep the logged loss histories for the plots further down.
train_losses, val_losses, epoch_train_losses, epoch_val_losses = train_and_validate(model, train_dataloader, val_dataloader, optimizer, epochs=EPOCHS)

Checkpoint loaded from epoch 1
Epoch 2/5
	Step 200 - Avg Training Loss: 0.0893
	Step 400 - Avg Training Loss: 0.1078
	Step 600 - Avg Training Loss: 0.1350
	Final Steps - Avg Training Loss: 0.1366
		Validation phase
		Validation Step 50 - Avg Validation Loss: 1.0600
		Validation Step 100 - Avg Validation Loss: 1.2118
		Validation Step 150 - Avg Validation Loss: 1.0453
		Validation Step 200 - Avg Validation Loss: 1.1119
		Final Validation Steps - Avg Validation Loss: 1.2523
---> End of Epoch 2 - Training Loss: 0.1157 - Validation Loss: 1.1161

Epoch 3/5
	Step 200 - Avg Training Loss: 0.0608
	Step 400 - Avg Training Loss: 0.0756
	Step 600 - Avg Training Loss: 0.0767
	Final Steps - Avg Training Loss: 0.1095
		Validation phase
		Validation Step 50 - Avg Validation Loss: 1.1220
		Validation Step 100 - Avg Validation Loss: 1.2830
		Validation Step 150 - Avg Validation Loss: 1.1065
		Validation Step 200 - Avg Validation Loss: 1.1767
		Final Validation Steps - Avg Validation Loss: 1.3252
---> E

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def plot_epoch_losses(epoch_train_losses, epoch_val_losses):
    """Plot per-epoch training and validation losses on one Plotly figure.

    Args:
        epoch_train_losses: Sequence of average training losses, one per epoch.
        epoch_val_losses: Sequence of average validation losses, one per epoch.

    The x axis numbers the entries 1..N in the order given.
    """
    # Single-panel subplot layout. subplot_titles must be a sequence of strings:
    # a bare string would be indexed character by character, so the trailing comma matters.
    fig = make_subplots(
        rows=1, cols=1,
        subplot_titles=("Epoch-wise Losses",),
    )

    # Epoch-wise training losses
    fig.add_trace(
        go.Scatter(x=list(range(1, len(epoch_train_losses) + 1)), y=epoch_train_losses,
                   mode='lines+markers', name='Train Loss (per epoch)', line=dict(color='green')),
        row=1, col=1
    )

    # Epoch-wise validation losses
    fig.add_trace(
        go.Scatter(x=list(range(1, len(epoch_val_losses) + 1)), y=epoch_val_losses,
                   mode='lines+markers', name='Validation Loss (per epoch)', line=dict(color='orange')),
        row=1, col=1
    )

    # Titles, theme, a horizontal legend centred below the plot, and a fixed figure size
    fig.update_layout(
        title="Epoch-wise Training and Validation Losses",
        xaxis_title="Epochs",
        yaxis_title="Loss",
        template="plotly_white",
        legend=dict(x=0.5, y=-0.2, orientation="h", xanchor="center"),
        height=500,
        width=800
    )

    fig.show()

# Plot the per-epoch loss curves returned by train_and_validate
plot_epoch_losses(epoch_train_losses, epoch_val_losses)

In [None]:
# Evaluation
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and gather labels and predictions.

    Each batch is moved to the notebook-level ``device``; the predicted class of a sample
    is the arg-max over its logits.

    Returns:
        ``(y_true, y_pred)``: two lists with the reference labels and the predicted
        classes, accumulated batch after batch.
    """
    model.eval()
    references, predicted = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            references.extend(labels.cpu().numpy())
    return references, predicted

from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
def compute_metrics(y_true, y_pred, dataset_name):
    """Print F1, accuracy, recall and precision for one dataset split.

    F1, recall and precision use ``average='weighted'`` with ``zero_division=1``.
    Values are rounded to three decimals and prefixed with ``dataset_name``.
    """
    # Options shared by the three averaged scores
    weighted = {'average': 'weighted', 'zero_division': 1}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)
    f1 = f1_score(y_true, y_pred, **weighted)
    print(f"{dataset_name} - F1 Score: {round(f1,3)}, Accuracy: {round(accuracy,3)}, Recall: {round(recall,3)}, Precision: {round(precision,3)}")

# Collect reference labels and predictions on the validation and test loaders
y_true_val, y_pred_val = evaluate(model, val_dataloader)
y_true_test, y_pred_test = evaluate(model, test_dataloader)

# Print the aggregate metrics for both splits, separated by a blank line
compute_metrics(y_true_val, y_pred_val, "Validation")
print()
compute_metrics(y_true_test, y_pred_test, "Test")

Validation - F1 Score: 0.767, Accuracy: 0.84, Recall: 0.84, Precision: 0.865

Test - F1 Score: 0.845, Accuracy: 0.85, Recall: 0.85, Precision: 0.842


In [None]:
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
import numpy as np

def plot_confusion_matrix(y_true, y_pred, dataset_name):
    """Show the 3-class confusion matrix of one split as an annotated Plotly heatmap.

    Args:
        y_true: Reference class ids (0, 1 or 2).
        y_pred: Predicted class ids (0, 1 or 2).
        dataset_name: Split name appended to the figure title.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])  # For classes 0, 1, 2
    labels = ["Negative", "Neutral", "Positive"]

    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=labels,  # Predicted labels
        y=labels,  # True labels
        colorscale='Blues',
        showscale=True,
        hoverongaps=False,
        text=cm,  # Annotate each cell with its count
        texttemplate="%{text}"
    ))
    fig.update_layout(
        title=f"Confusion Matrix - {dataset_name}",
        xaxis_title="Predicted Label",
        yaxis_title="True Label",  # this comma was missing, which made the whole cell a SyntaxError
        xaxis=dict(tickmode='array', tickvals=list(range(3)), ticktext=labels),
        yaxis=dict(tickmode='array', tickvals=list(range(3)), ticktext=labels),
        autosize=False,
        width=500,
        height=500
    )
    fig.show()

# Validation split: print the metrics, then draw the confusion matrix
compute_metrics(y_true_val, y_pred_val, "Validation")
plot_confusion_matrix(y_true_val, y_pred_val, "Validation")

# Test split: same report
compute_metrics(y_true_test, y_pred_test, "Test")
plot_confusion_matrix(y_true_test, y_pred_test, "Test")


 Inference

In [None]:
def predict_sentiment(texts):
    """Predict a class index for each input text.

    The texts are tokenized with the notebook-level ``tokenizer`` (padding and truncation
    enabled, at most 128 tokens), scored by the notebook-level ``model`` on ``device``,
    and the arg-max over the logits is returned as a NumPy array.
    """
    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    token_ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        scores = model(token_ids, attention_mask=mask).logits
        predicted = scores.argmax(dim=1)

    return predicted.cpu().numpy()

# Example inference on three short sentences; prints one predicted class index per sentence
texts = ["I love this!", "This is terrible.", "i think it's (very) bad"]
print(predict_sentiment(texts))

[2 0 2]
