# **Original Code for Extremely Long Processing**

In [2]:
!pip install trl

Collecting trl
  Downloading trl-0.16.0-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=3.0.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=3.0.0->trl)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accel

In [4]:
import torch
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import RobertaTokenizer, RobertaModel
from trl import GRPOConfig
import os
import torch.nn as nn
import torch.nn.functional as F
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from collections import Counter

# Set seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class UniversalSpamClassifier(nn.Module):
    def __init__(self, hidden_size=768):
        super(UniversalSpamClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.classifier = nn.Linear(hidden_size, 2)  # Binary classification
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

def preprocess_text(text, is_sms=False):
    """Universal text preprocessing for both SMS and email"""
    if pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Common preprocessing for both SMS and email
    text = text.lower()

    # Specific preprocessing for SMS
    if is_sms:
        # Replace email addresses
        text = re.sub(r'\S+@\S+', 'emailaddr', text)
        # Replace URLs
        text = re.sub(r'http\S+|www\S+|https\S+', 'webaddress', text, flags=re.MULTILINE)
        # Replace money symbols
        text = re.sub(r'£|\$', 'moneysymb', text)
        # Replace phone numbers
        text = re.sub(r'[\+\d\-\s]{10,}', 'phonenum', text)
        # Replace numbers
        text = re.sub(r'\d+(\.\d+)?', 'num', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]

    # Stemming
    ps = PorterStemmer()
    words = [ps.stem(w) for w in words]

    return ' '.join(words)

def load_and_preprocess_data(file_path, is_sms=False, sample_size=None):
    """Load and preprocess data from CSV file"""
    df = pd.read_csv(file_path)

    if sample_size:
        df = df.sample(n=min(sample_size, len(df)), random_state=42).reset_index(drop=True)

    # Handle different CSV formats
    if 'label_num' in df.columns:
        labels = df['label_num'].tolist()
    elif 'v1' in df.columns:  # SMS spam dataset format
        labels = (df['v1'] == 'spam').astype(int).tolist()
    else:
        raise ValueError("Could not determine label column in dataset")

    # Get text column
    if 'text' in df.columns:
        texts = df['text'].tolist()
    elif 'v2' in df.columns:  # SMS spam dataset format
        texts = df['v2'].tolist()
    else:
        raise ValueError("Could not determine text column in dataset")

    # Preprocess texts
    preprocessed_texts = [preprocess_text(text, is_sms) for text in texts]

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        preprocessed_texts, labels, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test

class SpamEnvironment:
    def __init__(self, model, tokenizer, texts, labels):
        self.model = model
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.current_idx = 0

    def step(self, action):
        true_label = self.labels[self.current_idx]

        if action.argmax().item() == true_label:
            reward = 1.0
        else:
            reward = -1.0

        self.current_idx = (self.current_idx + 1) % len(self.texts)
        return torch.tensor([reward], device=device)

    def reset(self):
        self.current_idx = random.randint(0, len(self.texts) - 1)
        return self.get_current_input()

    def get_current_input(self):
        text = self.texts[self.current_idx]
        encoding = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        return {k: v.to(device) for k, v in encoding.items()}

class UniversalSpamTrainer:
    def __init__(self, model, tokenizer, train_texts, train_labels, test_texts, test_labels):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.train_env = SpamEnvironment(model, tokenizer, train_texts, train_labels)
        self.test_env = SpamEnvironment(model, tokenizer, test_texts, test_labels)

        self.ppo_config = GRPOConfig(
            learning_rate=1e-5,
            gradient_accumulation_steps=1,
            seed=42,
            output_dir='./'
        )

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.ppo_config.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'max', patience=2)
        self.train_losses = []
        self.accuracies = []
        self.spam_probs = []

    def train(self, epochs=10, eval_freq=1):
        best_accuracy = 0.0

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            epoch_losses = []
            epoch_spam_probs = []

            for i in tqdm(range(len(self.train_env.texts))):
                inputs = self.train_env.get_current_input()
                inputs = {k: v.to(device) for k, v in inputs.items()}

                logits = self.model(input_ids=inputs['input_ids'],
                                   attention_mask=inputs['attention_mask'])
                probs = F.softmax(logits, dim=-1)
                action = torch.multinomial(probs, 1)

                reward = self.train_env.step(probs)

                log_prob = torch.log(probs.gather(1, action))
                loss = -log_prob * reward.to(device)
                loss = loss.mean()

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                self.optimizer.step()

                epoch_losses.append(loss.item())
                epoch_spam_probs.append(probs[0, 1].item())  # Probability of spam
                self.train_env.current_idx = (self.train_env.current_idx + 1) % len(self.train_env.texts)

            avg_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else 0
            self.train_losses.append(avg_loss)
            avg_spam_prob = sum(epoch_spam_probs) / len(epoch_spam_probs) if epoch_spam_probs else 0
            self.spam_probs.append(avg_spam_prob)

            print(f"Average training loss: {avg_loss:.4f}")
            print(f"Average spam probability in training: {avg_spam_prob:.4f}")

            if (epoch + 1) % eval_freq == 0:
                accuracy = self.evaluate()
                self.accuracies.append(accuracy)
                self.scheduler.step(accuracy)

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    torch.save(self.model.state_dict(), "best_universal_spam_classifier.pt")
                    print(f"Saved new best model with accuracy: {accuracy:.4f}")

        self.model.load_state_dict(torch.load("best_universal_spam_classifier.pt"))
        return self.model

    def evaluate(self):
        self.model.eval()
        correct = 0
        total = 0
        predictions = []
        true_labels = []
        spam_probs = []

        with torch.no_grad():
            for i in range(len(self.test_env.texts)):
                text = self.test_env.texts[i]
                label = self.test_env.labels[i]

                inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                logits = self.model(**inputs)
                probs = F.softmax(logits, dim=-1)
                predicted = logits.argmax(-1).item()

                correct += (predicted == label)
                total += 1
                predictions.append(predicted)
                true_labels.append(label)
                spam_probs.append(probs[0, 1].item())  # Probability of spam

        accuracy = correct / total
        print(f"Test accuracy: {accuracy:.4f}")

        # Generate classification report
        report = classification_report(true_labels, predictions, target_names=['Not Spam', 'Spam'], digits=4)
        print("\n=== Classification Report ===")
        print(report)

        # Create confusion matrix
        cm = confusion_matrix(true_labels, predictions)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Not Spam', 'Spam'],
                   yticklabels=['Not Spam', 'Spam'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.savefig('confusion_matrix_universal_spam.png')
        plt.close()

        # Plot spam probability distribution
        self.plot_spam_distribution(spam_probs)

        self.model.train()
        return accuracy

    def plot_spam_distribution(self, spam_probs):
        """Plot pie chart of spam vs not spam probabilities"""
        avg_spam_prob = np.mean(spam_probs)
        avg_ham_prob = 1 - avg_spam_prob

        plt.figure(figsize=(8, 6))
        plt.pie([avg_ham_prob, avg_spam_prob],
                labels=['Not Spam', 'Spam'],
                autopct='%1.1f%%',
                colors=['lightgreen', 'lightcoral'],
                startangle=90)
        plt.title('Average Spam Probability Distribution')
        plt.savefig('spam_distribution_pie.png')
        plt.close()

        # Create probability ranking
        self.create_probability_ranking(spam_probs)

    def create_probability_ranking(self, spam_probs):
        """Create and display a ranking of messages by spam probability"""
        ranked_indices = np.argsort(spam_probs)[::-1]  # Sort descending
        top_spam = ranked_indices[:5]
        top_ham = ranked_indices[-5:][::-1]

        print("\n=== Top Spam Predictions ===")
        for i in top_spam:
            text = self.test_env.texts[i][:100] + "..." if len(self.test_env.texts[i]) > 100 else self.test_env.texts[i]
            print(f"Prob: {spam_probs[i]:.4f} - {text}")

        print("\n=== Top Not Spam Predictions ===")
        for i in top_ham:
            text = self.test_env.texts[i][:100] + "..." if len(self.test_env.texts[i]) > 100 else self.test_env.texts[i]
            print(f"Prob: {spam_probs[i]:.4f} - {text}")

    def plot_learning_curves(self):
        plt.figure(figsize=(15, 5))

        # Plot training loss
        plt.subplot(1, 3, 1)
        plt.plot(self.train_losses)
        plt.title('Training Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')

        # Plot test accuracy
        plt.subplot(1, 3, 2)
        plt.plot(self.accuracies)
        plt.title('Test Accuracy')
        plt.xlabel('Evaluation')
        plt.ylabel('Accuracy')

        # Plot spam probability trend
        plt.subplot(1, 3, 3)
        plt.plot(self.spam_probs)
        plt.title('Average Spam Probability')
        plt.xlabel('Epoch')
        plt.ylabel('Probability')

        plt.tight_layout()
        plt.savefig('learning_curves_universal_spam.png')
        plt.close()

    def predict(self, text, is_sms=False):
        """Predict whether text is spam or not"""
        self.model.eval()
        preprocessed_text = preprocess_text(text, is_sms)

        with torch.no_grad():
            inputs = self.tokenizer(preprocessed_text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = self.model(**inputs)
            probs = F.softmax(logits, dim=-1)
            prediction = logits.argmax(-1).item()
            spam_prob = probs[0, 1].item()

        self.model.train()
        return {
            "prediction": "Spam" if prediction == 1 else "Not Spam",
            "spam_probability": spam_prob,
            "confidence": max(probs[0, 0].item(), probs[0, 1].item())
        }

def main():
    # You can use either email or SMS dataset here
    email_path = "/content/spam_ham_dataset.csv"
    sms_path = "../content/spam.csv"

    print("Loading and preprocessing data...")
    # For email data
    X_train, X_test, y_train, y_test = load_and_preprocess_data(email_path, is_sms=False, sample_size=2000)

    # For SMS data (uncomment to use)
    # X_train, X_test, y_train, y_test = load_and_preprocess_data(sms_path, is_sms=True, sample_size=2000)

    print(f"Training on {len(X_train)} examples, testing on {len(X_test)} examples")

    # Initialize model
    model = UniversalSpamClassifier()

    # Create trainer
    trainer = UniversalSpamTrainer(
        model=model,
        tokenizer=tokenizer,
        train_texts=X_train,
        train_labels=y_train,
        test_texts=X_test,
        test_labels=y_test
    )

    # Train model
    print("Training universal spam classifier...")
    trainer.train(epochs=1, eval_freq=1)

    # Plot learning curves
    trainer.plot_learning_curves()

    # Display plots
    import matplotlib.image as mpimg

    plt.figure(figsize=(15, 10))

    # Learning curves
    plt.subplot(2, 2, 1)
    img = mpimg.imread('learning_curves_universal_spam.png')
    plt.imshow(img)
    plt.axis('off')
    plt.title('Learning Curves')

    # Confusion matrix
    plt.subplot(2, 2, 2)
    img = mpimg.imread('confusion_matrix_universal_spam.png')
    plt.imshow(img)
    plt.axis('off')
    plt.title('Confusion Matrix')

    # Spam distribution
    plt.subplot(2, 2, 3)
    img = mpimg.imread('spam_distribution_pie.png')
    plt.imshow(img)
    plt.axis('off')
    plt.title('Spam Distribution')

    plt.tight_layout()
    plt.show()

    # Test with example messages
    test_messages = [
        ("Congratulations! You've won a free iPhone! Click here to claim now!", True),
        ("Meeting tomorrow at 10 AM. Please prepare the quarterly report.", False),
        ("URGENT: Your account has been compromised. Verify details now!", True),
        ("Hi John, just checking in to see how you're doing.", False),
        ("FREE entry to win £1000 prize! TEXT WIN to 12345 now!", True),
        ("Your package will be delivered tomorrow between 2-4pm.", False)
    ]

    print("\n=== Testing Universal Spam Classifier ===")
    results = []
    for msg, is_sms in test_messages:
        result = trainer.predict(msg, is_sms)
        results.append({
            "Message": msg,
            "Type": "SMS" if is_sms else "Email",
            "Prediction": result["prediction"],
            "Spam Probability": f"{result['spam_probability']:.4f}",
            "Confidence": f"{result['confidence']:.4f}"
        })

    # Display results as a table
    results_df = pd.DataFrame(results)
    print("\nTest Message Results:")
    print(results_df.to_string(index=False))

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Using device: cpu
Loading and preprocessing data...
Training on 1600 examples, testing on 400 examples


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training universal spam classifier...
Epoch 1/1


 24%|██▍       | 392/1600 [18:54<58:17,  2.90s/it]


KeyboardInterrupt: 

# **Installing Dependencies & Importing Libraries**

This cell installs the required Python packages `(transformers, torch, pandas, and scikit-learn)` and imports them for use in the notebook. The transformers library provides the RoBERTa model and tokenizer, while torch is used for deep learning operations. pandas handles data loading and preprocessing, and scikit-learn is used for splitting the dataset into training and testing sets. The GradScaler and autocast from torch.cuda.amp enable mixed-precision training, which speeds up training on GPUs while maintaining accuracy.

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
import time

# **Device Configuration & Hyperparameters**

This cell sets up the computing device `(GPU if available, otherwise CPU)` and configures TensorFloat-32 `(TF32)` for faster matrix operations on supported GPUs. The hyperparameters defined here control the model's behavior:

MAX_LENGTH = 128: Truncates or pads input text to this length.

BATCH_SIZE = 32: Number of samples processed per training step.

EPOCHS = 3: Number of full passes through the dataset.

SAMPLE_SIZE = 800: Limits the dataset size for faster experimentation.

MODEL_NAME = "distilroberta-base": Uses a smaller, faster version of RoBERTa.


In [7]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    torch.backends.cuda.matmul.allow_tf32 = True
print(f"Using device: {device}")

# Hyperparameters
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 10
SAMPLE_SIZE = 800
MODEL_NAME = "distilroberta-base"

Using device: cuda


# **Model Initialization & Optimizer Setup**

Here, the tokenizer and model are loaded from Hugging Face’s `transformers` library. The `RobertaForSequenceClassification` model is initialized with a new classification head (since it’s being fine-tuned for spam detection). The `ignore_mismatched_sizes=True` flag suppresses warnings about the new randomly initialized classifier layer. The AdamW optimizer is used with a learning rate of `5e-5`, which is standard for fine-tuning transformer models. The `GradScaler` is initialized to manage gradient scaling for mixed-precision training.



In [8]:
# ========== MODEL SETUP ==========
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    ignore_mismatched_sizes=True
).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


# **Data Loading & Evaluation Functions**

This cell defines two key functions:

**`load_data():`**

1. Reads a CSV file (either SMS or email format).

2. Handles encoding issues (falling back to latin1 if UTF-8 fails).

3. Preprocesses text (lowercasing and truncation).

4. Splits data into 80% training and 20% testing sets.

**`evaluate():`**

1. Computes model accuracy on test data.

2. Uses larger batches (BATCH_SIZE * 2) for faster evaluation.

3. Temporarily switches the model to evaluation mode (model.eval()) for inference.

In [9]:
# ========== DATA LOADING ==========
def load_data(file_path, is_sms=False):
    """Load dataset with proper encoding handling"""
    encoding = "ISO-8859-1" if is_sms else "utf-8"
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='latin1')

    if is_sms:
        df = df[['v1', 'v2']].sample(min(SAMPLE_SIZE, len(df)), random_state=42)
        texts, labels = df['v2'].tolist(), (df['v1'] == 'spam').astype(int).tolist()
    else:
        df = df[['text', 'label_num']].sample(min(SAMPLE_SIZE, len(df)), random_state=42)
        texts, labels = df['text'].tolist(), df['label_num'].tolist()

    texts = [str(t).lower()[:500] for t in texts]
    return train_test_split(texts, labels, test_size=0.2, random_state=42)

# ========== EVALUATION FUNCTION ==========
def evaluate(X_test, y_test):
    model.eval()
    test_encodings = tokenizer(X_test, padding=True, truncation=True,
                             max_length=MAX_LENGTH, return_tensors="pt")
    test_dataset = torch.utils.data.TensorDataset(
        test_encodings['input_ids'].to(device),
        test_encodings['attention_mask'].to(device),
        torch.tensor(y_test).to(device)
    )
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE*2)

    correct = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            correct += (outputs.logits.argmax(1) == labels).sum().item()

    model.train()
    return correct / len(y_test)

# **Training Loop & Execution**

This cell contains the core training logic and executes the full workflow:

**`train() Function:`**

1. Tokenizes and batches training data.

2. Uses mixed-precision training (autocast) on GPU for speed.

3. Updates model weights using gradient scaling (GradScaler) if on GPU.

4. Prints per-epoch metrics (loss, validation accuracy, and time).

**`Main Execution:`**

1. Loads the dataset (spam.csv).

2. Starts training and times the process.

3. Reports final accuracy on the test set.

**`Expected Output:`**

* Training progress for each epoch (loss and accuracy).

* Total training time and final model performance.

In [10]:
def train(X_train, y_train, X_test, y_test):
    train_encodings = tokenizer(X_train, padding=True, truncation=True,
                              max_length=MAX_LENGTH, return_tensors="pt")
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'].to(device),
        train_encodings['attention_mask'].to(device),
        torch.tensor(y_train).to(device)
    )
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch

            with autocast(enabled=(device.type == 'cuda')):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            if device.type == 'cuda':
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item()

        val_acc = evaluate(X_test[:200], y_test[:200])
        print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {epoch_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f} | Time: {time.time()-start_time:.2f}s")

# ========== MAIN EXECUTION ==========
if __name__ == "__main__":
    print("Loading data...")
    X_train, X_test, y_train, y_test = load_data("../content/spam.csv", is_sms=True)

    print(f"\nTraining on {len(X_train)} samples")
    print("Starting training...")
    train_start = time.time()
    train(X_train, y_train, X_test, y_test)

    final_acc = evaluate(X_test, y_test)
    print(f"\nTraining completed in {time.time()-train_start:.2f} seconds")
    print(f"Final Accuracy: {final_acc:.4f}")

Loading data...

Training on 640 samples
Starting training...


  with autocast(enabled=(device.type == 'cuda')):


Epoch 1/10 | Loss: 0.3159 | Val Acc: 0.9688 | Time: 2.42s
Epoch 2/10 | Loss: 0.0348 | Val Acc: 0.9938 | Time: 2.45s
Epoch 3/10 | Loss: 0.0360 | Val Acc: 0.9812 | Time: 2.45s
Epoch 4/10 | Loss: 0.0265 | Val Acc: 0.9938 | Time: 2.41s
Epoch 5/10 | Loss: 0.0049 | Val Acc: 0.9875 | Time: 2.40s
Epoch 6/10 | Loss: 0.0011 | Val Acc: 0.9875 | Time: 2.51s
Epoch 7/10 | Loss: 0.0008 | Val Acc: 0.9875 | Time: 2.63s
Epoch 8/10 | Loss: 0.0004 | Val Acc: 0.9938 | Time: 2.46s
Epoch 9/10 | Loss: 0.0006 | Val Acc: 0.9938 | Time: 2.62s
Epoch 10/10 | Loss: 0.0003 | Val Acc: 0.9875 | Time: 2.61s

Training completed in 25.68 seconds
Final Accuracy: 0.9875


# **COMPLETE CODE SEPARATELY!!!**

In [7]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
import time

# ========== Device Setup ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    torch.backends.cuda.matmul.allow_tf32 = True
print(f"Using device: {device}")

# ========== HYPERPARAMETERS ==========
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 10
SAMPLE_SIZE = 800
MODEL_NAME = "distilroberta-base"

# ========== MODEL SETUP ==========
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    ignore_mismatched_sizes=True
).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

# ========== DATA LOADING ==========
def load_data(file_path, is_sms=False):
    """Load dataset with proper encoding handling"""
    encoding = "ISO-8859-1" if is_sms else "utf-8"
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='latin1')

    if is_sms:
        df = df[['v1', 'v2']].sample(min(SAMPLE_SIZE, len(df)), random_state=42)
        texts, labels = df['v2'].tolist(), (df['v1'] == 'spam').astype(int).tolist()
    else:
        df = df[['text', 'label_num']].sample(min(SAMPLE_SIZE, len(df)), random_state=42)
        texts, labels = df['text'].tolist(), df['label_num'].tolist()

    texts = [str(t).lower()[:500] for t in texts]
    return train_test_split(texts, labels, test_size=0.2, random_state=42)

# ========== EVALUATION FUNCTION ==========
def evaluate(X_test, y_test):
    model.eval()
    test_encodings = tokenizer(X_test, padding=True, truncation=True,
                             max_length=MAX_LENGTH, return_tensors="pt")
    test_dataset = torch.utils.data.TensorDataset(
        test_encodings['input_ids'].to(device),
        test_encodings['attention_mask'].to(device),
        torch.tensor(y_test).to(device)
    )
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE*2)

    correct = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            correct += (outputs.logits.argmax(1) == labels).sum().item()

    model.train()
    return correct / len(y_test)

# ========== TRAINING LOOP ==========
def train(X_train, y_train, X_test, y_test):
    train_encodings = tokenizer(X_train, padding=True, truncation=True,
                              max_length=MAX_LENGTH, return_tensors="pt")
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'].to(device),
        train_encodings['attention_mask'].to(device),
        torch.tensor(y_train).to(device)
    )
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch

            with autocast(enabled=(device.type == 'cuda')):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            if device.type == 'cuda':
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item()

        val_acc = evaluate(X_test[:200], y_test[:200])
        print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {epoch_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f} | Time: {time.time()-start_time:.2f}s")

# ========== MAIN EXECUTION ==========
if __name__ == "__main__":
    print("Loading data...")
    X_train, X_test, y_train, y_test = load_data("../content/spam.csv", is_sms=True)

    print(f"\nTraining on {len(X_train)} samples")
    print("Starting training...")
    train_start = time.time()
    train(X_train, y_train, X_test, y_test)

    final_acc = evaluate(X_test, y_test)
    print(f"\nTraining completed in {time.time()-train_start:.2f} seconds")
    print(f"Final Accuracy: {final_acc:.4f}")

Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Loading data...

Training on 640 samples
Starting training...


  with autocast(enabled=(device.type == 'cuda')):


Epoch 1/10 | Loss: 0.2841 | Val Acc: 0.9750 | Time: 2.75s
Epoch 2/10 | Loss: 0.0707 | Val Acc: 0.9938 | Time: 2.41s
Epoch 3/10 | Loss: 0.0393 | Val Acc: 0.9938 | Time: 2.47s
Epoch 4/10 | Loss: 0.0126 | Val Acc: 0.9938 | Time: 2.48s
Epoch 5/10 | Loss: 0.0305 | Val Acc: 0.9875 | Time: 2.54s
Epoch 6/10 | Loss: 0.0068 | Val Acc: 0.9875 | Time: 2.53s
Epoch 7/10 | Loss: 0.0177 | Val Acc: 0.9875 | Time: 2.52s
Epoch 8/10 | Loss: 0.0046 | Val Acc: 0.9812 | Time: 2.71s
Epoch 9/10 | Loss: 0.0114 | Val Acc: 0.9812 | Time: 2.55s
Epoch 10/10 | Loss: 0.0018 | Val Acc: 0.9938 | Time: 2.59s

Training completed in 26.65 seconds
Final Accuracy: 0.9938
