### Adapting `train_distilbert.py` for cuDF

Since cuDF is a GPU-accelerated DataFrame library, we’ll replace pandas operations with cuDF where applicable. The script’s core training loop (PyTorch/Transformers) remains unchanged, but data loading, filtering, and augmentation will use cuDF. Note that `nlpaug` and some sklearn metrics don’t natively support cuDF, so we’ll convert back to pandas/NumPy where needed.

### Leverage RAPIDS

The `%load_ext cuml.accel` extension enables RAPIDS cuML GPU acceleration, but this script relies on PyTorch and Hugging Face Transformers, which use their own CUDA support. cuML acceleration therefore does not directly apply to this transformer-based code; the RAPIDS benefit here is limited to RAPIDS-compatible operations such as preprocessing with cuDF. Make sure your Colab runtime is set to GPU so that PyTorch can use CUDA.

### Key Changes:

- **WandB logging disabled**:
    - Added `report_to=[]` to `training_args_3class` and `binary_training_args` to disable WandB logging, avoiding the API key prompt.
    - Retained all previous fixes (NLTK downloads, `num_items_in_batch` handling, robust `handle_reviews`).

- **cuDF Integration**:
    - Replaced `pandas.read_csv` with `cudf.read_csv` for GPU-accelerated data loading.
    - Used `cudf.DataFrame` for data manipulation (e.g., `augmented_df`, `train_binary_df`, `test_binary_df`).
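    - Applied the empty-`processed_reviews` fallback (the former `handle_reviews` logic) directly to the cuDF DataFrames, substituting `verified_reviews` wherever the processed text is empty.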
    - Converted cuDF Series to pandas/NumPy where required (e.g., `to_pandas().tolist()` for `nlpaug`, `to_pandas().values` for `SentimentDataset`).

- **Binary Classification**:
    - Filtering out the neutral class (`feedback == 2`) already leaves only the labels 0 and 1, so no `apply_rows` label transformation is needed.

- **Device**:
    - Set `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')` to leverage GPU if available.

In [None]:
# IPython line magic: load the cuml.accel extension (only valid inside IPython/Jupyter).
%load_ext cuml.accel

In [None]:
# Install nlpaug with reduced output. pip's long option is `--quiet`;
# `-quiet` is parsed as a cluster of short flags and is rejected.
! pip install nlpaug --quiet

In [None]:
import cudf
import numpy as np
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset
import nlpaug.augmenter.word as naw
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
    auc,
    recall_score,
    precision_score,
)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk


def download_nltk_resources():
    """Download the NLTK resources this notebook relies on.

    Each resource is fetched with ``nltk.download(..., quiet=True)`` and the
    outcome is printed per resource.

    ``nltk.download`` reports most failures (e.g. no network, unknown
    resource id) by returning ``False`` rather than raising, and
    ``quiet=True`` suppresses its own error output. The return value is
    therefore checked so a failed download is not reported as a success.
    A ``False`` result is reported but not raised, because a previously
    cached copy of the resource may still be usable.

    Raises:
        Exception: Any exception raised by ``nltk.download`` is re-raised
            after being reported.
    """
    resources = ["averaged_perceptron_tagger_eng", "wordnet", "punkt", "punkt_tab"]
    for resource in resources:
        try:
            downloaded = nltk.download(resource, quiet=True)
        except Exception as e:
            print(f"Failed to download NLTK resource {resource}: {str(e)}")
            raise
        if downloaded:
            print(f"Successfully downloaded NLTK resource: {resource}")
        else:
            print(
                f"Failed to download NLTK resource {resource}: "
                "nltk.download returned False"
            )


# Download NLTK resources
download_nltk_resources()


In [None]:
# Load data with cudf
train_df = cudf.read_csv("../data/interim/train.csv")
test_df = cudf.read_csv("../data/interim/test.csv")


def _fill_empty_processed_reviews(df):
    """Return ``processed_reviews`` with empty entries replaced by ``verified_reviews``.

    An entry counts as empty when it is the empty string or null. Empty CSV
    fields are read back as nulls by default, so comparing against ``""``
    alone would miss them. The result contains no nulls: if the raw review
    is missing as well, the entry becomes ``""``.

    The operation is column-wise (no per-row Python lambda), so it stays on
    the GPU.
    """
    processed = df["processed_reviews"].fillna("")
    raw = df["verified_reviews"].fillna("")
    # Series.where keeps values where the condition holds and takes the
    # replacement from `raw` everywhere else.
    return processed.where(processed != "", raw)


# Handle empty processed reviews
train_df["processed_reviews"] = _fill_empty_processed_reviews(train_df)
test_df["processed_reviews"] = _fill_empty_processed_reviews(test_df)

# Calculate class weights (convert to pandas for numpy operations)
train_df_pd = train_df.to_pandas()
neg_count = int((train_df_pd['feedback'] == 0).sum())
pos_count = int((train_df_pd['feedback'] == 1).sum())
neu_count = int((train_df_pd['feedback'] == 2).sum())

# Weights are relative to the positive class (weight 1.0). A class that does
# not occur in the training data gets the neutral weight 1.0 instead of
# raising ZeroDivisionError.
class_weight_dict = {
    0: pos_count / neg_count if neg_count else 1.0,
    1: 1.0,
    2: pos_count / neu_count if neu_count else 1.0,
}

print(f"Class distribution - Negative: {neg_count}, Positive: {pos_count}, Neutral: {neu_count}")
print(f"Class weights: {class_weight_dict}")

# Augment negative reviews (convert to pandas for nlpaug)
# Filter the negative rows once and reuse the subset for texts and labels.
neg_train_df = train_df[train_df['feedback'] == 0]
neg_texts = neg_train_df['processed_reviews'].to_pandas().tolist()
neg_labels = neg_train_df['feedback'].to_pandas().tolist()
aug = naw.SynonymAug(aug_p=0.3)

augmented_texts = []
for text in neg_texts:
    if not (isinstance(text, str) and text):
        # Nothing to augment for missing/empty text; keep the entry so the
        # texts stay aligned one-to-one with neg_labels.
        augmented_texts.append(text)
        continue
    result = aug.augment(text)
    # augment() may hand back a list of candidates (possibly empty); take the
    # first candidate and fall back to the original text when there is none.
    if isinstance(result, list):
        result = result[0] if result else text
    augmented_texts.append(result)

# Augmented negatives are prepended to the full original training set.
augmented_df = cudf.DataFrame(
    {
        'processed_reviews': augmented_texts + train_df['processed_reviews'].to_pandas().tolist(),
        'feedback': neg_labels + train_df['feedback'].to_pandas().tolist(),
    }
)

# Prepare data for three-class transfer learning
X_train_aug = augmented_df['processed_reviews']
y_train_aug = augmented_df['feedback'].astype('int32')
X_test = test_df['processed_reviews']
y_test = test_df['feedback'].astype('int32')

In [None]:
# Custom Dataset
class SentimentDataset(Dataset):
    """Pre-tokenized text classification dataset.

    All texts are tokenized once in ``__init__`` (padded/truncated to
    ``max_length``); ``__getitem__`` only slices the cached tensors.

    Args:
        texts: Review texts. May be a cuDF Series (anything exposing
            ``to_pandas()``), a pandas Series, a NumPy array or a plain
            sequence. Entries that are not ``str`` (e.g. ``None``/NaN from
            null rows) are tokenized as the empty string, because the
            tokenizer is given a list of strings.
        labels: Integer class labels, same container options as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` whose result is indexed
            with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        host_texts = self._to_host(texts)
        if hasattr(host_texts, 'tolist'):
            text_list = host_texts.tolist()
        else:
            text_list = list(host_texts)
        # Replace missing values with "" so the tokenizer only sees strings.
        text_list = [t if isinstance(t, str) else "" for t in text_list]

        self.labels = np.asarray(self._to_host(labels))  # NumPy array of labels
        if len(text_list) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(text_list)} and {len(self.labels)}"
            )

        self.encodings = tokenizer(
            text_list,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    @staticmethod
    def _to_host(data):
        """Convert cuDF-like objects to pandas; pass anything else through."""
        return data.to_pandas() if hasattr(data, 'to_pandas') else data

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Focal Loss
class FocalLoss(torch.nn.Module):
    """Focal loss on top of cross-entropy: ``alpha * (1 - p_t) ** gamma * CE``.

    Args:
        alpha: Either a scalar that scales every sample's loss equally, or a
            per-class weight vector (tensor, list or tuple indexed by class
            id) from which each sample takes the weight of its target class.
            Per-class weights require every target to be a valid class index.
        gamma: Focusing exponent; larger values down-weight well-classified
            samples more strongly.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha=1.0, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss for logits ``inputs`` and class ids ``targets``."""
        ce_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the predicted probability of the target class.
        pt = torch.exp(-ce_loss)

        alpha = self.alpha
        if torch.is_tensor(alpha) or isinstance(alpha, (list, tuple)):
            alpha = torch.as_tensor(alpha, dtype=ce_loss.dtype, device=ce_loss.device)
            if alpha.dim() > 0:
                # Per-class weights: pick each sample's weight by its target.
                alpha = alpha[targets]

        focal_loss = alpha * (1 - pt) ** self.gamma * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

# Custom Trainer with Focal Loss
class FocalTrainer(Trainer):
    """Trainer that optimizes a class-weighted focal loss.

    Each sample's focal loss is multiplied by the weight of its target class
    taken from the module-level ``class_weight_dict``. Passing a single
    scalar as ``alpha`` would scale every sample identically and therefore
    would not rebalance the classes at all.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted focal loss (and the model outputs if requested).

        Args:
            model: Model to run; its ``device`` decides where tensors go.
            inputs: Batch dict containing ``'labels'`` plus the model inputs.
                ``'labels'`` is popped so it is not forwarded to the model.
            return_outputs: If true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility with the
                Trainer; not used here.
        """
        labels = inputs.pop('labels').to(model.device)
        outputs = model(**{k: v.to(model.device) for k, v in inputs.items()})
        logits = outputs.logits

        # Unreduced focal loss; class weighting is applied per sample below.
        per_sample_loss = FocalLoss(alpha=1.0, gamma=2.0, reduction='none')(logits, labels)

        # One weight per output class. Classes without an entry (and the case
        # where the model has fewer outputs than the dict) fall back cleanly.
        class_weights = torch.tensor(
            [class_weight_dict.get(c, 1.0) for c in range(logits.size(-1))],
            dtype=per_sample_loss.dtype,
            device=per_sample_loss.device,
        )
        loss = (class_weights[labels] * per_sample_loss).mean()
        return (loss, outputs) if return_outputs else loss

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"GPU Available: {torch.cuda.is_available()}")

# Transfer learning (three-class)
# Start from the pretrained base checkpoint with a 3-label classification head.
model_3class = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=3
)
model_3class.to(device)
# Both datasets tokenize their texts eagerly in the constructor.
train_dataset_3class = SentimentDataset(X_train_aug, y_train_aug, tokenizer)
test_dataset_3class = SentimentDataset(X_test, y_test, tokenizer)

# Batch size 8 with 2 gradient-accumulation steps gives 16 samples per
# optimizer step per device. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is selected by the 'recall_neg' metric
# returned from compute_metrics (higher is better).
training_args_3class = TrainingArguments(
    output_dir='./distilbert_3class_results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./distilbert_3class_logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='recall_neg',
    greater_is_better=True,
    report_to=[],  # Disable WandB logging
    learning_rate=2e-5,
)

def compute_metrics(pred):
    """Compute accuracy plus recall/precision of the negative class (label 0).

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (logits; argmax over axis 1 gives the class).

    Returns:
        dict with ``'accuracy'``, ``'recall_neg'`` and ``'precision_neg'``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # Restricting the averaged label set to [0] yields the metric of class 0
    # itself, for binary and multi-class targets alike. With average='micro'
    # over all classes, pos_label is ignored and the score equals accuracy,
    # so 'recall_neg' would not measure negative-class recall.
    neg_class_only = {'labels': [0], 'average': 'macro', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'recall_neg': recall_score(labels, preds, **neg_class_only),
        'precision_neg': precision_score(labels, preds, **neg_class_only),
    }

# Three-class trainer using the focal-loss Trainer subclass.
# NOTE(review): the test split is passed as eval_dataset, and the training
# arguments select the best checkpoint on that same data — metrics reported on
# the test set are therefore optimistic; consider a separate validation split.
trainer_3class = FocalTrainer(
    model=model_3class,
    args=training_args_3class,
    train_dataset=train_dataset_3class,
    eval_dataset=test_dataset_3class,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the three-class model, then save model and tokenizer side by side so
# the directory can later be loaded with from_pretrained (the binary stage
# below reads this same path).
trainer_3class.train()
model_3class.save_pretrained("../models/distilbert_3class_model")
tokenizer.save_pretrained("../models/distilbert_3class_model")

In [None]:
# Fine-tune for binary classification
# Drop the neutral class (feedback == 2) from both splits; .copy() gives
# independent frames.
train_binary_df = train_df[train_df['feedback'] != 2].copy()
test_binary_df = test_df[test_df['feedback'] != 2].copy()

# No label remapping is needed afterwards: assuming 'feedback' only takes the
# values 0, 1 and 2 (TODO confirm against the data), the filter above leaves
# just 0 (negative) and 1 (positive), which are already the binary labels.

# Augment negative reviews for binary
# Filter the negative rows once and reuse the subset for texts and labels.
neg_binary_df = train_binary_df[train_binary_df['feedback'] == 0]
neg_texts_binary = neg_binary_df['processed_reviews'].to_pandas().tolist()
neg_labels_binary = neg_binary_df['feedback'].to_pandas().tolist()
aug = naw.SynonymAug(aug_p=0.3) # Ensure 'aug' is defined if running this cell independently

augmented_texts_binary = []
for text in neg_texts_binary:
    if not (isinstance(text, str) and text):
        # Nothing to augment for missing/empty text; keep the entry so the
        # texts stay aligned one-to-one with neg_labels_binary.
        augmented_texts_binary.append(text)
        continue
    result = aug.augment(text)
    # augment() may hand back a list of candidates (possibly empty); take the
    # first candidate and fall back to the original text when there is none.
    if isinstance(result, list):
        result = result[0] if result else text
    augmented_texts_binary.append(result)

# Augmented negatives are prepended to the full binary training set.
augmented_binary_df = cudf.DataFrame(
    {
        'processed_reviews': augmented_texts_binary + train_binary_df['processed_reviews'].to_pandas().tolist(),
        'feedback': neg_labels_binary + train_binary_df['feedback'].to_pandas().tolist(),
    }
)

# Binary dataset
# Training uses the augmented frame; evaluation uses the unaugmented test rows.
train_binary_dataset = SentimentDataset(
    augmented_binary_df['processed_reviews'],
    augmented_binary_df['feedback'],
    tokenizer
)
test_binary_dataset = SentimentDataset(
    test_binary_df['processed_reviews'],
    test_binary_df['feedback'],
    tokenizer
)

# Initialize binary model
# Starts from the saved three-class checkpoint but with a 2-label head.
# ignore_mismatched_sizes=True allows loading despite the 3-vs-2 label
# mismatch; presumably the classification head is freshly initialized while
# the encoder weights are reused — confirm against the transformers docs.
model_binary = DistilBertForSequenceClassification.from_pretrained(
    '../models/distilbert_3class_model', num_labels=2, ignore_mismatched_sizes=True
)
model_binary.to(device)

# Binary training arguments
# Same setup as the three-class stage except for the output/log directories
# and a shorter warmup (200 instead of 500 steps).
binary_training_args = TrainingArguments(
    output_dir='./distilbert_binary_results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./distilbert_binary_logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='recall_neg',
    greater_is_better=True,
    report_to=[],  # Disable WandB logging
    learning_rate=2e-5,
)

# Binary trainer
# NOTE(review): as in the three-class stage, the test split doubles as the
# evaluation set that drives best-checkpoint selection.
binary_trainer = FocalTrainer(
    model=model_binary,
    args=binary_training_args,
    train_dataset=train_binary_dataset,
    eval_dataset=test_binary_dataset,
    compute_metrics=compute_metrics,
)
binary_trainer.train()
# Persist the fine-tuned binary model together with its tokenizer.
model_binary.save_pretrained('../models/distilbert_binary_model')
tokenizer.save_pretrained('../models/distilbert_binary_model')

In [None]:
# Evaluate binary model
predictions = binary_trainer.predict(test_binary_dataset)
binary_logits = predictions.predictions
# Probability of the positive class (column 1) and hard class decisions.
probs = torch.softmax(torch.tensor(binary_logits), dim=1)[:, 1].numpy()
pred_labels = np.argmax(binary_logits, axis=1)

# Ground-truth labels on the host, converted once and shared by all metrics.
y_true_binary = test_binary_df['feedback'].to_pandas()

# Metrics
binary_report = classification_report(y_true_binary, pred_labels)
print(f"\nBinary Classification Report:\n{binary_report}")

binary_roc_auc = roc_auc_score(y_true_binary, probs)
print(f"ROC-AUC: {binary_roc_auc}")

# precision/recall are kept as module-level names for later cells.
precision, recall, _ = precision_recall_curve(y_true_binary, probs, pos_label=1)
print(f"PR-AUC: {auc(recall, precision)}")

cm = confusion_matrix(y_true_binary, pred_labels)
print(f"\nConfusion Matrix:\n{cm}")

In [None]:
import os

# plt.savefig does not create missing directories, so make sure the target
# folder exists before writing the figures.
os.makedirs("../reports/figures/distilbert", exist_ok=True)

# Visualize PR curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f"PR-AUC = {auc(recall, precision):.2f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("DistilBERT Precision-Recall Curve")
plt.legend()
# Save before show(): the figure is written while it is still current.
plt.savefig("../reports/figures/distilbert/distilbert_pr_curve.png")
plt.show()
plt.close()

# Visualize confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("DistilBERT Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.savefig("../reports/figures/distilbert/distilbert_confusion_matrix.png")
plt.show()
plt.close()

In [None]:
# Save model
# NOTE(review): this repeats the save performed right after
# binary_trainer.train() (same objects, same target directory); it only
# matters when this cell is run on its own.
model_binary.save_pretrained("../models/distilbert_binary_model")
tokenizer.save_pretrained("../models/distilbert_binary_model")