In [4]:
!pip install transformers torch pandas numpy emoji scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

# --- Load the raw inputs ---------------------------------------------------
# Jigsaw comments plus the EmoTag1200 per-emoji score table.
jigsaw = pd.read_csv("train.csv")
emotag = pd.read_csv("EmoTag1200-scores.csv")

# The six target columns, cast to float32 once so later stages need no casting.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
jigsaw[label_cols] = jigsaw[label_cols].astype('float32')

# --- Derive one toxicity score per emoji -------------------------------------
# Weighted sum of the emotion columns: positive weights raise the score and the
# negative weight on 'joy' lowers it. Emotions missing from the CSV are skipped.
toxicity_weights = {'anger': 0.9, 'disgust': 0.85, 'joy': -0.9, 'fear': 0.4}
weighted_total = 0
for emotion, coeff in toxicity_weights.items():
    if emotion not in emotag.columns:
        continue
    weighted_total = weighted_total + emotag[emotion] * coeff
emotag['toxicity_score'] = weighted_total

# Lookup table: emoji -> toxicity score.
emoji_toxicity = dict(zip(emotag['emoji'], emotag['toxicity_score']))

# Add emoji features (ensure float32)
def extract_emoji_features(text, emoji_scores=None):
    """Summarise the scored emojis found in ``text`` as three float32 features.

    Args:
        text: Comment text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as containing
            no emojis instead of raising ``TypeError``.
        emoji_scores: Optional mapping of emoji -> toxicity score. When omitted
            the module-level ``emoji_toxicity`` table is used.

    Returns:
        np.ndarray of shape (3,) and dtype float32 holding
        ``[match count, mean score, max score]`` over the matched emojis,
        or all zeros when nothing matches.
    """
    if emoji_scores is None:
        emoji_scores = emoji_toxicity
    # Iterating over a float NaN / None would raise, so bail out early.
    if not isinstance(text, str):
        return np.zeros(3, dtype='float32')
    # NOTE: matching is done one code point at a time, so a key made of several
    # code points can never match here.
    toxicities = [emoji_scores[ch] for ch in text if ch in emoji_scores]
    if not toxicities:
        return np.zeros(3, dtype='float32')
    return np.array(
        [len(toxicities), np.mean(toxicities), max(toxicities)],
        dtype='float32',
    )

# Compute the three emoji features for every comment, then attach them as columns.
emoji_feature_rows = jigsaw['comment_text'].apply(extract_emoji_features)
emoji_feature_matrix = np.stack(emoji_feature_rows)  # one 3-value row per comment
jigsaw[['emoji_cnt', 'emoji_avg', 'emoji_max']] = emoji_feature_matrix

In [6]:
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class ToxicCommentDataset(Dataset):
    """Map-style dataset pairing each comment with its emoji features and labels.

    Args:
        texts: Sequence of comment strings, indexable by integer position.
        emoji_feats: NumPy array with one row of emoji features per text.
        labels: NumPy array with one row of targets per text.
        text_tokenizer: Callable used to tokenize a single text. It is called
            with ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors='pt'`` and must return a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors. Defaults to the
            module-level ``tokenizer``.
        max_length: Length every example is padded / truncated to.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """

    def __init__(self, texts, emoji_feats, labels, text_tokenizer=None, max_length=128):
        if not (len(texts) == len(emoji_feats) == len(labels)):
            raise ValueError(
                "texts, emoji_feats and labels must have the same length, got "
                f"{len(texts)}, {len(emoji_feats)} and {len(labels)}"
            )
        self.texts = texts
        self.emoji_feats = emoji_feats.astype('float32')  # Explicit conversion
        self.labels = labels.astype('float32')            # Explicit conversion
        # The global is only looked up when no tokenizer was injected.
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors as a dict.

        Keys: ``input_ids`` and ``attention_mask`` (flattened to 1-D),
        ``emoji_feats`` and ``labels`` (float32).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'emoji_feats': torch.from_numpy(self.emoji_feats[idx]),  # Already float32
            'labels': torch.from_numpy(self.labels[idx])              # Already float32
        }

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from torch.utils.data import DataLoader

# Hold out 20% of the rows for validation; the fixed random_state keeps the split stable.
train_df, val_df = train_test_split(jigsaw, test_size=0.2, random_state=42)

_EMOJI_FEATURE_COLS = ['emoji_cnt', 'emoji_avg', 'emoji_max']


def _frame_to_dataset(frame):
    """Wrap one split's text, emoji features and labels in a ToxicCommentDataset."""
    return ToxicCommentDataset(
        texts=frame['comment_text'].tolist(),
        emoji_feats=frame[_EMOJI_FEATURE_COLS].values,
        labels=frame[label_cols].values,
    )


train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(val_df)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

In [8]:
from transformers import DistilBertModel
import torch.nn as nn

class ToxicityClassifier(nn.Module):
    """DistilBERT encoder combined with a small emoji-feature branch.

    The first-token hidden state of the encoder is concatenated with a linear
    projection of the emoji features, passed through dropout, and mapped to
    one raw logit per label (no sigmoid is applied here).

    Args:
        model_name: Pretrained DistilBERT checkpoint to load.
        num_emoji_feats: Number of emoji features per example.
        emoji_dim: Output width of the emoji projection.
        num_labels: Number of output logits.
        dropout: Dropout probability applied to the concatenated features.

    The defaults reproduce the original fixed architecture, and the submodule
    names are unchanged, so previously saved state_dicts still load.
    """

    def __init__(self, model_name='distilbert-base-uncased', num_emoji_feats=3,
                 emoji_dim=64, num_labels=6, dropout=0.1):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.emoji_layer = nn.Linear(num_emoji_feats, emoji_dim)
        # Read the encoder width from its config instead of hard-coding 768,
        # so a differently sized checkpoint does not break the classifier head.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size + emoji_dim, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, emoji_feats):
        """Return the logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            emoji_feats: Emoji features; the last dimension must equal
                ``num_emoji_feats``.

        Returns:
            Tensor of logits with ``num_labels`` values per example.
        """
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = bert_out.last_hidden_state[:, 0, :]  # CLS token
        emoji_out = self.emoji_layer(emoji_feats)
        combined = self.dropout(torch.cat([pooled, emoji_out], dim=1))
        return self.classifier(combined)

In [9]:
# Seed PyTorch's global RNG so the initial weights of the new layers, dropout
# and the shuffling of the training loader repeat from run to run.
torch.manual_seed(42)

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ToxicityClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Multi-label setup: the loss applies the sigmoid itself, so the model outputs raw logits.
loss_fn = nn.BCEWithLogitsLoss()

In [11]:
from sklearn.metrics import classification_report, f1_score
import numpy as np
from tqdm import tqdm  # For progress bars

def compute_metrics(preds, labels, threshold=0.5):
    """Turn raw logits into binary predictions and score them against ``labels``.

    Args:
        preds: Tensor of raw logits; a sigmoid is applied here.
        labels: Tensor of targets with the same shape as ``preds``.
        threshold: Probability above which a label counts as predicted.

    Returns:
        dict with
        - ``'accuracy'``: fraction of individual (sample, label) cells predicted
          correctly. This is element-wise accuracy, not exact-match accuracy,
          so it is dominated by the many negative cells.
        - ``'macro_f1'``: unweighted mean of the per-label F1 scores.
    """
    binary_preds = (torch.sigmoid(preds) > threshold).int().cpu()
    # Move the labels to the CPU as well, so both the comparison and
    # scikit-learn work even if the caller passes tensors still on the GPU.
    labels = labels.detach().cpu()
    return {
        'accuracy': (binary_preds == labels).float().mean().item(),
        'macro_f1': f1_score(
            labels.numpy(), binary_preds.numpy(), average='macro', zero_division=0
        )
    }

# Number of passes over the training data. The progress-bar labels and the
# final-report trigger are all derived from this single value.
NUM_EPOCHS = 3
# Checkpoint file that the evaluation cell loads with torch.load.
# NOTE: an existing file at this path is overwritten by the first epoch.
BEST_MODEL_PATH = 'best_model.pt'
best_macro_f1 = float('-inf')

for epoch in range(NUM_EPOCHS):
    # ---- Training ----
    model.train()
    train_loss = 0

    # Wrap train_loader with tqdm for progress
    with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch+1}/{NUM_EPOCHS} Training") as tepoch:
        for batch in tepoch:
            optimizer.zero_grad()
            # Everything except the labels is a keyword argument of model.forward.
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            # Update progress bar description
            tepoch.set_postfix({
                'loss': f"{loss.item():.4f}",
                'progress': f"{(tepoch.n/tepoch.total)*100:.1f}%"
            })

    # ---- Validation ----
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad(), tqdm(val_loader, unit="batch", desc=f"Epoch {epoch+1}/{NUM_EPOCHS} Validation") as vepoch:
        for batch in vepoch:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            val_labels.append(batch['labels'].cpu())
            outputs = model(**inputs)
            val_preds.append(outputs.cpu())

            vepoch.set_postfix({
                'progress': f"{(vepoch.n/vepoch.total)*100:.1f}%"
            })

    # Calculate metrics over the whole validation set at once
    val_preds = torch.cat(val_preds)
    val_labels = torch.cat(val_labels)
    metrics = compute_metrics(val_preds, val_labels)

    print(f"\nEpoch {epoch+1} Results:")
    print(f"Train Loss: {train_loss/len(train_loader):.4f}")
    print(f"Val Accuracy: {metrics['accuracy']:.4f} ({metrics['accuracy']*100:.1f}%)")
    print(f"Val Macro F1: {metrics['macro_f1']:.4f} ({metrics['macro_f1']*100:.1f}%)")

    # Keep the weights of the best epoch so the evaluation cell has a
    # checkpoint to load after a fresh "Restart & Run All".
    if metrics['macro_f1'] > best_macro_f1:
        best_macro_f1 = metrics['macro_f1']
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"Saved new best model to {BEST_MODEL_PATH}")

    # Show full report after last epoch
    if epoch == NUM_EPOCHS - 1:
        print("\nFinal Classification Report:")
        print(classification_report(
            val_labels.numpy(),
            (torch.sigmoid(val_preds) > 0.5).int().numpy(),
            target_names=label_cols,
            digits=4
        ))

Epoch 1/3 Training:   0%|          | 1/3990 [00:17<19:24:15, 17.51s/batch, loss=0.7195, progress=0.0%]


KeyboardInterrupt: 

In [14]:
from tqdm import tqdm

# Restore the saved weights. They are mapped onto the CPU first and the model
# is then moved to the active device.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
checkpoint_state = torch.load('best_model.pt', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)
model.to(device)
model.eval()

val_preds = []
val_labels = []

# Run the whole validation set through the model without tracking gradients.
with torch.no_grad():
    eval_bar = tqdm(val_loader, desc="Evaluating", unit="batch", total=len(val_loader))
    for batch in eval_bar:
        # Everything except the labels is a keyword argument of model.forward.
        inputs = {}
        for key, tensor in batch.items():
            if key != 'labels':
                inputs[key] = tensor.to(device)
        val_labels.append(batch['labels'].cpu())
        outputs = model(**inputs)
        val_preds.append(outputs.cpu())

# Join the per-batch results into single tensors.
val_preds = torch.cat(val_preds)
val_labels = torch.cat(val_labels)

# Score with the shared helper so the numbers are comparable to the training cell.
metrics = compute_metrics(val_preds, val_labels)
accuracy = metrics['accuracy']
macro_f1 = metrics['macro_f1']

print("\nModel Evaluation Results:")
print(f"Validation Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")
print(f"Validation Macro F1: {macro_f1:.4f} ({macro_f1*100:.1f}%)")

# Per-label breakdown using the same 0.5 probability cut-off.
binary_preds = (torch.sigmoid(val_preds) > 0.5).int().numpy()
report = classification_report(
    val_labels.numpy(),
    binary_preds,
    target_names=label_cols,
    digits=4
)
print("\nClassification Report:")
print(report)

Evaluating: 100%|██████████| 998/998 [51:14<00:00,  3.08s/batch]



Model Evaluation Results:
Validation Accuracy: 0.9849 (98.5%)
Validation Macro F1: 0.6289 (62.9%)

Classification Report:
               precision    recall  f1-score   support

        toxic     0.8413    0.8259    0.8336      3056
 severe_toxic     0.6336    0.2586    0.3673       321
      obscene     0.8507    0.8175    0.8338      1715
       threat     0.5909    0.3514    0.4407        74
       insult     0.7735    0.7509    0.7620      1614
identity_hate     0.7159    0.4286    0.5362       294

    micro avg     0.8183    0.7595    0.7878      7074
    macro avg     0.7343    0.5721    0.6289      7074
 weighted avg     0.8109    0.7595    0.7797      7074
  samples avg     0.0728    0.0717    0.0698      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
