# Toxic Comment Classifier - Model Training
- Train both TF-IDF + Logistic Regression and DistilBERT
- Save both models for inference

In [1]:
# Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import os

os.makedirs('models', exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load and preprocess dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Ensure you have the 'train.csv' file from the Kaggle competition 
# (Jigsaw Toxic Comment Classification Challenge) in your 'Mini Project' root folder.

# --- START OF UPDATED CODE ---

# 1. Load the full toxic comment dataset (assumes 'train.csv' is present)
try:
    # The Kaggle dataset has an 'id' column, the 'comment_text', and 6 label columns.
    df = pd.read_csv('train.csv')
    
    # We will use the 'comment_text' and the primary 'toxic' label.
    # The 'toxic' column has a '1' if the comment is toxic, and '0' otherwise.
    df = df[['comment_text', 'toxic']]
    
    # NOTE: The full dataset is very large (~160,000 rows). 
    # If training takes too long, you can uncomment the line below 
    # to use a smaller sample (e.g., 25,000 rows).
    # df = df.sample(n=25000, random_state=42) 
    
    print(f"Dataset loaded with {len(df)} samples.")
    
except FileNotFoundError:
    print("ERROR: train.csv not found. Please download the Kaggle dataset and place it in the project folder.")
    # Keep the tiny sample only as a fallback if the file is truly missing (DO NOT USE THIS FOR FINAL MODELS)
    df = pd.DataFrame({
        'comment_text': ['You are awesome!','You are so stupid.','What a nice person.','I hate you.','Lets be friends.'],
        'toxic': [0, 1, 0, 1, 0]
    })
# --- END OF UPDATED CODE ---

# Preprocessing (simple lowercase/strip)
df['comment_text'] = df['comment_text'].str.lower().str.strip()
X = df['comment_text']
y = df['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {len(X_train)} samples. Testing on {len(X_test)} samples.")

Dataset loaded with 159571 samples.
Training on 127656 samples. Testing on 31915 samples.


## 2. Train baseline model (TF-IDF + Logistic Regression)

In [3]:
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Save models
joblib.dump(vectorizer, 'models/tfidf_vectorizer.pkl')
joblib.dump(clf, 'models/tfidf_logreg.pkl')

['models/tfidf_logreg.pkl']

## 3. Train advanced model (DistilBERT)

In [4]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

class ToxicDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=128)
        self.labels = labels.tolist()
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        
        return item
    def __len__(self):
        return len(self.labels)

# Assuming X_train, y_train, X_test, y_test are already defined
train_dataset = ToxicDataset(X_train, y_train)
test_dataset = ToxicDataset(X_test, y_test)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir='./logs',
    logging_steps=5,
    save_strategy='epoch',     # Changed from 'no' to 'epoch'
    disable_tqdm=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Save model and tokenizer
model.save_pretrained('models/distilbert')
tokenizer.save_pretrained('models/distilbert')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6857, 'grad_norm': 4.002100944519043, 'learning_rate': 4.999843328946544e-05, 'epoch': 7.833552672808173e-05}
{'loss': 0.4104, 'grad_norm': 3.18501877784729, 'learning_rate': 4.999647490129724e-05, 'epoch': 0.00015667105345616345}
{'loss': 0.1364, 'grad_norm': 0.9610564708709717, 'learning_rate': 4.999451651312904e-05, 'epoch': 0.00023500658018424516}
{'loss': 0.3457, 'grad_norm': 0.47764360904693604, 'learning_rate': 4.999255812496084e-05, 'epoch': 0.0003133421069123269}
{'loss': 0.8592, 'grad_norm': 10.63864803314209, 'learning_rate': 4.999059973679263e-05, 'epoch': 0.0003916776336404086}
{'loss': 0.0136, 'grad_norm': 0.20956867933273315, 'learning_rate': 4.998864134862443e-05, 'epoch': 0.0004700131603684903}
{'loss': 0.4633, 'grad_norm': 0.17883095145225525, 'learning_rate': 4.998668296045623e-05, 'epoch': 0.0005483486870965721}
{'loss': 0.4972, 'grad_norm': 0.14804033935070038, 'learning_rate': 4.998472457228803e-05, 'epoch': 0.0006266842138246538}
{'loss': 0.0089, 'grad



{'loss': 0.5478, 'grad_norm': 0.1298954337835312, 'learning_rate': 2.499960832236636e-05, 'epoch': 1.0000313342106912}
{'loss': 0.0038, 'grad_norm': 0.1126212626695633, 'learning_rate': 2.4997649934198157e-05, 'epoch': 1.0001096697374192}
{'loss': 0.0049, 'grad_norm': 0.1819542646408081, 'learning_rate': 2.4995691546029958e-05, 'epoch': 1.0001880052641474}
{'loss': 0.5357, 'grad_norm': 0.11901630461215973, 'learning_rate': 2.4993733157861755e-05, 'epoch': 1.0002663407908754}
{'loss': 0.0046, 'grad_norm': 0.12715022265911102, 'learning_rate': 2.4991774769693553e-05, 'epoch': 1.0003446763176036}
{'loss': 0.0041, 'grad_norm': 0.13520893454551697, 'learning_rate': 2.498981638152535e-05, 'epoch': 1.0004230118443316}
{'loss': 0.5734, 'grad_norm': 0.1085340678691864, 'learning_rate': 2.4987857993357148e-05, 'epoch': 1.0005013473710598}
{'loss': 0.0046, 'grad_norm': 0.11243055015802383, 'learning_rate': 2.498589960518895e-05, 'epoch': 1.0005796828977878}
{'loss': 0.5696, 'grad_norm': 0.1291290

('models/distilbert\\tokenizer_config.json',
 'models/distilbert\\special_tokens_map.json',
 'models/distilbert\\vocab.txt',
 'models/distilbert\\added_tokens.json',
 'models/distilbert\\tokenizer.json')

## 4. Test loading models (optional)

In [5]:
# Load TF-IDF model
vec = joblib.load('models/tfidf_vectorizer.pkl')
clf = joblib.load('models/tfidf_logreg.pkl')
print(clf.predict(vec.transform(['i love you'])))

# Load DistilBERT model
from transformers import DistilBertForSequenceClassification
import torch
model = DistilBertForSequenceClassification.from_pretrained('models/distilbert')
tokenizer = DistilBertTokenizerFast.from_pretrained('models/distilbert')
inputs = tokenizer('i love you', return_tensors='pt')
outputs = model(**inputs)
print(outputs.logits)

[0]
tensor([[ 4.2342, -3.2920]], grad_fn=<AddmmBackward0>)
