Fine-tuned DistilBERT Model for Multiclass Sentiment Analysis

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the train and test datasets with error handling
train_file = '/content/train.csv'  # Update with the path to your train.csv
test_file = '/content/test.csv'    # Update with the path to your test.csv

# Parse with pandas' default quote handling. The comment text is free-form and
# is assumed (TODO confirm against the actual files) to be stored in quoted
# fields that can contain commas or newlines; with quoting disabled such
# fields are split apart and text fragments end up in the label columns.
try:
    df_train = pd.read_csv(train_file, on_bad_lines='skip')
    df_test = pd.read_csv(test_file, on_bad_lines='skip')
except pd.errors.ParserError as err:
    print("Error parsing the CSV file. Please check the file for formatting issues.")
    print(f"Parser detail: {err}")
    # Abort explicitly: nothing below can run without both DataFrames.
    raise SystemExit(1) from err

# Ensure there are no leading/trailing spaces in column names
df_train.columns = df_train.columns.str.strip().str.replace('"', '')
df_test.columns = df_test.columns.str.strip().str.replace('"', '')

# Define the target columns and validate the schema before touching any column
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

if 'comment_text' not in df_train.columns:
    raise KeyError("Column comment_text not found in train dataset")
if 'comment_text' not in df_test.columns:
    raise KeyError("Column comment_text not found in test dataset")
for col in target_columns:
    if col not in df_train.columns:
        raise KeyError(f"Column {col} not found in train dataset")

# Replace NaN in comments with empty string and ensure all are strings
X_test = df_test['comment_text'].fillna('').astype(str)

# Keep only rows whose six labels are all integer-valued. The combined label is
# later split on '-' and cast back to int, so a row carrying NaN or stray text
# in a label column would create a class that cannot be decoded.
label_frame = df_train[target_columns].apply(pd.to_numeric, errors='coerce')
valid_rows = (label_frame % 1 == 0).all(axis=1)  # NaN compares False here
dropped_rows = int((~valid_rows).sum())
if dropped_rows:
    print(f"Dropping {dropped_rows} training rows with missing or non-integer labels.")
df_train = df_train.loc[valid_rows].copy()
label_frame = label_frame.loc[valid_rows].astype(int)
if df_train.empty:
    raise ValueError("No training rows with valid labels were found")
for col in target_columns:
    df_train[col] = label_frame[col]

# Combine the target columns into a single column for multiclass classification
# (e.g. '1-0-1-0-0-0'), built from the cleaned integer labels.
df_train['combined_target'] = label_frame.astype(str).agg('-'.join, axis=1)

# Encode the combined target labels. The encoder is fitted before sampling so
# it knows every label combination present in the full training file.
label_encoder = LabelEncoder()
df_train['encoded_target'] = label_encoder.fit_transform(df_train['combined_target'])

# Reduce the dataset size for quicker training
df_train = df_train.sample(frac=0.1, random_state=42)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df_train['comment_text'].fillna('').astype(str),
    df_train['encoded_target'],
    test_size=0.2,
    random_state=42
)

# Define a custom dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Iterable of input strings (list, pandas Series, ...).
        labels: Iterable of integer class ids, one per text.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, **kwargs)``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise to lists so that ``idx`` is always positional. A pandas
        # Series is indexed by label, which no longer matches 0..n-1 once the
        # data has been sampled or shuffled.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its label at ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly rather than through ``encode_plus``.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            # The tokenizer output is flattened to 1-D so that batching adds
            # the batch dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap the training and validation splits as tokenizing datasets (128 tokens each)
train_dataset = SentimentDataset(
    texts=X_train.tolist(),
    labels=y_train.tolist(),
    tokenizer=tokenizer,
    max_len=128,
)
val_dataset = SentimentDataset(
    texts=X_val.tolist(),
    labels=y_val.tolist(),
    tokenizer=tokenizer,
    max_len=128,
)

# Classification model with one output per class known to the label encoder
num_classes = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_classes,
)

# Hyper-parameters and logging/evaluation cadence handed to the Trainer
training_config = dict(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=50,
)
training_args = TrainingArguments(**training_config)


def _accuracy(eval_prediction):
    """Return the share of examples whose arg-max prediction equals its label id."""
    predicted_ids = eval_prediction.predictions.argmax(-1)
    return {"accuracy": (predicted_ids == eval_prediction.label_ids).mean()}


# Assemble the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_accuracy,
)

# Fine-tune, then run a final pass over the validation set
trainer.train()
trainer.evaluate()

# Persist the model and tokenizer side by side in one directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('fine_tuned_distilbert')

# Build a dataset/loader over the test comments; labels are zero placeholders
test_comments = df_test['comment_text'].fillna('')
test_texts = test_comments.astype(str).tolist()
placeholder_labels = [0 for _ in test_texts]
test_dataset = SentimentDataset(test_texts, placeholder_labels, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Get predictions
def predict(texts, batch_size=32):
    """Return the decoded label for each input text.

    Texts are tokenized and classified in mini-batches with gradient tracking
    disabled, so memory use is bounded by ``batch_size`` rather than by the
    number of texts. Inputs are moved to the device the model currently lives
    on, and results are brought back to the CPU before decoding.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        Array of labels produced by ``label_encoder.inverse_transform``, one
        per input text; an empty array when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 (got {batch_size})")
    if isinstance(texts, str):
        # A bare string is one example, not a sequence of characters.
        texts = [texts]
    texts = list(texts)
    if not texts:
        return label_encoder.classes_[:0]

    device = next(model.parameters()).device
    was_training = model.training
    model.eval()  # disable dropout while predicting
    predicted_ids = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                logits = model(**inputs).logits
                predicted_ids.append(torch.argmax(logits, dim=1).cpu())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return label_encoder.inverse_transform(torch.cat(predicted_ids).numpy())

# Classify every test comment (missing comments are treated as empty strings)
test_comment_column = df_test['comment_text'].fillna('')
test_texts = test_comment_column.astype(str).tolist()
predictions = predict(test_texts)

# Store the combined prediction string next to each comment
df_test['predicted'] = predictions

# Expand the '-'-joined prediction into one column per target label
split_labels = df_test['predicted'].str.split('-', expand=True)
df_test[target_columns] = split_labels

# Cast the expanded label columns from strings to integers
df_test = df_test.astype({label_name: int for label_name in target_columns})

# Write comments together with their predictions to disk
df_test.to_csv('distilbert_multiclass_predictions.csv', index=False)

print("Detailed predictions saved to distilbert_multiclass_predictions.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,9.3125,9.210883,0.338034
100,7.2713,6.960899,0.713971
150,5.2346,4.981112,0.713971
200,3.2591,2.995152,0.717937
250,2.4602,1.979738,0.821067
300,1.4515,1.802271,0.82327
350,1.3498,1.755579,0.824152
400,1.5481,1.706249,0.817541
450,1.4745,1.685036,0.824592
500,1.124,1.672089,0.82327


Step,Training Loss,Validation Loss,Accuracy
50,9.3125,9.210883,0.338034
100,7.2713,6.960899,0.713971
150,5.2346,4.981112,0.713971
200,3.2591,2.995152,0.717937
250,2.4602,1.979738,0.821067
300,1.4515,1.802271,0.82327
350,1.3498,1.755579,0.824152
400,1.5481,1.706249,0.817541
450,1.4745,1.685036,0.824592
500,1.124,1.672089,0.82327
