In [112]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [113]:
# Load data
dataset = pd.read_csv("jigsaw-toxic-comment-train.csv")


In [114]:
dataset.shape

(223549, 8)

In [115]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate','neutral']

In [116]:
dataset['neutral'] = ((dataset[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) == 0).astype(int))

In [117]:
dataset

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,neutral
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0,1
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0,1
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0,1
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0,0


In [118]:
class_counts = dataset[classes].sum()
print(class_counts)

toxic             21384
severe_toxic       1962
obscene           12140
threat              689
insult            11304
identity_hate      2117
neutral          201081
dtype: int64


In [119]:
import pandas as pd

# Assuming 'dataset' is your original DataFrame and 'classes' is a list of class column names
df = pd.DataFrame()

for cls in classes:
    # Count the number of entries in the current class
    count = dataset[dataset[cls] == 1].shape[0]
    
    
    if count > 1500:
        # Sample 1500 rows if the class has more than 1500 entries
        sampled_class = dataset[dataset[cls] == 1].sample(n=1500, random_state=42)
    else:
        # Keep all rows if the class has 1500 or fewer entries
        sampled_class = dataset[dataset[cls] == 1]
    
    # Concatenate the sampled (or original) class data to the final DataFrame
    df = pd.concat([df, sampled_class])

# Reset the index of the resulting DataFrame
df.reset_index(drop=True, inplace=True)


In [120]:
from sklearn.model_selection import train_test_split
X = pd.DataFrame(df['comment_text'])
y = pd.DataFrame(df[classes])
X_train,X_eval_test,y_train,y_eval_test = train_test_split(X,y,test_size = 0.5,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_eval_test,y_eval_test,test_size=0.5,random_state=42)

In [121]:
print(type(X_train))  # <class 'pandas.core.frame.DataFrame'>


<class 'pandas.core.frame.DataFrame'>


In [122]:
X_test.shape

(2423, 1)

In [123]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

In [124]:
# Tokenization function using Hugging Face tokenizer
def encode_texts(texts, tokenizer, maxlen=512):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=maxlen,
        return_tensors='pt'
    )


In [125]:
X_test.columns

Index(['comment_text'], dtype='object')

In [126]:
# Encode datasets
x_train = encode_texts(X_train.comment_text.astype(str), tokenizer, maxlen=192)
x_valid = encode_texts(X_val.comment_text.astype(str), tokenizer, maxlen=192)
x_test = encode_texts(X_test.comment_text.astype(str), tokenizer, maxlen=192)

In [152]:
y_train = torch.tensor(y_train[classes].values)
y_valid = torch.tensor(y_val[classes].values)
# y_train = torch.tensor()

IndexError: too many indices for tensor of dimension 1

In [144]:
x_test

{'input_ids': tensor([[  101,   117, 12174,  ...,     0,     0,     0],
        [  101, 93951, 10230,  ...,     0,     0,     0],
        [  101, 10747, 29115,  ...,     0,     0,     0],
        ...,
        [  101,   107, 11005,  ...,     0,     0,     0],
        [  101, 26092, 11170,  ...,     0,     0,     0],
        [  101,   107,   131,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [128]:
# Custom dataset class
class ToxicCommentsDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [149]:
# Dataset and Dataloader
train_dataset = ToxicCommentsDataset(x_train, y_train)
valid_dataset = ToxicCommentsDataset(x_valid, y_valid)


train_loader = DataLoader(train_dataset, batch_size=16)  # Try reducing batch size
valid_loader = DataLoader(valid_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [153]:
classes

['toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate',
 'neutral']

In [154]:
final_classes = ['Dangerous','Potentially dangerous','Neutral']

In [132]:
# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_loader) * 3  
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)

# Loss function
loss_fn = torch.nn.BCEWithLogitsLoss()



In [133]:
# Training loop
def train_epoch(model, dataloader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    correct_predictions = 0
    
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].unsqueeze(1).float().to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [138]:
def eval_model(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].unsqueeze(1).float().to(device)  # Assuming binary classification
            
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            # Binary classification: Get predicted class (0 or 1) by applying a threshold of 0.5
            preds = torch.sigmoid(outputs.logits)
            preds = (preds > 0.5).float()  # Convert to 0 or 1 based on threshold

            # Calculate the number of correct predictions
            correct_predictions += (preds == labels).sum().item()
            total_samples += labels.size(0)  # Count total samples

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples  # Calculate accuracy
    return avg_loss, accuracy

In [140]:
# Assuming your training loop is already defined as you provided
epochs = 5
best_val_loss = float('inf')  # Initialize best validation loss to infinity
best_val_accuracy = 0.0  # Initialize best validation accuracy to 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    
    # Ensure that eval_model only returns two values
    val_loss, val_accuracy = eval_model(model, valid_loader, device)  
    
    print(f"Train Loss: {train_loss}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy:.2f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model_loss.pth')  
        print(f"Best model saved based on Validation Loss: {val_loss:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model_accuracy.pth')
        print(f"Best model saved based on Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/5


100%|██████████| 303/303 [21:44<00:00,  4.31s/it]


Train Loss: 0.1211324416549735, Validation Loss: 0.20373806166830227, Validation Accuracy: 0.93
Best model saved based on Validation Loss: 0.2037
Best model saved based on Validation Accuracy: 0.9315
Epoch 2/5


100%|██████████| 303/303 [26:30<00:00,  5.25s/it]


Train Loss: 0.11140237292797357, Validation Loss: 0.20373806166830227, Validation Accuracy: 0.93
Epoch 3/5


100%|██████████| 303/303 [16:33<00:00,  3.28s/it]


Train Loss: 0.10999113157170244, Validation Loss: 0.20373806166830227, Validation Accuracy: 0.93
Epoch 4/5


100%|██████████| 303/303 [16:27<00:00,  3.26s/it]


Train Loss: 0.11153200930506975, Validation Loss: 0.20373806166830227, Validation Accuracy: 0.93
Epoch 5/5


100%|██████████| 303/303 [16:23<00:00,  3.25s/it]


Train Loss: 0.11123406970658319, Validation Loss: 0.20373806166830227, Validation Accuracy: 0.93


In [63]:
x_test = encode_texts(test1.content.astype(str), tokenizer, maxlen=192)

In [None]:
model = model.load_state_dict(torch.load('best_model_loss.pth')) 

In [150]:
# Test Dataset
test_dataset = ToxicCommentsDataset(x_test)

test_loader = DataLoader(test_dataset, batch_size=32)

# Prediction
model.eval()
predictions = []

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        predictions.extend(preds)

# Convert predictions to a binary format if needed
# binary_predictions = [1 if pred >= 0.5 else 0 for pred in predictions]
print(predictions)

[array([0.956714], dtype=float32), array([0.99281996], dtype=float32), array([0.9915987], dtype=float32), array([0.92225295], dtype=float32), array([0.9618363], dtype=float32), array([0.9905128], dtype=float32), array([0.58998233], dtype=float32), array([0.9800613], dtype=float32), array([0.97308445], dtype=float32), array([0.9924408], dtype=float32), array([0.0249439], dtype=float32), array([0.96982175], dtype=float32), array([0.9376915], dtype=float32), array([0.03353335], dtype=float32), array([0.98062253], dtype=float32), array([0.988605], dtype=float32), array([0.9896611], dtype=float32), array([0.02527755], dtype=float32), array([0.9911299], dtype=float32), array([0.99237263], dtype=float32), array([0.5480378], dtype=float32), array([0.1311955], dtype=float32), array([0.9895978], dtype=float32), array([0.9877571], dtype=float32), array([0.0318653], dtype=float32), array([0.98345727], dtype=float32), array([0.97909385], dtype=float32), array([0.8360187], dtype=float32), array([0.0

In [147]:
def test_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    total_correct = 0
    total_samples = 0
    total_loss = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].unsqueeze(1).float().to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            # Get predicted class (binary classification: 0 or 1)
            preds = torch.sigmoid(outputs.logits)
            preds = (preds > 0.5).float()

            # Calculate the number of correct predictions
            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    # Compute the average loss and accuracy
    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_samples
    print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}")

    return avg_loss, accuracy

In [151]:
test_loss, test_accuracy = test_model(model, test_loader, 'cpu')

TypeError: object of type 'NoneType' has no len()