In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.data import DataLoader, Dataset

In [38]:
# Data source: Kaggle "Jigsaw Toxic Comment Classification Challenge" (train.csv), available at
# https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data?select=train.csv.zip

In [39]:
# Read the raw comments and collapse the six per-category columns into one
# target: `label` is the row-wise maximum of those columns. The id and the
# per-category columns are then dropped and the comment column is renamed.
df = (
    pd.read_csv('train.csv')
    .assign(
        label=lambda frame: frame[
            ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        ].max(axis=1)
    )
    .drop(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
    .rename(columns={"comment_text": "text"})
)
print(df.shape)
display(df.head())

(159571, 2)


Unnamed: 0,text,label
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [40]:
# Balance the classes: draw 10,000 rows for each label value with a fixed
# seed, then stack them (label-0 rows first, label-1 rows second).
df_0 = df.loc[df['label'].eq(0)].sample(n=10000, random_state=42)
df_1 = df.loc[df['label'].eq(1)].sample(n=10000, random_state=42)
df = pd.concat([df_0, df_1])
print(df["label"].value_counts())

label
0    10000
1    10000
Name: count, dtype: int64


In [41]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class ratio identical in both partitions.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
print(train_data.shape, test_data.shape)

# Targets come straight from the label column of each partition.
y_train = train_data['label'].values
y_test = test_data['label'].values

# TF-IDF features with English stop words removed. No max_features is passed,
# so the vocabulary is not truncated. The vocabulary and IDF weights are
# fitted on the training texts only; the test texts are transformed with that
# fitted vectorizer. .toarray() converts the sparse result to a dense array.
vectorizer = TfidfVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(train_data['text']).toarray()
X_test = vectorizer.transform(test_data['text']).toarray()

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(16000, 2) (4000, 2)
(16000, 44099) (4000, 44099) (16000,) (4000,)


In [42]:
# Turn the feature matrices and the label vectors into float32 tensors.
# Labels are stored as floats too because they are fed to nn.BCELoss.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, X_test, y_train, y_test)
)

In [43]:
# Dataset wrapper that pairs each input row with its label
class TextDataset(Dataset):
    """Index-aligned (input, label) pairs exposed through the Dataset protocol."""

    def __init__(self, inputs, labels):
        """Store the two indexable containers as given; nothing is copied."""
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Number of samples, taken from the length of ``inputs``."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` tuple stored at position ``idx``."""
        features = self.inputs[idx]
        target = self.labels[idx]
        return features, target

# Wrap the train and test tensors in datasets
train_dataset = TextDataset(inputs=X_train_tensor, labels=y_train_tensor)
test_dataset = TextDataset(inputs=X_test_tensor, labels=y_test_tensor)

# Batch iterators: training batches are reshuffled on every pass, test
# batches keep their original order. Both use 32 samples per batch.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [44]:
import torch.nn as nn

# One-hidden-layer feed-forward network for binary classification
class TextClassifier(nn.Module):
    """Linear(input_dim -> 128) -> ReLU -> Linear(128 -> 1) -> Sigmoid."""

    def __init__(self, input_dim):
        """Create the layers; ``input_dim`` is the width of each input row."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)  # hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=1)  # single output unit
        self.sigmoid = nn.Sigmoid()  # maps the output to the (0, 1) range

    def forward(self, x):
        """Return one value in (0, 1) per input row; the trailing dimension is 1."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

In [45]:
# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG before any weights are created. Without this the
# layer initialisation below and the batch order produced by train_loader
# (shuffle=True) differ on every run, even though sampling and the train/test
# split are already pinned with random_state=42. (Some GPU kernels may still
# be non-deterministic.)
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
model = TextClassifier(input_dim=X_train.shape[1]).to(device)  # input_dim = number of feature columns
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs (mean over the batch)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# Accuracy helper for probability outputs
def binary_accuracy(preds, y):
    """Return the fraction of entries of ``preds`` that equal ``y`` after rounding.

    ``preds`` is rounded with ``torch.round`` (ties go to the even integer, so
    exactly 0.5 becomes 0). The result is a 0-dim float tensor: the number of
    matches divided by the length of the first dimension.
    """
    hits = torch.eq(preds.round(), y).float()
    total = len(hits)
    return hits.sum() / total

# Training loop
NUM_EPOCHS = 10  # Number of full passes over the training data

for epoch in range(NUM_EPOCHS):
    model.train()  # Set model to training mode
    # Running sums over samples, not over batches. Averaging the per-batch
    # means would over-weight a final batch that holds fewer samples than the
    # others; weighting by batch size gives the true per-sample average.
    epoch_loss = 0.0
    epoch_acc = 0.0
    samples_seen = 0

    for (inputs, labels) in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)

        optimizer.zero_grad()  # Clear gradients left over from the previous step
        predictions = model(inputs).squeeze(1)  # (batch, 1) -> (batch,) to match labels

        loss = criterion(predictions, labels)  # Mean loss over this batch
        acc = binary_accuracy(predictions, labels)  # Fraction correct in this batch

        loss.backward()  # Backpropagate to compute gradients
        optimizer.step()  # Update model weights

        # Convert the batch means back to batch totals before accumulating
        epoch_loss += loss.item() * batch_size
        epoch_acc += acc.item() * batch_size
        samples_seen += batch_size

    print(f'Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {epoch_loss/samples_seen:.4f} | Accuracy: {epoch_acc/samples_seen:.4f}')


Epoch 1/10 | Loss: 0.3982 | Accuracy: 0.8558
Epoch 2/10 | Loss: 0.1347 | Accuracy: 0.9534
Epoch 3/10 | Loss: 0.0556 | Accuracy: 0.9851
Epoch 4/10 | Loss: 0.0270 | Accuracy: 0.9937
Epoch 5/10 | Loss: 0.0148 | Accuracy: 0.9972
Epoch 6/10 | Loss: 0.0088 | Accuracy: 0.9988
Epoch 7/10 | Loss: 0.0060 | Accuracy: 0.9989
Epoch 8/10 | Loss: 0.0042 | Accuracy: 0.9993
Epoch 9/10 | Loss: 0.0038 | Accuracy: 0.9994
Epoch 10/10 | Loss: 0.0028 | Accuracy: 0.9994


In [46]:
# Set model to evaluation mode
model.eval()

# Running sums over samples, not over batches: a final batch with fewer
# samples would otherwise count as much as a full one in the reported means.
test_loss = 0.0
test_acc = 0.0
test_samples = 0

with torch.no_grad():  # Disable gradient computation for evaluation
    for (inputs, labels) in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)
        predictions = model(inputs).squeeze(1)  # (batch, 1) -> (batch,) to match labels

        loss = criterion(predictions, labels)  # Mean loss over this batch
        acc = binary_accuracy(predictions, labels)  # Fraction correct in this batch

        # Convert the batch means back to batch totals before accumulating
        test_loss += loss.item() * batch_size
        test_acc += acc.item() * batch_size
        test_samples += batch_size

print(f'Test Loss: {test_loss/test_samples:.4f} | Test Accuracy: {test_acc/test_samples:.4f}')


Test Loss: 0.5747 | Test Accuracy: 0.8630


In [58]:
# Classify a single piece of text with the trained model
def predict_sentiment(model, text, vectorizer, device, threshold=0.5):
    """Score one text and map the score to a label.

    Args:
        model: module called on a (1, n_features) float32 tensor; its output
            must hold exactly one element.
        text: the string to classify.
        vectorizer: fitted object whose ``transform([text]).toarray()`` yields
            the feature row.
        device: device the feature tensor is moved to before the forward pass.
        threshold: scores at or above this value are labelled "Negative".

    Returns:
        ``(label, score)`` where ``label`` is "Negative" when
        ``score >= threshold`` and "Positive" otherwise. In this notebook a
        high score corresponds to class 1 of the training labels.

    Side effect: the model is switched to evaluation mode and left there.
    """
    model.eval()

    # Vectorize the text as a one-element batch and move it to the target device
    features = vectorizer.transform([text]).toarray()
    batch = torch.tensor(features, dtype=torch.float32).to(device)

    # Forward pass without gradient tracking; .item() extracts the Python float
    with torch.no_grad():
        score = model(batch).item()

    if score >= threshold:
        label = "Negative"
    else:
        label = "Positive"
    return label, score


In [62]:
# Try the classifier on one hand-written sentence; prints the (label, score) tuple
new_text = "I will eat a health meal!"
print(
    predict_sentiment(
        model=model,
        text=new_text,
        vectorizer=vectorizer,
        device=device,
    )
)

('Positive', 0.005528016947209835)


In [49]:
# Show which torch device was selected for the model and the batches
print(device)

cpu
