In [1]:
import pandas as pd
import os
import numpy as np
from collections import Counter

In [2]:
!python --version

Python 3.9.12


## Loading the files

In [3]:
def load_labels_from_csv(csv_file):
    """Load labels from the given CSV file."""
    df = pd.read_csv(csv_file)
    return df[['file_id', 'label']]

def load_data_from_directory(directory, labels_df):
    """Load text data from the directory and map labels."""
    texts = []
    labels = []

    # Create a mapping of file_id to label
    label_map = {row['file_id']: row['label'] for index, row in labels_df.iterrows()}

    # Loop through each file in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Check if the file is a .txt file
            file_id = filename.split('.')[0]  
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read().strip()  # Read the text from the file
                texts.append(text)

                # Assign label based on file_id
                if file_id in label_map:
                    labels.append(label_map[file_id])
                else:
                    continue  # Skip files that don't have a corresponding label

    return texts, labels

# Load labels from the CSV file
labels_csv_path = '../data/annotations_metadata.csv'  
labels_df = load_labels_from_csv(labels_csv_path)

# Load training data
train_directory = '../data/sampled_train'  
train_texts, train_labels = load_data_from_directory(train_directory, labels_df)

# Load testing data
test_directory = '../data/sampled_test'  
test_texts, test_labels = load_data_from_directory(test_directory, labels_df)

In [4]:
len(train_texts)

1914

In [5]:
Counter(train_labels)

Counter({'noHate': 957, 'hate': 957})

The dataset is balanced so wouldn't have to take any extra steps for a balanced data.

In [6]:
len(test_texts)

478

## Preprocessing Data

In [7]:
import tensorflow as tf

print(tf.__version__)

2024-10-17 13:22:55.646152: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


KeyboardInterrupt: 

In [30]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shruti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shruti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocess_text(text, use_lemmatization=True):
    """Preprocess the input text."""
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    
    # Initialize lemmatizer 
    lemmatizer = WordNetLemmatizer()
    
    # Apply lemmatization
    if use_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

def preprocess_data(texts, use_lemmatization=True):
    """Preprocess a list of texts."""
    return [preprocess_text(text, use_lemmatization) for text in texts]


train_texts_preprocessed = preprocess_data(train_texts, use_lemmatization=True)
test_texts_preprocessed = preprocess_data(test_texts, use_lemmatization=True)

## Building the classification model

In [43]:
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts_preprocessed).toarray()
y_train = train_labels  

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor([1 if label == 'hate' else 0 for label in y_train], dtype=torch.long)


X_test = vectorizer.transform(test_texts_preprocessed).toarray()
y_test = test_labels  # Corresponding labels
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor([1 if label == 'hate' else 0 for label in y_test], dtype=torch.long)

## Creating models

### Feedforward Neural Network (FNN)

In [44]:
import torch.nn as nn

class TextClassifierFNN(nn.Module):
    def __init__(self, input_size):
        super(TextClassifierFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 2)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the FNN model
input_size = X_train.shape[1]
model_fnn = TextClassifierFNN(input_size)

### Convolutional Neural Network


In [45]:
class TextClassifierCNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextClassifierCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc1 = nn.Linear(32 * ((input_size - 3 + 1) // 2), 64)  # Adjusted input size after pooling
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a channel dimension
        x = self.conv1(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the output
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


model_cnn = TextClassifierCNN(input_size, 2)

### Recurrent Neural Network using LSTM

In [46]:
class TextClassifierLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TextClassifierLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        x = x.unsqueeze(1)  # Add a sequence dimension
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]  # Get the output of the last time step
        x = self.fc1(x)
        return x

# Instantiate the LSTM model
model_lstm = TextClassifierLSTM(input_size, hidden_size=64, num_classes=2)

## Training Phase

In [47]:
import torch.optim as optim

def train_model(model, X_train_tensor, y_train_tensor, num_epochs=10, learning_rate=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        
        if (epoch + 1) % 2 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Train all models
print("Training Feedforward Neural Network:")
train_model(model_fnn, X_train_tensor, y_train_tensor)

print("Training Convolutional Neural Network:")
train_model(model_cnn, X_train_tensor, y_train_tensor)

print("Training LSTM Neural Network:")
train_model(model_lstm, X_train_tensor, y_train_tensor)

Training Feedforward Neural Network:
Epoch [2/10], Loss: 0.6889
Epoch [4/10], Loss: 0.6761
Epoch [6/10], Loss: 0.6620
Epoch [8/10], Loss: 0.6458
Epoch [10/10], Loss: 0.6279
Training Convolutional Neural Network:
Epoch [2/10], Loss: 7.6104
Epoch [4/10], Loss: 3.6241
Epoch [6/10], Loss: 2.3205
Epoch [8/10], Loss: 1.4040
Epoch [10/10], Loss: 1.1089
Training LSTM Neural Network:
Epoch [2/10], Loss: 0.6902
Epoch [4/10], Loss: 0.6828
Epoch [6/10], Loss: 0.6749
Epoch [8/10], Loss: 0.6666
Epoch [10/10], Loss: 0.6576


In [48]:
# Function to evaluate the model
def evaluate_model(model, X_val, y_val):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(X_val)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == y_val).sum().item() / y_val.size(0)
    return accuracy

# Evaluate models
print("Evaluating Feedforward Neural Network:")
val_accuracy_fnn = evaluate_model(model_fnn, X_val_tensor, y_val_tensor)
print(f'Validation Accuracy: {val_accuracy_fnn:.4f}')

print("Evaluating Convolutional Neural Network:")
val_accuracy_cnn = evaluate_model(model_cnn, X_val_tensor, y_val_tensor)
print(f'Validation Accuracy: {val_accuracy_cnn:.4f}')

print("Evaluating LSTM Neural Network:")
val_accuracy_lstm = evaluate_model(model_lstm, X_val_tensor, y_val_tensor)
print(f'Validation Accuracy: {val_accuracy_lstm:.4f}')

Evaluating Feedforward Neural Network:
Validation Accuracy: 0.9347
Evaluating Convolutional Neural Network:
Validation Accuracy: 0.4386
Evaluating LSTM Neural Network:
Validation Accuracy: 0.8642
