In [1]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
!kaggle datasets download -d saurabhshahane/cyberbullying-dataset

Dataset URL: https://www.kaggle.com/datasets/saurabhshahane/cyberbullying-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading cyberbullying-dataset.zip to /content
 97% 63.0M/64.9M [00:00<00:00, 126MB/s]
100% 64.9M/64.9M [00:00<00:00, 128MB/s]


In [3]:
!unzip -q cyberbullying-dataset.zip

In [11]:
!pip install nltk torch torchtext

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    else:
        return ""

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmatized_tokens)

# Tokenizer function for TorchText
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)


In [13]:
racism_df = pd.read_csv('twitter_racism_parsed_dataset.csv')
aggression_df = pd.read_csv('aggression_parsed_dataset.csv')
toxicity_df = pd.read_csv('toxicity_parsed_dataset.csv')
sexism_df = pd.read_csv('twitter_sexism_parsed_dataset.csv')
youtube_df = pd.read_csv('youtube_parsed_dataset.csv')

racism_df = racism_df.dropna(subset=['oh_label'])
aggression_df = aggression_df.dropna(subset=['oh_label'])
toxicity_df = toxicity_df.dropna(subset=['oh_label'])
sexism_df = sexism_df.dropna(subset=['oh_label'])
youtube_df = youtube_df.dropna(subset=['oh_label'])

racism_df['Text'] = racism_df['Text'].apply(lambda x: tokenize_and_lemmatize(clean_text(x)))
aggression_df['Text'] = aggression_df['Text'].apply(lambda x: tokenize_and_lemmatize(clean_text(x)))
toxicity_df['Text'] = toxicity_df['Text'].apply(lambda x: tokenize_and_lemmatize(clean_text(x)))
sexism_df['Text'] = sexism_df['Text'].apply(lambda x: tokenize_and_lemmatize(clean_text(x)))
youtube_df['Text'] = youtube_df['Text'].apply(lambda x: tokenize_and_lemmatize(clean_text(x)))

combined_df = pd.concat([racism_df, aggression_df, toxicity_df, sexism_df, youtube_df])
combined_df.to_csv('combined_cyberbullying_dataset.csv', index=False)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

In [14]:
vocab = build_vocab_from_iterator(yield_tokens(train_df['Text']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def encode_text(text, vocab, max_length):
    token_ids = [vocab[token] for token in tokenizer(text)]
    if len(token_ids) < max_length:
        token_ids = token_ids + [0] * (max_length - len(token_ids))
    else:
        token_ids = token_ids[:max_length]
    return token_ids

max_length = 128
train_df['encoded_text'] = train_df['Text'].apply(lambda x: encode_text(x, vocab, max_length))
test_df['encoded_text'] = test_df['Text'].apply(lambda x: encode_text(x, vocab, max_length))

class CyberbullyingDataset(Dataset):
    def __init__(self, encoded_texts, labels):
        self.encoded_texts = encoded_texts
        self.labels = labels

    def __getitem__(self, idx):
        return torch.tensor(self.encoded_texts[idx]), torch.tensor(self.labels[idx])

    def __len__(self):
        return len(self.labels)

train_dataset = CyberbullyingDataset(train_df['encoded_text'].tolist(), train_df['oh_label'].tolist())
test_dataset = CyberbullyingDataset(test_df['encoded_text'].tolist(), test_df['oh_label'].tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [15]:
class AttentionLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(AttentionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.attention = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_output, (hidden, cell) = self.lstm(embedded)
        attn_weights = torch.softmax(self.attention(lstm_output), dim=1)
        context_vector = torch.sum(attn_weights * lstm_output, dim=1)
        context_vector = self.dropout(context_vector)
        output = self.fc(context_vector)
        return output

vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 2
bidirectional = True
dropout = 0.5

model = AttentionLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device, dtype=torch.long)  # Ensure labels are of type Long
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

Epoch 1, Loss: 0.18599458871156016
Epoch 2, Loss: 0.18389375380458295
Epoch 3, Loss: 0.17434802802938940
Epoch 4, Loss: 0.17294575502984209
Epoch 5, Loss: 0.17144302384572745
Epoch 6, Loss: 0.15284765014896539
Epoch 7, Loss: 0.14724460465683989
Epoch 8, Loss: 0.14238479284579134
Epoch 9, Loss: 0.140658134531p4910
Epoch 10, Loss: 0.1394572945923564


In [4]:
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Accuracy: 0.9439, Precision: 0.9502, Recall: 0.9854, F1 Score: 0.9674


In [19]:
torch.save(model.state_dict(), 'attention_lstm_model.pth')

In [20]:
model = AttentionLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)
model.load_state_dict(torch.load('attention_lstm_model.pth'))
model = model.to(device)

In [29]:
def predict_cyberbullying(text):
    text_cleaned = clean_text(text)
    text_tokenized = tokenize_and_lemmatize(text_cleaned)
    token_ids = encode_text(text_tokenized, vocab, max_length)
    input_tensor = torch.tensor(token_ids).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        _, prediction = torch.max(output, 1)
    return "Cyberbullying" if prediction.item() == 1 else "Not Cyberbullying"

new_text = "religion of hate"
prediction = predict_cyberbullying(new_text)
print(prediction)


Cyberbullying
