In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [None]:
# Load the full cyber-aggression dataset from Google Drive into a DataFrame.
# Later cells read its 'Text' and 'label' columns.
data_og = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset for Detection of Cyber-Aggression.csv')

In [None]:
# Take a random sample of 2000 rows; random_state=42 fixes the seed so the same rows are drawn each run
data = data_og.sample(n=2000, random_state=42)

In [None]:

# Materialise the 'Text' and 'label' columns of the sampled frame as plain Python lists
texts, labels = data['Text'].tolist(), data['label'].tolist()

In [None]:
# Hold out 20% of the sampled rows for testing; the fixed random_state keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint
# (the recorded output shows its files being downloaded from the Hugging Face Hub)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Encode both splits with the BERT tokenizer: truncation and padding on, capped at 128 tokens
train_encodings = tokenizer_bert(
    train_texts,
    max_length=128,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer_bert(
    test_texts,
    max_length=128,
    truncation=True,
    padding=True,
)


In [None]:
# Convert string labels to integer class-id tensors: 'AG' -> 1, anything else -> 0.
# The cell rebinds train_labels/test_labels in place, so without the guards a second
# run would iterate over the tensors themselves, find no element equal to 'AG', and
# silently relabel every row as 0. The is_tensor checks make the cell safe to re-run.
if not torch.is_tensor(train_labels):
    train_labels = torch.tensor([1 if label == 'AG' else 0 for label in train_labels])
if not torch.is_tensor(test_labels):
    test_labels = torch.tensor([1 if label == 'AG' else 0 for label in test_labels])


In [None]:

# Training set: bundle token ids, attention masks and labels into one TensorDataset
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    train_labels,
)


In [None]:
# Test set: same three-tensor layout as the training dataset
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    test_labels,
)


In [None]:
# Load the pretrained 'bert-base-uncased' weights with a 2-class sequence-classification head.
# Per the recorded output, the classifier weights are newly initialised, so the model
# must be fine-tuned before its predictions are meaningful.
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define optimizer
# Learning rate = 1e-5
# Uses PyTorch's own AdamW instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers class used by
# default, so the update rule stays the same as before.
optimizer = torch.optim.AdamW(model_bert.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)




In [None]:
# Wrap both datasets in batch iterators of size 4; only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=4,
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=4,
)


In [None]:
# Training loop: fine-tune BERT with one optimizer step per training batch
model_bert.train()  # switch the model to training mode
for epoch in range(8):  # We can adjust the number of epochs
    # Batch-local names are used on purpose: unpacking into `labels` would overwrite
    # the notebook-level `labels` list that the train/test split cell reads, breaking
    # that cell if it is re-run after training.
    for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model_bert(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        # Passing labels makes the model return the loss alongside the logits
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation: collect predicted and true class ids for every test batch
model_bert.eval()  # switch the model to evaluation mode
predictions = []
true_labels = []
# No gradients are needed for inference, so the whole loop runs under no_grad
with torch.no_grad():
    # Batch-local names avoid overwriting the notebook-level `labels` list
    for batch_input_ids, batch_attention_mask, batch_labels in test_loader:
        outputs = model_bert(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())



In [None]:

# Score the BERT predictions against the true test labels
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Apply each metric in turn to the same (true, predicted) pair
accuracy, precision, recall, f1 = (
    score_fn(true_labels, predictions)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)


In [None]:
# Print each BERT test-set metric on its own line
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)

Accuracy: 0.675
Precision: 0.8333333333333334
Recall: 0.29411764705882354
F1 Score: 0.4347826086956522


# RoBERTa-base

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd


In [None]:
import pandas as pd
# Read the full cyber-aggression CSV from Google Drive.
# Note: this rebinds `data`, which the BERT section used for its 2000-row sample.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset for Detection of Cyber-Aggression.csv')


In [None]:
# Overview of dataset: list the distinct values found in the 'label' column
data['label'].unique()

array(['AG', 'NAG'], dtype=object)

In [None]:
# Summary statistics for every column, including the non-numeric ones (count, unique, top, freq)
data.describe(include='all')

Unnamed: 0,Text,label
count,20001,20001
unique,14640,2
top,#NAME?,NAG
freq,38,12179


In [None]:
# Take a random sample of 2000 rows; random_state=42 fixes the seed so the same rows are drawn each run
data_subset = data.sample(n=2000, random_state=42)

In [None]:
# Materialise the 'Text' and 'label' columns of the sampled subset as plain Python lists
texts, labels = data_subset['Text'].tolist(), data_subset['label'].tolist()


In [None]:
# Hold out 20% of the sampled rows for testing; the fixed random_state keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fetch the tokenizer for the 'roberta-base' checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Encode both splits: truncation and padding on, capped at 128 tokens
train_encodings = tokenizer(
    train_texts,
    max_length=128,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer(
    test_texts,
    max_length=128,
    truncation=True,
    padding=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Convert string labels to integer class-id tensors: 'AG' -> 1, anything else -> 0.
# The cell rebinds train_labels/test_labels in place, so without the guards a second
# run would iterate over the tensors themselves, find no element equal to 'AG', and
# silently relabel every row as 0. The is_tensor checks make the cell safe to re-run.
if not torch.is_tensor(train_labels):
    train_labels = torch.tensor([1 if label == 'AG' else 0 for label in train_labels])
if not torch.is_tensor(test_labels):
    test_labels = torch.tensor([1 if label == 'AG' else 0 for label in test_labels])


In [None]:
# Training set: bundle token ids, attention masks and labels into one TensorDataset
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    train_labels,
)

In [None]:
# Test set: same three-tensor layout as the training dataset
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    test_labels,
)


In [None]:
# Load the pretrained 'roberta-base' weights with a 2-class sequence-classification head.
# Per the recorded output, the classifier layers are newly initialised, so the model
# must be fine-tuned before its predictions are meaningful.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define optimizer (learning rate = 5e-5)
# Uses PyTorch's own AdamW instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers class used by
# default, so the update rule stays the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)




In [None]:
# Wrap both datasets in batch iterators of size 4; only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=4,
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=4,
)


In [None]:
# Training loop: fine-tune RoBERTa with one optimizer step per training batch
model.train()  # switch the model to training mode
for epoch in range(10):  # You can adjust the number of epochs
    # Batch-local names are used on purpose: unpacking into `labels` would overwrite
    # the notebook-level `labels` list that the train/test split cell reads, breaking
    # that cell if it is re-run after training.
    for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        # Passing labels makes the model return the loss alongside the logits
        loss = outputs.loss
        loss.backward()
        optimizer.step()


In [None]:

# Evaluation: collect predicted and true class ids for every test batch
model.eval()  # switch the model to evaluation mode
predictions = []
true_labels = []
# No gradients are needed for inference, so the whole loop runs under no_grad
with torch.no_grad():
    # Batch-local names avoid overwriting the notebook-level `labels` list
    for batch_input_ids, batch_attention_mask, batch_labels in test_loader:
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())


In [None]:
# Calculate evaluation metrics for the RoBERTa predictions (positive class = 1, i.e. 'AG')
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
# Listing both class ids keeps the matrix 2x2 (rows = true, columns = predicted)
# even when one class is missing from the labels or the predictions; without it
# the matrix shrinks to the classes that happen to appear.
conf_matrix = confusion_matrix(true_labels, predictions, labels=[0, 1])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [None]:

# Print each RoBERTa test-set metric on its own line, then the confusion matrix
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)
print("Confusion Matrix:")
print(conf_matrix)

NameError: name 'accuracy' is not defined

# Inference : distilbert-base Model


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd


In [None]:
# Read the full cyber-aggression CSV from Google Drive; later cells use its 'Text' and 'label' columns
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset for Detection of Cyber-Aggression.csv')

Unnamed: 0,Text,label
0,Get fucking real dude.,AG
1,She is as dirty as they come and that crook ...,AG
2,why did you fuck it up. I could do it all day...,AG
3,Dude they dont finish enclosing the fucking s...,AG
4,WTF are you talking about Men? No men thats n...,AG
...,...,...
19996,I dont. But what is complaining about it goi...,NAG
19997,Bahah yeah i&;m totally just gonna&; get pis...,NAG
19998,hahahahaha >:) im evil mwahahahahahahahahaha,NAG
19999,What&;s something unique about Ohio? :),NAG


In [None]:
# Take a random sample of only 200 rows; random_state=42 fixes the seed so the same rows are drawn each run
data_subset = data.sample(n=200, random_state=42)

In [None]:
# Materialise the 'Text' and 'label' columns of the sampled subset as plain Python lists
texts, labels = data_subset['Text'].tolist(), data_subset['label'].tolist()


In [None]:
# Hold out 20% of the sampled rows for testing; the fixed random_state keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Load the pretrained tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
# Note: this rebinds `tokenizer`, which the RoBERTa section also used.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:

# Encode both splits with the DistilBERT tokenizer: truncation and padding on, capped at 128 tokens
train_encodings = tokenizer(
    train_texts,
    max_length=128,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer(
    test_texts,
    max_length=128,
    truncation=True,
    padding=True,
)


In [None]:

# Convert string labels to integer class-id tensors: 'AG' -> 1, anything else -> 0.
# The cell rebinds train_labels/test_labels in place, so without the guards a second
# run would iterate over the tensors themselves, find no element equal to 'AG', and
# silently relabel every row as 0. The is_tensor checks make the cell safe to re-run.
if not torch.is_tensor(train_labels):
    train_labels = torch.tensor([1 if label == 'AG' else 0 for label in train_labels])
if not torch.is_tensor(test_labels):
    test_labels = torch.tensor([1 if label == 'AG' else 0 for label in test_labels])


In [None]:

# Training set: bundle token ids, attention masks and labels into one TensorDataset
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    train_labels,
)

In [None]:
# Test set: same three-tensor layout as the training dataset
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    test_labels,
)


In [None]:

# Load the pretrained 'distilbert-base-uncased' weights with a sequence-classification head.
# num_labels=2 is stated explicitly so the head size visibly matches the binary
# 'AG' / not-'AG' labels and the BERT and RoBERTa loading cells, instead of relying
# on the checkpoint's config default.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Define optimizer (learning rate = 5e-5)
# Uses PyTorch's own AdamW instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers class used by
# default, so the update rule stays the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)




In [None]:

# Wrap both datasets in batch iterators of size 8; only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=8,
)


In [None]:

# Training loop: fine-tune DistilBERT with one optimizer step per training batch
model.train()  # switch the model to training mode
for epoch in range(8):  # You can adjust the number of epochs
    # Batch-local names are used on purpose: unpacking into `labels` would overwrite
    # the notebook-level `labels` list that the train/test split cell reads, breaking
    # that cell if it is re-run after training.
    for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        # Passing labels makes the model return the loss alongside the logits
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation: collect predicted and true class ids for every test batch
model.eval()  # switch the model to evaluation mode
predictions = []
true_labels = []
# No gradients are needed for inference, so the whole loop runs under no_grad
with torch.no_grad():
    # Batch-local names avoid overwriting the notebook-level `labels` list
    for batch_input_ids, batch_attention_mask, batch_labels in test_loader:
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())


In [None]:
# Calculate evaluation metrics for the DistilBERT predictions (positive class = 1, i.e. 'AG')
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
# Listing both class ids keeps the matrix 2x2 (rows = true, columns = predicted)
# even when one class is missing from the labels or the predictions; without it
# the matrix shrinks to the classes that happen to appear.
conf_matrix = confusion_matrix(true_labels, predictions, labels=[0, 1])


In [None]:
# Print the DistilBERT header, each test-set metric on its own line, then the confusion matrix
print("DistilBERT Model Performance:")
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)
print("Confusion Matrix:")
print(conf_matrix)


DistilBERT Model Performance:
Accuracy: 0.775
Precision: 0.8333333333333334
Recall: 0.5882352941176471
F1 Score: 0.6896551724137931
Confusion Matrix:
[[21  2]
 [ 7 10]]
