Training Code for NLP Cybercrime Classification Model

In [None]:
import pandas as pd
import torch
import re
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, log_loss, balanced_accuracy_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import nltk
from nltk.corpus import wordnet
import random
import numpy as np

Text preprocessing and resampling of minority class

In [None]:
device = torch.device("cuda")

# Load dataset
data = pd.read_csv('updated_data_train.csv')

# Encode labels for category and subcategory
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()
data['category'] = category_encoder.fit_transform(data['category'])
data['sub-category'] = subcategory_encoder.fit_transform(data['sub_category'])

# Combine category and subcategory into a single label for resampling
data['combined_label'] = data['category'].astype(str) + "_" + data['sub-category'].astype(str)

# Encode the combined labels
combined_encoder = LabelEncoder()
data['combined_label'] = combined_encoder.fit_transform(data['combined_label'])

# Separate features (X) and target (y)
X = data['processed_text']
y = data['combined_label']

# Function to remove URLs, email addresses, and phone numbers
def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)                 # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)                        # Remove email addresses
    text = re.sub(r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '', text)  # Remove phone numbers
    return text

# Apply the clean_text function to the 'text' column
X = X.apply(lambda x: clean_text(str(x)))

# Resample to balance the classes
oversample = RandomOverSampler(sampling_strategy='minority')
X_resampled, y_resampled = oversample.fit_resample(X.values.reshape(-1, 1), y)

# Convert resampled data back to original format
X_resampled = X_resampled.flatten()
y_resampled = combined_encoder.inverse_transform(y_resampled)

# Split the combined labels back into category and subcategory
y_resampled_category, y_resampled_subcategory = zip(*[map(int, label.split("_")) for label in y_resampled])

Data Augmentation to handle class imbalance

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

# Synonym replacement function
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)

    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # Replace only 'n' words
            break

    sentence = ' '.join(new_words)
    return sentence

# Applying synonym replacement to increase the dataset size
augmented_texts = []
augmented_labels = []

for text, (cat_label, subcat_label) in zip(X_resampled, zip(y_resampled_category, y_resampled_subcategory)):
    # Add the original text and label
    augmented_texts.append(text)
    augmented_labels.append((cat_label, subcat_label))

    # Generate additional augmented samples
    for _ in range(3):  # Adding 3 variations for each sample
        augmented_texts.append(synonym_replacement(text, n=2))
        augmented_labels.append((cat_label, subcat_label))

# Convert lists back to numpy arrays
X_augmented = np.array(augmented_texts)
y_augmented = np.array(augmented_labels)

# Continue with train-test split using augmented data
train_texts, test_val_texts, train_labels, test_val_labels = train_test_split(
    X_augmented,
    y_augmented,
    test_size=0.4,
    random_state=42
)

# Split test and validation sets
test_texts, val_texts, test_labels, val_labels = train_test_split(
    test_val_texts,
    test_val_labels,
    test_size=0.5,
    random_state=42
)

# Set up BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        category_label, subcategory_label = self.labels[idx]

        # Tokenize the text
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        inputs = {key: val.squeeze(0) for key, val in inputs.items() if key in ['input_ids', 'attention_mask']}
        return inputs, torch.tensor(category_label), torch.tensor(subcategory_label)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Model Definition and Training Code

In [None]:
# Create DataLoaders
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

# Define the model
class MultiOutputBERT(nn.Module):
    def __init__(self, num_categories, num_subcategories):
        super(MultiOutputBERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.category_classifier = nn.Linear(self.bert.config.hidden_size, num_categories)
        self.subcategory_classifier = nn.Linear(self.bert.config.hidden_size, num_subcategories)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        category_logits = self.category_classifier(pooled_output)
        subcategory_logits = self.subcategory_classifier(pooled_output)
        return category_logits, subcategory_logits

# Initialize model, loss, and optimizer
num_categories = len(category_encoder.classes_)
num_subcategories = len(subcategory_encoder.classes_)
model = MultiOutputBERT(num_categories, num_subcategories)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training and validation loop
epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, category_labels, subcategory_labels = batch
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        category_labels = category_labels.to(device)
        subcategory_labels = subcategory_labels.to(device)

        optimizer.zero_grad()

        category_logits, subcategory_logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss1 = criterion(category_logits, category_labels)
        loss2 = criterion(subcategory_logits, subcategory_labels)
        loss = loss1 + loss2
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader)}")

torch.save(model, "model.pt")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Epoch 1, Training Loss: 1.5853315856086354
Epoch 2, Training Loss: 1.2166699695430965


Classification Results for 20% validation set out of 60-20-20 train-test-validate split

In [None]:
model = torch.load("/content/meow/model.pt")
# Validation phase
model.eval()
category_preds = []
subcategory_preds = []
category_probs = []
subcategory_probs = []
category_true = []
subcategory_true = []

with torch.no_grad():
    for batch in val_loader:
        inputs, category_labels, subcategory_labels = batch
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        category_labels = category_labels.to(device)
        subcategory_labels = subcategory_labels.to(device)

        category_logits, subcategory_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Store probabilities and predictions
        category_probs.extend(torch.softmax(category_logits, dim=1).cpu().numpy())
        subcategory_probs.extend(torch.softmax(subcategory_logits, dim=1).cpu().numpy())
        category_preds.extend(torch.argmax(category_logits, dim=1).cpu().numpy())
        subcategory_preds.extend(torch.argmax(subcategory_logits, dim=1).cpu().numpy())
        category_true.extend(category_labels.cpu().numpy())
        subcategory_true.extend(subcategory_labels.cpu().numpy())

# Calculate validation metrics for categories and subcategories
category_accuracy = accuracy_score(category_true, category_preds)
subcategory_accuracy = accuracy_score(subcategory_true, subcategory_preds)
category_f1 = f1_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_f1 = f1_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)
category_precision = precision_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_precision = precision_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)
category_recall = recall_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_recall = recall_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)

# Log Loss
category_log_loss = log_loss(category_true, category_probs)
subcategory_log_loss = log_loss(subcategory_true, subcategory_probs)

# AUC-ROC
category_auc_roc = roc_auc_score(category_true, category_probs, multi_class='ovr')
subcategory_auc_roc = roc_auc_score(subcategory_true, subcategory_probs, multi_class='ovr')

# Balanced Accuracy
category_balanced_accuracy = balanced_accuracy_score(category_true, category_preds)
subcategory_balanced_accuracy = balanced_accuracy_score(subcategory_true, subcategory_preds)

# Confusion Matrices
category_conf_matrix = confusion_matrix(category_true, category_preds)
subcategory_conf_matrix = confusion_matrix(subcategory_true, subcategory_preds)

print(f"Validation - Category Accuracy: {category_accuracy}, F1: {category_f1}, Precision: {category_precision}, Recall: {category_recall}")
print(f"Validation - Subcategory Accuracy: {subcategory_accuracy}, F1: {subcategory_f1}, Precision: {subcategory_precision}, Recall: {subcategory_recall}")
print(f"Log Loss - Category: {category_log_loss}, Subcategory: {subcategory_log_loss}")
print(f"AUC-ROC - Category: {category_auc_roc}, Subcategory: {subcategory_auc_roc}")
print(f"Balanced Accuracy - Category: {category_balanced_accuracy}, Subcategory: {subcategory_balanced_accuracy}")
print("Confusion Matrix - Category:\n", category_conf_matrix)
print("Confusion Matrix - Subcategory:\n", subcategory_conf_matrix)


  model = torch.load("/content/meow/model.pt")


Validation - Category Accuracy: 0.8985849056603774, F1: 0.8925173797105401, Precision: 0.8938477911289812, Recall: 0.8985849056603774
Validation - Subcategory Accuracy: 0.7377577885037298, F1: 0.7198973416175383, Precision: 0.7542550760035994, Recall: 0.7377577885037298
Log Loss - Category: 0.3014290409493987, Subcategory: 0.8420727281035051
AUC-ROC - Category: 0.9586929839183089, Subcategory: 0.9590939493587596
Balanced Accuracy - Category: 0.5428713154228575, Subcategory: 0.3049559286814776
Confusion Matrix - Category:
 [[ 4891    18     0     0   152     0  2716     5   924     0     2]
 [   21   279     0     0     0     0    65     0     6     0     0]
 [    0     0  2817     0     0     0     0     0     0     0     0]
 [   54     0     0     0    13     0    35     0    27     0     0]
 [  135     0     0     0   849     0   200     0   213     2     0]
 [   21     2     0     0     6     0    70     1    50     0     0]
 [ 1144    22     0     0   102     0 44137     4   487   

In [None]:
print(model)

MultiOutputBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

Classification Results for 20% test set out of 60-20-20 train-test-validate split

In [None]:
model = torch.load("/content/meow/model.pt")
# Validation phase
model.eval()
category_preds = []
subcategory_preds = []
category_probs = []
subcategory_probs = []
category_true = []
subcategory_true = []

with torch.no_grad():
    for batch in test_loader:
        inputs, category_labels, subcategory_labels = batch
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        category_labels = category_labels.to(device)
        subcategory_labels = subcategory_labels.to(device)

        category_logits, subcategory_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Store probabilities and predictions
        category_probs.extend(torch.softmax(category_logits, dim=1).cpu().numpy())
        subcategory_probs.extend(torch.softmax(subcategory_logits, dim=1).cpu().numpy())
        category_preds.extend(torch.argmax(category_logits, dim=1).cpu().numpy())
        subcategory_preds.extend(torch.argmax(subcategory_logits, dim=1).cpu().numpy())
        category_true.extend(category_labels.cpu().numpy())
        subcategory_true.extend(subcategory_labels.cpu().numpy())

# Calculate validation metrics for categories and subcategories
category_accuracy = accuracy_score(category_true, category_preds)
subcategory_accuracy = accuracy_score(subcategory_true, subcategory_preds)
category_f1 = f1_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_f1 = f1_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)
category_precision = precision_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_precision = precision_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)
category_recall = recall_score(category_true, category_preds, average='weighted', zero_division=1)
subcategory_recall = recall_score(subcategory_true, subcategory_preds, average='weighted', zero_division=1)

# Log Loss
category_log_loss = log_loss(category_true, category_probs)
subcategory_log_loss = log_loss(subcategory_true, subcategory_probs)

# AUC-ROC
category_auc_roc = roc_auc_score(category_true, category_probs, multi_class='ovr')
subcategory_auc_roc = roc_auc_score(subcategory_true, subcategory_probs, multi_class='ovr')

# Balanced Accuracy
category_balanced_accuracy = balanced_accuracy_score(category_true, category_preds)
subcategory_balanced_accuracy = balanced_accuracy_score(subcategory_true, subcategory_preds)

# Confusion Matrices
category_conf_matrix = confusion_matrix(category_true, category_preds)
subcategory_conf_matrix = confusion_matrix(subcategory_true, subcategory_preds)

print(f"Validation - Category Accuracy: {category_accuracy}, F1: {category_f1}, Precision: {category_precision}, Recall: {category_recall}")
print(f"Validation - Subcategory Accuracy: {subcategory_accuracy}, F1: {subcategory_f1}, Precision: {subcategory_precision}, Recall: {subcategory_recall}")
print(f"Log Loss - Category: {category_log_loss}, Subcategory: {subcategory_log_loss}")
print(f"AUC-ROC - Category: {category_auc_roc}, Subcategory: {subcategory_auc_roc}")
print(f"Balanced Accuracy - Category: {category_balanced_accuracy}, Subcategory: {subcategory_balanced_accuracy}")
print("Confusion Matrix - Category:\n", category_conf_matrix)
print("Confusion Matrix - Subcategory:\n", subcategory_conf_matrix)


  model = torch.load("/content/meow/model.pt")


Validation - Category Accuracy: 0.898376480912681, F1: 0.8925044351259779, Precision: 0.8938470296442621, Recall: 0.898376480912681
Validation - Subcategory Accuracy: 0.7349714787187362, F1: 0.7180039240586884, Precision: 0.7521346032521543, Recall: 0.7349714787187362
Log Loss - Category: 0.3032438259251776, Subcategory: 0.8460257920595695
AUC-ROC - Category: 0.9554211161874591, Subcategory: 0.9583408718404176
Balanced Accuracy - Category: 0.5512910301171433, Subcategory: 0.30660868940880953
Confusion Matrix - Category:
 [[ 4966    19     0     0   172     0  2744     0   873     0     1]
 [   17   302     0     0     0     0    68     2     5     0     0]
 [    0     0  2892     0     0     0     0     0     0     0     0]
 [   47     0     0     0    14     0    29     1    32     0     0]
 [  135     1     0     0   800     0   197     0   221     2     0]
 [   20     3     0     0    11     0    69     0    44     0     0]
 [ 1137    31     0     0   122     0 44201     9   487    