In [None]:
!pip install torch torchvision torchaudio
!pip install scikit-learn
!pip install nltk




In [None]:
# Import Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# 1. Load Dataset
# Pick the dataset CSV in the Colab upload dialog; the uploaded file is then read
# back by name from the working directory.
from google.colab import files
uploaded = files.upload()

# With nothing uploaded the `[0]` lookup used to fail with an opaque IndexError;
# stop here with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

csv_name = next(iter(uploaded))  # first (normally the only) uploaded file name
df = pd.read_csv(csv_name)
df.head()


Saving test.csv to test.csv
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Online Financial Fraud   
2             Cyber Attack/ Dependent Crimes   
3                     Online Financial Fraud   
4                      Any Other Cyber Crime   

                           sub_category  \
0                                   NaN   
1  DebitCredit Card FraudSim Swap Fraud   
2                         SQL Injection   
3                     Fraud CallVishing   
4                                 Other   

                                  crimeaditionalinfo  
0  Sir namaskar  mein Ranjit Kumar PatraPaise neh...  
1          KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT  
2  The issue actually started when I got this ema...  
3  I am amit kumar from karwi chitrakoot I am tot...  
4  I have ordered  saree and  blouse from rinki s...  


In [None]:
# 2. Preprocess Data
# Keep only rows that have both a complaint text and a category. Casting a missing
# text with astype(str) would otherwise turn it into the literal word "nan", which
# is then tokenized and learned as if it were real content; a missing category
# would mix floats with strings in the label list.
labeled_rows = df.dropna(subset=['crimeaditionalinfo', 'category'])
texts = labeled_rows['crimeaditionalinfo'].astype(str).tolist()
labels = labeled_rows['category'].tolist()


In [None]:
# Encode labels
# Learn the category -> integer id mapping, then replace the string labels with
# their ids. `label_encoder` is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)
num_classes = label_encoder.classes_.shape[0]


In [None]:
# Fetch the 'punkt_tab' tokenizer resource before the tokenization step below.
# NOTE(review): presumably the installed NLTK's word_tokenize needs this in addition
# to the 'punkt' package downloaded with the imports — confirm against the NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Tokenize
# Lower-case every text and split it into word tokens (one token list per text).
tokenized_texts = list(map(lambda text: word_tokenize(text.lower()), texts))


In [None]:
# Build Vocab
# Flatten all token lists, rank the distinct words by frequency and give them
# consecutive ids starting at 2; ids 0 and 1 are reserved for padding and
# unknown words.
all_words = [token for tokens in tokenized_texts for token in tokens]
ranked_words = Counter(all_words).most_common()
vocab = dict(zip((word for word, _ in ranked_words), range(2, len(ranked_words) + 2)))
vocab.update({'<PAD>': 0, '<UNK>': 1})


In [None]:
# Encode Text
def encode(text):
    """Map a list of tokens to vocabulary ids, using the '<UNK>' id for unseen words."""
    ids = []
    for word in text:
        ids.append(vocab.get(word, vocab['<UNK>']))
    return ids

encoded_texts = list(map(encode, tokenized_texts))

In [None]:
# Pad Sequences
max_len = 100  # you can adjust
def pad(seq, max_len):
    """Return `seq` cut or right-filled with 0 so that it has exactly `max_len` items.

    Sequences that are already long enough are truncated to their first
    `max_len` items; shorter ones get trailing zeros appended.
    """
    shortfall = max_len - len(seq)
    if shortfall <= 0:
        return seq[:max_len]
    return seq + [0] * shortfall

# Bring every encoded text to exactly max_len ids.
padded_texts = list(map(lambda ids: pad(ids, max_len), encoded_texts))


In [None]:
# Train/Test Split
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_texts,
    labels,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Convert to Tensor
# Token ids and class ids are pinned to int64 explicitly, so the dtype no longer
# depends on what the list/NumPy conversion happens to infer (CrossEntropyLoss
# needs int64 class indices). torch.as_tensor also makes the cell safe to re-run:
# an input that is already a LongTensor is passed through instead of triggering
# the copy-construct warning that torch.tensor(tensor) emits.
X_train = torch.as_tensor(X_train, dtype=torch.long)
X_test = torch.as_tensor(X_test, dtype=torch.long)
y_train = torch.as_tensor(y_train, dtype=torch.long)
y_test = torch.as_tensor(y_test, dtype=torch.long)


In [None]:
# 3. Create Dataset and Dataloader
class CybercrimeDataset(Dataset):
    """Pairs each encoded text with its class id for use with a DataLoader.

    Args:
        X: indexable collection of samples (e.g. a tensor of padded token ids).
        y: indexable collection of targets, one per sample in `X`.

    Raises:
        ValueError: if `X` and `y` do not contain the same number of items.
    """

    def __init__(self, X, y):
        # A mismatch would otherwise surface only as an IndexError part-way through
        # an epoch, or silently ignore the extra labels.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the `(sample, target)` pair stored at position `idx`."""
        return self.X[idx], self.y[idx]

# Wrap both splits and batch them 32 samples at a time; only the training
# batches are reshuffled each epoch, the test order stays fixed.
train_dataset, test_dataset = (
    CybercrimeDataset(X_train, y_train),
    CybercrimeDataset(X_test, y_test),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# 4. Build TextCNN Model
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, several convolutions with different window heights
    slide over the token axis in parallel, each feature map is max-pooled over
    time, and the concatenated features go through dropout and a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        num_classes: number of output logits.
        kernel_sizes: window heights in tokens; one convolution is built per entry.
        num_filters: output channels of each convolution.
        dropout: drop probability applied to the pooled features.
        padding_idx: id of the padding token. Its embedding starts at zero and
            receives no gradient, so padding does not act as a learned word.
            Pass None to train that row like any other.

    Raises:
        ValueError: if `kernel_sizes` is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=(3, 4, 5),
                 num_filters=100, dropout=0.5, padding_idx=0):
        super().__init__()
        # Immutable default plus a local copy: accepts any iterable and never
        # shares a mutable default list between instances.
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one window size")

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        # Each kernel spans the full embedding width, so it only slides along tokens.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for token ids `x` of shape (batch, seq_len).

        `seq_len` must be at least the largest kernel size.
        """
        x = self.embedding(x)   # (batch, seq_len, embed_dim)
        x = x.unsqueeze(1)       # (batch, 1, seq_len, embed_dim)
        # Per kernel size: (batch, num_filters, seq_len - k + 1) after dropping the width-1 axis.
        feature_maps = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over time keeps the strongest response of every filter.
        pooled = [fmap.max(dim=2).values for fmap in feature_maps]
        features = self.dropout(torch.cat(pooled, dim=1))
        logits = self.fc(features)
        return logits

In [None]:
# Instantiate Model
# Seed PyTorch before building the model so the initial weights (and the shuffle
# order the DataLoader draws from the default generator) repeat from run to run on
# the same setup; 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextCNN(vocab_size=len(vocab), embed_dim=100, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 5. Train Model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0  # running sum of per-sample losses for this epoch
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        batch_size = y_batch.size(0)
        # criterion returns the batch mean; weight it by the batch size so a short
        # final batch does not count as much as a full one in the epoch average.
        epoch_loss += loss.item() * batch_size
        # argmax on the detached logits replaces the discouraged `.data` access.
        predicted = outputs.detach().argmax(dim=1)
        total += batch_size
        correct += (predicted == y_batch).sum().item()

    # Accuracy here is measured in train mode (dropout active), on the batches as they were seen.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/total:.4f}, Accuracy: {100*correct/total:.2f}%')

Epoch 1/10, Loss: 0.9893, Accuracy: 68.38%
Epoch 2/10, Loss: 0.8493, Accuracy: 71.86%
Epoch 3/10, Loss: 0.8036, Accuracy: 73.21%
Epoch 4/10, Loss: 0.7416, Accuracy: 74.96%
Epoch 5/10, Loss: 0.6786, Accuracy: 76.57%
Epoch 6/10, Loss: 0.6021, Accuracy: 79.47%
Epoch 7/10, Loss: 0.5225, Accuracy: 81.94%
Epoch 8/10, Loss: 0.4425, Accuracy: 84.89%
Epoch 9/10, Loss: 0.3649, Accuracy: 87.68%
Epoch 10/10, Loss: 0.3093, Accuracy: 89.48%


In [None]:
# 6. Evaluate Model
# Score the held-out split with dropout disabled and gradients switched off.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        # Index of the largest logit in each row is the predicted class id.
        _, predicted = outputs.data.max(dim=1)
        correct += predicted.eq(y_batch).sum().item()
        total += y_batch.size(0)

print(f'Test Accuracy: {100*correct/total:.2f}%')

# Save the Label Encoder if needed
# (it holds the class-id -> category-name mapping needed to decode predictions)
import pickle
with open('label_encoder.pkl', mode='wb') as f:
    pickle.dump(label_encoder, f)


Test Accuracy: 72.53%


In [None]:
# 5. Train Model (separate 5-epoch run)
# Start from freshly initialised weights and a fresh optimizer so this is an
# independent 5-epoch experiment. Without the reset, running the notebook top to
# bottom would simply continue training the model already fitted for 10 epochs
# above, which is not what the recorded output of this cell shows (its first
# epoch starts at the loss of an untrained model).
model = TextCNN(vocab_size=len(vocab), embed_dim=100, num_classes=num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0  # running sum of per-sample losses for this epoch
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        batch_size = y_batch.size(0)
        # criterion returns the batch mean; weight it by the batch size so a short
        # final batch does not count as much as a full one in the epoch average.
        epoch_loss += loss.item() * batch_size
        # argmax on the detached logits replaces the discouraged `.data` access.
        predicted = outputs.detach().argmax(dim=1)
        total += batch_size
        correct += (predicted == y_batch).sum().item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/total:.4f}, Accuracy: {100*correct/total:.2f}%')

Epoch 1/5, Loss: 0.9914, Accuracy: 68.49%
Epoch 2/5, Loss: 0.8477, Accuracy: 71.93%
Epoch 3/5, Loss: 0.7916, Accuracy: 73.29%
Epoch 4/5, Loss: 0.7334, Accuracy: 74.92%
Epoch 5/5, Loss: 0.6612, Accuracy: 77.32%


In [None]:
# 6. Evaluate Model
# Accuracy of the current model on the held-out split (inference mode, no gradients).
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(X_batch)
        # The position of the highest logit per row is the predicted class id.
        _, predicted = torch.max(outputs.data, dim=1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Test Accuracy: {100*correct/total:.2f}%')

Test Accuracy: 74.03%


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# 6. Evaluate Model
# Accuracy plus support-weighted precision / recall / F1 on the held-out split.
model.eval()
correct = 0
total = 0
all_preds = []   # predicted class id for every test sample, in loader order
all_labels = []  # true class id for every test sample, in loader order

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Inputs and targets must be on the same device as the model.
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        predicted = outputs.argmax(dim=1)

        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

        # Collect plain Python ints for scikit-learn.
        all_preds.extend(predicted.cpu().tolist())
        all_labels.extend(y_batch.cpu().tolist())

test_accuracy = 100 * correct / total
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Calculate Precision, Recall, F1-Score
# Some rare classes are never predicted, which makes their precision undefined.
# zero_division=0 scores them as 0 (the value the default already falls back to)
# without emitting an UndefinedMetricWarning for every call.
precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}')
print(f'Recall:    {recall:.4f}')
print(f'F1 Score:  {f1:.4f}')


Test Accuracy: 74.03%
Precision: 0.7112
Recall:    0.7403
F1 Score:  0.7120


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import classification_report

# After prediction step
# target_names needs one display name per class. `labels` is the per-sample array
# of encoded ids, which is why passing it raised "Number of classes ... does not
# match size of target_names"; the class names live in the fitted label encoder.
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    all_labels,
    all_preds,
    labels=class_ids,  # keeps names aligned even if a class is absent from the evaluated samples
    target_names=label_encoder.classes_.tolist(),
    zero_division=0,   # undefined per-class metrics are reported as 0 without warnings
))


ValueError: Number of classes, 15, does not match size of target_names, 31229. Try specifying the labels parameter

In [None]:
from sklearn.metrics import classification_report

# Get the unique class names from the label encoder
target_names = label_encoder.classes_.tolist()

# After prediction step
# Passing the full list of class ids keeps every row aligned with its name even
# when a class is missing from both the true and predicted labels, and
# zero_division=0 reports undefined per-class metrics as 0 instead of emitting an
# UndefinedMetricWarning for each of them.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.32      0.27      0.29       687
Child Pornography CPChild Sexual Abuse Material CSAM       0.67      0.15      0.25        26
                      Crime Against Women & Children       0.00      0.00      0.00         2
                                Cryptocurrency Crime       0.55      0.19      0.29        31
                      Cyber Attack/ Dependent Crimes       0.99      1.00      1.00       267
                                     Cyber Terrorism       0.00      0.00      0.00        15
      Hacking  Damage to computercomputer system etc       0.45      0.04      0.07       126
                            Online Cyber Trafficking       0.00      0.00      0.00        12
                              Online Financial Fraud       0.83      0.91      0.87      3787
                            Online Gambling  Betting       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
