In [None]:
# Install required libraries
!pip install transformers datasets torch scikit-learn

# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the dataset
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CyberguarAI-Hackathon/train.csv')

# Preprocess text
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on an intermediate
# object, emits a pandas FutureWarning, and will not modify train_df at all
# once copy-on-write becomes the default.
train_df['sub_category'] = train_df['sub_category'].fillna('Unknown')
# Rows without complaint text cannot be tokenized, so drop them.
train_df.dropna(subset=['crimeaditionalinfo'], inplace=True)

# Check the number of instances per class
class_counts = train_df['sub_category'].value_counts()
print("Class counts:\n", class_counts)

# Filter out classes with fewer than 2 samples (the split further down is
# stratified on this column, which cannot work with a single-member class).
valid_classes = class_counts[class_counts > 1].index
# .copy() gives an independent frame rather than a slice of train_df, so new
# columns can be added to it later without a SettingWithCopyWarning.
train_df_filtered = train_df[train_df['sub_category'].isin(valid_classes)].copy()

# Optional: Combine small classes into an 'Other' category
# train_df_filtered['sub_category'] = train_df_filtered['sub_category'].apply(lambda x: x if class_counts[x] > 1 else 'Other')

# Define text cleaning function
def clean_text(text):
    """Normalize a piece of text for tokenization.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Replace (rather than delete) punctuation so adjacent words stay separated.
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Add the cleaned text column. assign() returns a new DataFrame instead of
# writing into what may be a slice of train_df, which avoids pandas'
# SettingWithCopyWarning.
train_df_filtered = train_df_filtered.assign(
    cleaned_info=train_df_filtered['crimeaditionalinfo'].apply(clean_text)
)

# Split the dataset into training and validation sets (stratified to preserve class proportions)
X_train, X_val, y_train, y_val = train_test_split(
    train_df_filtered['cleaned_info'],
    train_df_filtered['sub_category'],
    test_size=0.2,
    stratify=train_df_filtered['sub_category'],  # Stratify to preserve class proportions
    random_state=42  # fixed seed so the split is reproducible
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the text
max_length = 128  # Adjust max length as needed

def encode_data(texts, tokenizer, max_length):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    tokenizer_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': "pt",
    }
    text_batch = list(texts)
    return tokenizer(text_batch, **tokenizer_options)

# Tokenize the training and validation texts with identical settings
train_encodings, val_encodings = (
    encode_data(split_texts, tokenizer, max_length)
    for split_texts in (X_train, X_val)
)

# Encode the labels
# The encoder is fitted on the training labels only; validation labels are
# mapped with that same fitted encoder.
label_encoder = LabelEncoder()
train_label_ids = label_encoder.fit_transform(y_train)
val_label_ids = label_encoder.transform(y_val)
y_train_encoded = torch.tensor(train_label_ids)
y_val_encoded = torch.tensor(val_label_ids)

# Create a custom dataset class
class CyberDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Each item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', taken at the requested index from the stored encodings and
    labels.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict."""
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

train_dataset = CyberDataset(train_encodings, y_train_encoded)
val_dataset = CyberDataset(val_encodings, y_val_encoded)

# Load the BERT model for classification
# One output logit per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_)
)

# Move model to GPU if available
# Done before the optimizer is built so the optimizer is created over the
# parameters on their final device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Define optimizer and loss function
# Use PyTorch's own AdamW: the implementation exported by transformers is
# deprecated in favour of this one. weight_decay=0.0 keeps the "no weight
# decay" behaviour of the previous optimizer (PyTorch's default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)
# NOTE: the training loop reads the loss from the model output (the model is
# called with labels), so loss_fn is currently unused; kept for compatibility.
loss_fn = torch.nn.CrossEntropyLoss()

# DataLoader for batching
# Only the training data is shuffled; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Training loop
# Fine-tunes the model for a fixed number of passes over train_loader and
# prints the mean batch loss after each pass.
epochs = 3
for epoch in range(epochs):
    model.train()  # switch the model to training mode
    total_loss = 0
    for batch in train_loader:
        # Move this batch's tensors to the same device as the model.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Loss = {total_loss / len(train_loader)}")

# Validation
# Runs the model over val_loader without gradient tracking and collects the
# predicted and true class ids for the metrics that follow.
model.eval()  # switch the model to evaluation mode
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # No labels are passed here; only the logits are needed.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        predicted_ids = logits.argmax(dim=1)
        predictions.extend(predicted_ids.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Classification report
# Pass the full list of encoded class ids explicitly. Without it, scikit-learn
# only considers ids that actually occur in true_labels/predictions; if any
# class is missing from both, the report no longer matches target_names (and
# raises), and the confusion matrix no longer lines up with the tick labels.
label_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    true_labels,
    predictions,
    labels=label_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # report undefined metrics as 0 without warning
))

# Compute confusion matrix
# Rows are true classes, columns are predicted classes, both in label_ids order.
cm = confusion_matrix(true_labels, predictions, labels=label_ids)

# Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Plot precision, recall, and F1-score for each class
classes = label_encoder.classes_
# labels= forces one entry per known class, so the metric arrays always have
# the same length as `classes` even when a class never appears in the
# validation labels or predictions (otherwise the bar plots fail on a length
# mismatch). zero_division=0 plots undefined metrics as 0 instead of as a
# perfect 1.0, matching how the classification report treats them.
metrics = precision_recall_fscore_support(
    true_labels,
    predictions,
    labels=np.arange(len(classes)),
    zero_division=0,
)
precision, recall, f1_score = metrics[:3]  # fourth element (support) is not plotted

x = np.arange(len(classes))
width = 0.2  # bar width; the three bars of a class sit side by side

plt.figure(figsize=(15, 7))
plt.bar(x - width, precision, width, label='Precision', color='skyblue')
plt.bar(x, recall, width, label='Recall', color='orange')
plt.bar(x + width, f1_score, width, label='F1-Score', color='green')

plt.xticks(x, classes, rotation=90)
plt.xlabel('Classes')
plt.ylabel('Metrics')
plt.title('Precision, Recall, and F1-Score by Class')
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()

# Plot class distribution of the validation labels
# minlength guarantees one count per class: plain bincount stops at the
# largest id present, so it would be shorter than `classes` (and the bar plot
# would fail) if the highest-numbered classes are absent from true_labels.
class_counts = np.bincount(true_labels, minlength=len(classes))
plt.figure(figsize=(12, 6))
plt.bar(classes, class_counts, color='purple')
plt.xticks(rotation=90)
plt.xlabel('Classes')
plt.ylabel('Number of Instances')
# The counts come from true_labels (the validation split), not the full dataset.
plt.title('Class Distribution in Validation Set')
plt.show()




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['sub_category'].fillna('Unknown', inplace=True)


Class counts:
 sub_category
UPI Related Frauds                                                      26843
Other                                                                   10877
DebitCredit Card FraudSim Swap Fraud                                    10802
Internet Banking Related Fraud                                           8871
Unknown                                                                  6591
Fraud CallVishing                                                        5802
Cyber Bullying  Stalking  Sexting                                        4089
EWallet Related Fraud                                                    4047
FakeImpersonating Profile                                                2299
Profile Hacking Identity Theft                                           2072
Cheating by Impersonation                                                1987
Unauthorised AccessData Breach                                           1114
Online Job Fraud                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_filtered['cleaned_info'] = train_df_filtered['crimeaditionalinfo'].apply(clean_text)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
