In [1]:
!pip install pandas transformers datasets torch scikit-learn




In [2]:
import pandas as pd

# Load the dataset, explicitly specifying the encoding
file_path = "/content/dataset.csv"  # Update with your uploaded file's name
try:
    df = pd.read_csv(file_path, names=["text", "label"], on_bad_lines="skip", encoding='utf-8')
except UnicodeDecodeError:
    try:
        # If UTF-8 fails, try 'latin-1' (or 'ISO-8859-1')
        df = pd.read_csv(file_path, names=["text", "label"], on_bad_lines="skip", encoding='latin-1')
    except UnicodeDecodeError:
        # If 'latin-1' fails, try 'cp1252' (Windows encoding)
        df = pd.read_csv(file_path, names=["text", "label"], on_bad_lines="skip", encoding='cp1252')


# Display the first few rows
print(df.head())

# Preprocess the data
df = df.dropna()  # Drop rows with missing values
df["text"] = df["text"].astype(str)  # Ensure 'text' is a string
df["label"] = df["label"].astype(str)  # Ensure 'label' is a string

                                                text                label
0                                   Case Description  Cybercrime Category
1  I received an email asking for bank details, a...             Phishing
2  મને આ રીતે ઈમેલ મળ્યો કે મારી એકાઉન્ટ ખોટી રીત...             Phishing
3  मुझे बैंक से जुड़ा ईमेल आया और मेरी डिटेल चुरा...             Phishing
4        আমার ব্যাঙ্ক অ্যাকাউন্টের বিশদ চুরি হয়েছে।             Phishing


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

# Save the mapping for future use
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Mapping:", label_mapping)


Label Mapping: {'Adware Attack': 0, 'Bank Loan Fraud': 1, 'Business Email Compromise (BEC)': 2, 'Carding': 3, 'Cryptojacking': 4, 'Cyberbullying': 5, 'Cybercrime Category': 6, 'DDoS Attack': 7, 'Data Breach': 8, 'Digital Kidnapping': 9, 'Fraud': 10, 'Identity Theft': 11, 'Insider Threat': 12, 'IoT Attack': 13, 'Malware Attack': 14, 'Phis    hing': 15, 'Phishing': 16, 'Ransomware': 17, 'Social Engineering': 18, 'Spyware Attack': 19, 'Student Loan Fraud': 20}


In [None]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize the dataset
def tokenize_data(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

tokenized_data = tokenize_data(df["text"].tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_data["input_ids"], df["label_encoded"], test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
train_inputs = torch.stack(X_train)
train_labels = torch.tensor(y_train.values)
test_inputs = torch.stack(X_test)
test_labels = torch.tensor(y_test.values)


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Define Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

train_dataset = Dataset.from_dict({"input_ids": train_inputs.tolist(), "labels": train_labels.tolist()})
test_dataset = Dataset.from_dict({"input_ids": test_inputs.tolist(), "labels": test_labels.tolist()})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
model.save_pretrained("./cybercrime_model")
tokenizer.save_pretrained("./cybercrime_model")
