In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, RobertaTokenizer, DistilBertTokenizer
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
# Path to the labelled scam dataset (CSV). Change this to point at your copy;
# it is the single place the file location is configured.
dataset_path = 'ScamDataset.csv'


# Fetch the NLTK resources needed by the preprocessing step.
nltk.download('punkt')
# Newer NLTK releases load the word tokenizer from 'punkt_tab' instead of
# 'punkt'; requesting both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset. Column names are supplied explicitly, so the first line of the
# file is read as a data row rather than as a header.
data = pd.read_csv(dataset_path, delimiter=',', names=['message', 'Label'])


# Shape of the dataset before removing duplicates
original_shape = data.shape

# Removing duplicate rows
data = data.drop_duplicates()

# Shape after removing duplicates
new_shape = data.shape

# A bare expression in the middle of a cell is not displayed, so report explicitly.
print(f"Shape before/after removing duplicates: {original_shape} -> {new_shape}")

# Shared helpers used by preprocess(): a Porter stemmer and the English stop-word set.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Reduce one message to a space-separated string of stemmed content words.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not purely
    alphabetic, or that are English stop words, are discarded; every remaining
    token is replaced by its Porter stem.

    Args:
        text: Raw message text. Non-string values (for example the NaN that
            pandas uses for an empty cell) are treated as an empty message.

    Returns:
        str: The surviving stems joined by single spaces, or ``''`` when no
        token survives.
    """
    # Missing cells arrive as float NaN, which word_tokenize cannot handle.
    if not isinstance(text, str):
        return ''

    stemmed = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        # The stop-word set is lower-case, so compare case-insensitively;
        # otherwise capitalised stop words such as "The" would be kept.
        if word.isalpha() and word.lower() not in stop_words
    ]
    return ' '.join(stemmed)

# Clean every message; the cleaned text is what gets tokenized for the models.
data['processed_message'] = data['message'].apply(preprocess)


# Encode the string labels as integer class ids (normal -> 0, fraud -> 1).
data['Label'] = data['Label'].map({'normal': 0, 'fraud': 1})

# Any value other than 'normal' / 'fraud' (e.g. a header line read as data, or
# an empty cell) maps to NaN, which cannot be turned into an integer class id
# later on. Drop such rows here and say so, instead of failing during training.
unmapped_rows = data['Label'].isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} row(s) with an unrecognised label")
    data = data.loc[~unmapped_rows].copy()
data['Label'] = data['Label'].astype(int)

texts = data['processed_message'].tolist()
labels = data['Label'].tolist()
# Split the dataset. The fixed random_state makes the split (and therefore the
# comparison between models) reproducible across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [None]:
class ScamDataset(Dataset):
    """Map-style torch dataset pairing tokenized messages with their labels."""

    def __init__(self, texts, labels, tokenizer):
        """Tokenize all texts once and keep the labels alongside.

        Args:
            texts: The messages to encode, passed to ``tokenizer`` in one call.
            labels: One label per text; each is converted with ``int()`` when
                an item is fetched.
            tokenizer: Callable invoked as
                ``tokenizer(texts, truncation=True, padding=True, max_length=128)``
                whose result supports ``.items()`` and yields per-example
                sequences indexable by position.
        """
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            max_length=128,
            padding=True,
            truncation=True,
        )

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample


def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: An object that unpacks into two items: the per-class scores
            and the reference labels.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the label.
    """
    scores, targets = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    hits = predicted_classes == targets
    return {"accuracy": hits.mean()}

# Define function for training and evaluation
def train_and_evaluate(model_name):
    """Fine-tune one pretrained transformer on the scam data and evaluate it.

    The training and validation examples are taken from the notebook-level
    ``train_texts`` / ``train_labels`` and ``val_texts`` / ``val_labels``.

    Args:
        model_name: Which architecture to use: ``'bert'``, ``'roberta'`` or
            ``'distilbert'``.

    Returns:
        The metrics dict produced by ``Trainer.evaluate()`` on the validation
        dataset (callers in this notebook read its ``'eval_accuracy'`` entry).

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    print(f"Training and evaluating {model_name}...")

    # name -> (tokenizer class, model class, pretrained checkpoint id)
    supported_models = {
        'bert': (BertTokenizer, BertForSequenceClassification, 'bert-base-uncased'),
        'roberta': (RobertaTokenizer, RobertaForSequenceClassification, 'roberta-base'),
        'distilbert': (
            DistilBertTokenizer,
            DistilBertForSequenceClassification,
            'distilbert-base-uncased',
        ),
    }
    if model_name not in supported_models:
        # ValueError is still caught by any caller using `except Exception`.
        raise ValueError("Model not supported")

    tokenizer_cls, model_cls, checkpoint = supported_models[model_name]
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint, num_labels=2)

    train_dataset = ScamDataset(train_texts, train_labels, tokenizer)
    val_dataset = ScamDataset(val_texts, val_labels, tokenizer)

    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy`; adjust the keyword below if
    # TrainingArguments rejects it in your installed version.
    training_args = TrainingArguments(
        # One directory per model so checkpoints from different architectures
        # never end up mixed together or overwrite each other.
        output_dir=f'./results/{model_name}',
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        # Saving on the same schedule as evaluation is what allows the best
        # checkpoint to be reloaded at the end of training.
        save_strategy='epoch',
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return trainer.evaluate()



In [None]:
# First candidate: BERT. This cell also creates the dict in which every
# model's validation accuracy is collected for the comparison further down.
model_name = 'bert'
model_accuracies = {}

eval_results = train_and_evaluate(model_name)
model_accuracies.update({model_name: eval_results['eval_accuracy']})


In [None]:
# Second candidate: RoBERTa. Train, evaluate and record its validation accuracy.
model_name = 'roberta'

eval_results = train_and_evaluate(model_name)
model_accuracies.update({model_name: eval_results['eval_accuracy']})

In [None]:
# Third candidate: DistilBERT. Train, evaluate and record its validation accuracy.
model_name = 'distilbert'

eval_results = train_and_evaluate(model_name)
model_accuracies.update({model_name: eval_results['eval_accuracy']})

In [None]:
# Report the accuracy recorded for each model, one line per model.
print('Model Accuracies:')
for model_name in model_accuracies:
    accuracy = model_accuracies[model_name]
    print(f"{model_name}: {accuracy}")


In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the accuracy recorded for each model.
plot_names = list(model_accuracies.keys())
plot_values = [float(value) for value in model_accuracies.values()]

if not plot_values:
    # No model has been evaluated yet; min()/max() below need at least one value.
    print('No model accuracies to plot.')
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Create a bar graph with distinct colors and edge color
    bars = ax.bar(plot_names, plot_values, color=['blue', 'green', 'red'], edgecolor='black')

    # Zoom the y-axis onto the observed accuracies so small differences stay
    # visible. The window is derived from the data instead of a fixed
    # 0.99-1 range, which would hide every bar below 0.99.
    lowest, highest = min(plot_values), max(plot_values)
    margin = max(highest - lowest, 0.01)
    y_min = max(0.0, lowest - margin)
    y_max = highest + 0.25 * margin  # headroom for the value labels
    label_offset = 0.01 * (y_max - y_min)

    # Add data labels above each bar for clarity
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            height + label_offset,
            f"{height:.3f}",
            ha='center',
            va='bottom',
        )

    ax.set_xlabel('Model', fontsize=14)
    ax.set_ylabel('Accuracy', fontsize=14)
    ax.set_title('Comparison of Model Accuracies on scam Detection', fontsize=16)
    ax.set_ylim(y_min, y_max)  # truncated axis: bar heights are not proportional
    ax.tick_params(axis='both', labelsize=12)

    # Show the plot
    fig.tight_layout()
    plt.show()