In [9]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix, classification_report

In [14]:
dftrans = pd.read_csv('BETTER30.csv')

stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

def clean_text(text):
    if isinstance(text, float):
        return ""  
    text = str(text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.lower()
    tokens = text.split(" ")
    cleaned = [wnl.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(cleaned)

scam_labels = [
    'scam', 'suspicious', 'highly_suspicious', 'slightly_suspicious', 'potential_scam',
    'scam_response', 'citing urgency', 'suggesting a dangerous situation', 'dismissive official protocols'
]
non_scam_labels = [
    'neutral', 'legitimate', 'standard_opening', 'identification_request', 'polite_ending',
    'adhering to protocols', 'emphasizing security and compliance', 'ready for further engagement'
]

def categorize_label(label):
    if label in scam_labels:
        return 0
    elif label in non_scam_labels:
        return 1
    else:
        return None 

dftrans['CONTEXT'] = dftrans['CONTEXT'].fillna('unknown')
dftrans['FEATURES'] = dftrans['FEATURES'].fillna('unknown')
dftrans.dropna(subset=['TEXT', 'LABEL'], inplace=True)

if 'ANNOTATIONS' in dftrans.columns:
    dftrans.drop(columns=['ANNOTATIONS'], inplace=True)

dftrans['TEXT'] = dftrans['TEXT'].apply(clean_text)
dftrans['CONTEXT'] = dftrans['CONTEXT'].apply(clean_text)

dftrans['LABEL'] = dftrans['LABEL'].apply(categorize_label)
dftrans.dropna(subset=['LABEL'], inplace=True)

dftrans['COMBINED'] = dftrans['TEXT'] + " " + dftrans['CONTEXT']

X = dftrans['COMBINED']
y = dftrans['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = RandomForestClassifier(n_estimators=250, random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report)

Accuracy: 0.9117647058823529
F1 Score: 0.9285714285714286
Recall: 0.9285714285714286
Confusion Matrix:
[[23  3]
 [ 3 39]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88        26
         1.0       0.93      0.93      0.93        42

    accuracy                           0.91        68
   macro avg       0.91      0.91      0.91        68
weighted avg       0.91      0.91      0.91        68

