In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "..../balanced_fraud_metadata.csv"
df = pd.read_csv(file_path)

# Drop rows with missing values so every transformer below sees complete records
df = df.dropna()

# Separate predictors (metadata columns + raw message text) from the target
X_metadata = df.drop(columns=["fraud_label"])
y = df["fraud_label"]

# Split the raw rows BEFORE fitting any transformer. Fitting TF-IDF or the
# scaler on the full dataset would let test-set statistics (vocabulary, IDF
# weights, means, variances) leak into the training features and make the
# reported metrics optimistic. test_size and random_state are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_metadata, y, test_size=0.2, random_state=42
)

# Convert message text into numerical features using TF-IDF.
# Vocabulary and IDF weights are learned from the training messages only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))
X_text_train = tfidf_vectorizer.fit_transform(X_train_raw["message"])
X_text_test = tfidf_vectorizer.transform(X_test_raw["message"])

# Standardize metadata features (everything except the message column).
# NOTE(review): assumes all remaining columns are numeric -- confirm against the CSV schema.
scaler = StandardScaler()
X_meta_train = scaler.fit_transform(X_train_raw.drop(columns=["message"]))
X_meta_test = scaler.transform(X_test_raw.drop(columns=["message"]))

# Combine metadata + text features; CSR keeps the matrices row-indexable
X_train = hstack([X_meta_train, X_text_train], format="csr")
X_test = hstack([X_meta_test, X_text_test], format="csr")

# Full-dataset feature matrices, kept under their original names for any later
# cell that still refers to them. They are produced with the train-fitted
# transformers, so building them does not reintroduce leakage.
X_text = tfidf_vectorizer.transform(df["message"])
X_metadata_scaled = scaler.transform(X_metadata.drop(columns=["message"]))
X_combined = hstack([X_metadata_scaled, X_text])

In [None]:
# Fit a depth-limited random forest on the training split
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred = model.predict(X_test)

# Score the predictions: accuracy first, then precision, recall and F1
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric with four decimal places
print("✅ Model Performance:")
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")