In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
import joblib
import os

In [2]:
identifiers_csv = "Dataset/Base_with_identifiers.csv"

# Load the base table and store each row's index label in a `user_id` column.
df = pd.read_csv(identifiers_csv)
df = df.assign(user_id=df.index)

# Write the table back to the same path without the index column.
df.to_csv(identifiers_csv, index=False)

In [None]:
from graph_feature import load_graph, extract_single_user_features

# Graph loaded from the pickled file under models/.
G = load_graph("models/user_graph.gpickle")

# Parse the identifiers table once; the two-column view handed to the graph
# helper and the frame used for the merge both come from this single read.
base = pd.read_csv("Dataset/Base_with_identifiers.csv")
user_nodes_df = base[["user_id", "fraud_bool"]]

# One helper call per row.
# NOTE(review): the column assignment below assumes the helper returns a
# Series per row, so that `apply` yields a DataFrame -- confirm.
# NOTE(review): `fraud_bool` is passed to the helper; verify that a row's own
# label is not folded into its graph features (target leakage).
graph_feats = user_nodes_df.apply(lambda row: extract_single_user_features(G, row), axis=1)
graph_feats["user_id"] = user_nodes_df["user_id"]

# Left join keeps every base row even if it has no graph features.
df_with_graph = pd.merge(base, graph_feats, on="user_id", how="left")

df_with_graph.to_csv("Dataset/Base_with_graph_features.csv", index=False)

In [None]:
# Identifier columns are removed before modelling; errors='ignore' skips any
# of them that are absent from the file.
identifier_cols = ["user_id", "email", "device_id", "ip_address"]
df = pd.read_csv("Dataset/Base_with_graph_features.csv").drop(
    columns=identifier_cols, errors='ignore'
)

In [None]:
# Remove three columns that are not used as model inputs.
# Raises KeyError if any of them is missing (e.g. when the cell is re-run).
unused_cols = ['employment_status', 'housing_status', 'payment_type']
df = df.drop(columns=unused_cols)

In [None]:
# Separate the label column (y) from the remaining feature columns (x).
target_col = "fraud_bool"
x = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Columns that get one-hot encoded.
cat_cols = ['device_os', 'source']
# Columns holding the graph-derived features.
graph_cols = ['num_connections', 'num_shared_identifiers', 'fraud_neighbors', 'fraud_ratio_neighbors', 'component_size']

# Everything else in x, in its original column order.
grouped_cols = set(cat_cols) | set(graph_cols)
num_cols = [name for name in x.columns if name not in grouped_cols]

In [None]:
# Encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = encoder.fit_transform(x[cat_cols])
cat_encoded_df = pd.DataFrame(X_cat, columns=encoder.get_feature_names_out(cat_cols), index=x.index)

# Combine all
X_all = pd.concat([x[num_cols], cat_encoded_df, x[graph_cols]], axis=1)

# Save feature order for prediction
with open("feature_order.txt", "w") as f:
    f.write("\n".join(X_all.columns))

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)



In [13]:
# Random forest, fitted on the SMOTE-resampled training data.
rf_settings = dict(n_estimators=100, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(**rf_settings).fit(X_smote, y_smote)

# Isolation forest, fitted on the training data before resampling.
iso_settings = dict(contamination=0.01, random_state=42)
iso = IsolationForest(**iso_settings)
iso.fit(X_train)

In [14]:
# (object, destination) pairs, written in this order.
artifacts = [
    (rf, "models/rf_model.pkl"),
    (iso, "models/iso_model.pkl"),
    (encoder, "models/encoder.pkl"),
    (scaler, "models/scaler.pkl"),
]
written = [joblib.dump(obj, destination) for obj, destination in artifacts]

# Leave the last dump's return value as the cell output.
written[-1]

['models/scaler.pkl']

In [15]:
# Hard class predictions on the held-out rows, used for the per-class report.
y_pred = rf.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

# Second predict_proba column, scored with ROC-AUC.
y_proba = rf.predict_proba(X_test)[:, 1]
auc_value = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", auc_value)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    197794
           1       0.28      0.05      0.08      2206

    accuracy                           0.99    200000
   macro avg       0.64      0.52      0.54    200000
weighted avg       0.98      0.99      0.98    200000

ROC-AUC Score: 0.8466544084149346


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.pipeline import Pipeline

# Random-forest settings to sample from.
param_grid={
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]
}

# Resample inside each CV fold instead of searching on X_smote. With
# pre-resampled data, synthetic points interpolated from a minority row end up
# in the validation fold while that row is in the fit fold, which inflates the
# recall being optimised. The imblearn Pipeline applies SMOTE during fit only.
# A separate name is used so the fitted `rf` from the training cell stays
# usable, and the estimator is seeded so the search is reproducible.
search_pipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("rf", RandomForestClassifier(random_state=42)),
])

# Pipeline parameters are addressed as "<step>__<param>".
pipeline_grid = {"rf__" + name: values for name, values in param_grid.items()}

search = RandomizedSearchCV(search_pipeline, pipeline_grid, n_iter=10, cv=3, scoring='recall', random_state=42)
search.fit(X_train, y_train)

# Strip the step prefix so the printed keys are plain RandomForestClassifier
# argument names (search.best_params_ itself keeps the "rf__" prefix).
best_rf_params = {name.split("__", 1)[1]: value for name, value in search.best_params_.items()}
print("Best parameters found: ", best_rf_params)


In [None]:
# Rebind `iso` to an isolation forest with different settings
# (300 trees, 90% of rows sampled per tree, contamination 0.02).
iso_settings = dict(
    n_estimators=300,
    max_samples=0.9,
    contamination=0.02,
    random_state=42,
)
iso = IsolationForest(**iso_settings)
iso.fit(X_train)