In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier


In [2]:
# Load clean_data.csv from the project directory.
# The directory defaults to the original location, but can be overridden with
# the FRAUD_PROJECT_DIR environment variable so the notebook can run on another
# machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "FRAUD_PROJECT_DIR",
    "/home/haidau_rares/projects/fraud_detection_iphones",
)
df = pd.read_csv(os.path.join(PROJECT_DIR, "clean_data.csv"))

print("Shape:", df.shape)
# Keep the preview as the last expression so Jupyter renders it as a rich table.
df.head()


Shape: (296, 10)


Unnamed: 0,phone_model,memory_size,condition,number_of_photos,publisher_rating,publisher_num_ratings,publisher_join_date,clean_price,price_ratio,fraud_flag
0,iPhone 1st gen,256,Utilizat,3,4.7,29,octombrie 2023,6500.0,1.420765,0
1,iPhone 1st gen,512,Utilizat,2,5.0,55,iulie 2020,7300.0,1.0,0
2,iPhone 13,128,Utilizat,5,4.7,29,octombrie 2023,1600.0,1.0,0
3,iPhone 12 Mini,128,Utilizat,5,4.7,29,octombrie 2023,700.0,0.823529,1
4,iPhone 13 Pro,256,Utilizat,5,4.7,29,octombrie 2023,2100.0,1.02439,0


In [3]:
# Label column the classifier learns to predict.
target = "fraud_flag"

# Predictor columns passed to the model. `publisher_join_date` appears in the
# data preview but is not listed here, so it is not used as a feature.
# NOTE(review): the preview shows a fraud-labelled row with price_ratio < 1 --
# confirm fraud_flag is not derived from price_ratio / clean_price, otherwise
# these two features would leak the label.
features = [
    "phone_model", "memory_size", "condition", "number_of_photos",
    "publisher_rating", "publisher_num_ratings",
    "clean_price", "price_ratio",
]

# Split the frame into the design matrix and the label vector.
X, y = df[features], df[target]


In [4]:
# Column groups: each group gets its own preprocessing branch below.
categorical_features = ["phone_model", "condition"]
numeric_features = [
    "memory_size", "number_of_photos",
    "publisher_rating", "publisher_num_ratings",
    "clean_price", "price_ratio",
]

# Numeric branch: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" means categories not seen
# during fit do not raise an error at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions similar in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the split sizes as a quick sanity check.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (236, 8)
Test size: (60, 8)


In [6]:
# Hyper-parameters for the gradient-boosted classifier, gathered in one place
# so they are easy to read and tune.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

# Chain preprocessing and the classifier so that both are fitted together and
# the same transformations are applied at prediction time.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model),
])


In [7]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y=y_train)
print("Training completed.")


Training completed.


In [8]:
# Hard class predictions and the predicted probability taken from column 1 of
# predict_proba for the held-out rows.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Compute all metrics first, then print them in the same order as before.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)  # uses probabilities, not hard labels

print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)
print("ROC-AUC Score:", test_auc)


Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93        37
           1       0.88      0.91      0.89        23

    accuracy                           0.92        60
   macro avg       0.91      0.92      0.91        60
weighted avg       0.92      0.92      0.92        60

Confusion Matrix:
 [[34  3]
 [ 2 21]]
ROC-AUC Score: 0.982373678025852


In [9]:
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
# The project directory defaults to the original location, but can be
# overridden with the FRAUD_PROJECT_DIR environment variable so the notebook
# can run on another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "FRAUD_PROJECT_DIR",
    "/home/haidau_rares/projects/fraud_detection_iphones",
)
MODEL_PATH = os.path.join(PROJECT_DIR, "fraud_model_xgb.pkl")

joblib.dump(pipeline, MODEL_PATH)
print("Model saved successfully!")


Model saved successfully!


In [10]:
# Reload the persisted pipeline to confirm that the saved artifact is readable.
# The project directory defaults to the original location, but can be
# overridden with the FRAUD_PROJECT_DIR environment variable so the notebook
# can run on another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "FRAUD_PROJECT_DIR",
    "/home/haidau_rares/projects/fraud_detection_iphones",
)
MODEL_PATH = os.path.join(PROJECT_DIR, "fraud_model_xgb.pkl")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only point MODEL_PATH at artifacts produced by a trusted source.
model_loaded = joblib.load(MODEL_PATH)
print("Model loaded:", model_loaded)


Model loaded: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['memory_size',
                                                   'number_of_photos',
                                                   'publisher_rating',
                                                   'publisher_num_ratings',
                                                   'clean_price',
                                                   'price_ratio']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHo