In [1]:
import pandas as pd

# Load processed fraud data
fraud_df = pd.read_csv("../data/processed/fraud_processed.csv")

# Quick check
fraud_df.head()
fraud_df.info()
fraud_df['class'].value_counts(normalize=True)  # check imbalance


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129146 entries, 0 to 129145
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 129146 non-null  int64  
 1   signup_time             129146 non-null  object 
 2   purchase_time           129146 non-null  object 
 3   purchase_value          129146 non-null  int64  
 4   device_id               129146 non-null  object 
 5   source                  129146 non-null  object 
 6   browser                 129146 non-null  object 
 7   sex                     129146 non-null  object 
 8   age                     129146 non-null  int64  
 9   ip_address              129146 non-null  int64  
 10  class                   129146 non-null  int64  
 11  lower_bound_ip_address  129146 non-null  float64
 12  upper_bound_ip_address  129146 non-null  float64
 13  country                 129146 non-null  object 
 14  time_since_signup   

class
0    0.905007
1    0.094993
Name: proportion, dtype: float64

In [2]:
from sklearn.model_selection import train_test_split

# Features
X = fraud_df.drop(columns=['class'])  

# Target
y = fraud_df['class']

# Stratified train-test split (keep class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Numeric and categorical columns
num_cols = ['purchase_value', 'age', 'time_since_signup', 'purchase_hour']  # adjust
cat_cols = ['sex', 'source', 'browser']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# Pipeline with Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Train
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Evaluate
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[15003  8373]
 [  703  1751]]
              precision    recall  f1-score   support

           0       0.96      0.64      0.77     23376
           1       0.17      0.71      0.28      2454

    accuracy                           0.65     25830
   macro avg       0.56      0.68      0.52     25830
weighted avg       0.88      0.65      0.72     25830



In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

num_cols = ['purchase_value', 'age', 'time_since_signup', 'purchase_hour']  # numeric
cat_cols = ['sex', 'source', 'browser']  # categorical

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200, max_depth=10, class_weight='balanced', random_state=42
    ))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred_rf = pipeline.predict(X_test)
y_prob_rf = pipeline.predict_proba(X_test)[:,1]

# Evaluate
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


[[23376     0]
 [ 1103  1351]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     23376
           1       1.00      0.55      0.71      2454

    accuracy                           0.96     25830
   macro avg       0.98      0.78      0.84     25830
weighted avg       0.96      0.96      0.95     25830



In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

num_cols = ['purchase_value', 'age', 'time_since_signup', 'purchase_hour']
cat_cols = ['sex', 'source', 'browser']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200, max_depth=10, class_weight='balanced', random_state=42
    ))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=skf, scoring='f1')

print("F1 scores:", scores)
print("Mean F1:", np.mean(scores))


F1 scores: [0.69588313 0.70928703 0.68821801 0.70261697 0.71644909]
Mean F1: 0.7024908455251105


In [7]:
import joblib
import os

# Ensure models directory exists
os.makedirs("../models", exist_ok=True)

# pipeline is your trained Pipeline (preprocessing + RF)
joblib.dump(pipeline, r"../models/rf_fraud_pipeline.pkl")
print("Random Forest pipeline saved successfully!")



Random Forest pipeline saved successfully!


In [8]:
fraud_df['is_fraud'] = np.where(fraud_df['class'] == 'fraud', 1, 0)
fraud_df['is_fraud'].value_counts(normalize=True)


is_fraud
0    1.0
Name: proportion, dtype: float64

In [9]:
from imblearn.pipeline import Pipeline   # NOT sklearn.pipeline
from imblearn.over_sampling import SMOTE


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

num_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']
cat_cols = ['sex', 'source', 'browser']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=42
    ))
])


In [11]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
from collections import Counter
print("Class distribution BEFORE SMOTE:", Counter(y_train))


Class distribution BEFORE SMOTE: Counter({0: 93502, 1: 9814})


In [13]:
from collections import Counter
print("Class distribution BEFORE SMOTE:", Counter(y_train))


Class distribution BEFORE SMOTE: Counter({0: 93502, 1: 9814})


In [14]:
X_train.dtypes


user_id                     int64
signup_time                object
purchase_time              object
purchase_value              int64
device_id                  object
source                     object
browser                    object
sex                        object
age                         int64
ip_address                  int64
lower_bound_ip_address    float64
upper_bound_ip_address    float64
country                    object
time_since_signup         float64
hour_of_day                 int64
day_of_week                 int64
transactions_per_user       int64
purchase_hour               int64
dtype: object

In [15]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
X_test_num  = X_test.select_dtypes(include=['int64', 'float64'])

print(X_train_num.shape)
print(X_test_num.shape)


(103316, 11)
(25830, 11)


In [16]:
print(X_train_num.isnull().sum().sum())


0


In [17]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_num, y_train)

print("Class distribution AFTER SMOTE:", Counter(y_train_res))


Class distribution AFTER SMOTE: Counter({0: 93502, 1: 93502})


In [18]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_res, y_train_res)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
from sklearn.metrics import classification_report

y_pred = rf_model.predict(X_test_num)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.98      0.97     23376
           1       0.75      0.56      0.64      2454

    accuracy                           0.94     25830
   macro avg       0.85      0.77      0.80     25830
weighted avg       0.94      0.94      0.94     25830



In [20]:
X_test_num


Unnamed: 0,user_id,purchase_value,age,ip_address,lower_bound_ip_address,upper_bound_ip_address,time_since_signup,hour_of_day,day_of_week,transactions_per_user,purchase_hour
116396,360572,34,31,2487467045,2.487419e+09,2.487484e+09,624.725833,0,4,1,0
79018,244596,18,24,1177415539,1.177354e+09,1.177420e+09,908.137778,20,5,1,20
36486,113421,39,32,1192301392,1.192296e+09,1.192362e+09,619.768056,22,4,1,22
11359,35236,19,38,3629014908,3.628859e+09,3.629122e+09,0.000278,23,4,1,23
52721,163254,10,27,3621382730,3.607101e+09,3.623879e+09,0.000278,4,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...
97904,303219,21,33,1896319754,1.895825e+09,1.896350e+09,455.967222,11,6,1,11
52376,162292,44,20,2287148511,2.285896e+09,2.290090e+09,485.448889,4,5,1,4
111210,344464,50,44,545976713,5.368709e+08,5.536481e+08,1566.488889,17,2,1,17
3656,11368,10,36,362435568,3.523215e+08,3.690988e+08,321.081667,22,5,1,22


In [21]:
import pandas as pd

X_train = pd.read_csv(
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\data\processed\X_train.csv"
)
y_train = pd.read_csv(
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\data\processed\y_train.csv"
).squeeze()


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf)
    ]
)


In [23]:
rf_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
import joblib

joblib.dump(
    rf_pipeline,
    r"C:\Users\jkk\OneDrive\Desktop\fraud-detection\models\rf_fraud_pipeline.pkl"
)


['C:\\Users\\jkk\\OneDrive\\Desktop\\fraud-detection\\models\\rf_fraud_pipeline.pkl']

In [25]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)


In [26]:
print(X_train.isnull().sum().sum())
print(y_train.isnull().sum())


0
0


In [27]:
X_train = X_train.fillna(0)


In [28]:
type(X_train)


pandas.core.frame.DataFrame

In [29]:
X_train.dtypes


user_id                     int64
signup_time                object
purchase_time              object
purchase_value              int64
device_id                  object
source                     object
browser                    object
sex                        object
age                         int64
ip_address                  int64
lower_bound_ip_address    float64
upper_bound_ip_address    float64
country                    object
time_since_signup         float64
hour_of_day                 int64
day_of_week                 int64
transactions_per_user       int64
ip_int                      int64
dtype: object

In [30]:
cols_to_drop = [
    'signup_time',
    'purchase_time',
    'device_id',
    'source',
    'browser',
    'sex',
    'country'
]

X_train = X_train.drop(columns=cols_to_drop)
X_test  = X_test.drop(columns=cols_to_drop)


In [31]:
print(X_train.shape)
print(X_test.shape)


(103316, 11)
(25830, 11)


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [36]:
X_test = X_test[X_train.columns]


In [37]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns


In [38]:
OneHotEncoder(handle_unknown='ignore')


0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [39]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf)
    ]
)

# Fit pipeline on X_train
rf_pipeline.fit(X_train, y_train)

# Make predictions on X_test
X_test = X_test[X_train.columns]  # Ensures matching columns
y_pred = rf_pipeline.predict(X_test)
y_proba = rf_pipeline.predict_proba(X_test)[:,1]


In [40]:
set(X_train.columns) - set(X_test.columns)  # Should be empty
set(X_test.columns) - set(X_train.columns)  # Should be empty


set()

In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, confusion_matrix


In [41]:
# Predictions
y_pred = rf_pipeline.predict(X_test)
y_proba = rf_pipeline.predict_proba(X_test)[:,1]

# Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_pr = average_precision_score(y_test, y_proba)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("AUC-PR:", auc_pr)


Precision: 0.0
Recall: 0.0
F1-score: 0.0
AUC-PR: 0.6277902531098929


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [42]:
# Default probability predictions
y_proba = rf_pipeline.predict_proba(X_test)[:,1]

# Set threshold lower, e.g., 0.3
y_pred_new = (y_proba >= 0.3).astype(int)

from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred_new)
recall = recall_score(y_test, y_pred_new)
f1 = f1_score(y_test, y_pred_new)

print("Adjusted Threshold Metrics")
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Adjusted Threshold Metrics
Precision: 0.0
Recall: 0.0
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [44]:
from imblearn.over_sampling import SMOTE

# 1. Preprocess X_train (numerics + encoded categoricals)
X_train_processed = preprocessor.fit_transform(X_train)

# 2. Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_processed, y_train)

# 3. Fit the model directly on resampled data
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_res, y_res)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [45]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'  # optional if you use SMOTE
)


In [46]:
rf.fit(X_train_res, y_train_res)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [49]:
# Get feature names from preprocessor
def get_feature_names(column_transformer):
    feature_names = []

    for name, transformer, columns in column_transformer.transformers_:
        if name != 'remainder':
            if hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns)
            else:
                names = columns
            feature_names.extend(names)
    return feature_names

# Apply to your preprocessor
feature_names = get_feature_names(preprocessor)
len(feature_names), X_sample_preprocessed.shape[1]  # should match


(302161, 302161)

In [51]:
import shap

# Sample 1000 rows from X_test
X_sample = X_test.sample(1000, random_state=42)

# Use new SHAP API with pipeline
explainer = shap.Explainer(rf_pipeline, X_sample)  # pass the whole pipeline + raw data
shap_values = explainer(X_sample)  # outputs a SHAP object

# Summary plot (top 20 features)
shap.summary_plot(shap_values, max_display=20)


TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['user_id', 'purchase_value', 'age', 'ip_address',
       'lower_bound_ip_address', 'upper_bound_ip_address', 'time_since_signup',
       'hour_of_day', 'day_of_week', 'transactions_per_user', 'purchase_hour'],
      dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['signup_time', 'purchase_time', 'device_id', 'source', 'browser', 'sex',
       'country'],
      dtype='object'))])),
                ('classifier',
                 RandomForestClassifier(max_depth=10, n_estimators=200,
                                        n_jobs=-1, random_state=42))])