In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("danielolusesi/financial-fraud-detection")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/danielolusesi/financial-fraud-detection?dataset_version_number=1...


100%|███████████████████████████████████████| 25.0k/25.0k [00:00<00:00, 177kB/s]

Extracting files...
Path to dataset files: /Users/mac/.cache/kagglehub/datasets/danielolusesi/financial-fraud-detection/versions/1





In [256]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [257]:
df = pd.read_csv("fraud data.csv")

In [258]:
df.head()

Unnamed: 0,Transaction_Amount,Merchant_Category,User_Country,Transaction_Country,Device_Type,Transaction_Day,User_Avg_Spend,User_Transaction_Frequency,Is_Fraud
0,27067,Hotels,China,China,Laptop,Weekend,6753,7,Fraud
1,16167,Bakeries,China,Brazil,Tablet,Weekend,4457,3,Fraud
2,3433,Taxi services,Argentina,Argentina,Mobile,Weekend,3687,2,No Fraud
3,24272,Liquor stores,Australia,South Africa,Mobile,Weekday,6403,9,Fraud
4,7700,Clothing stores,Australia,South Africa,Laptop,Weekday,6671,8,Fraud


In [259]:
le = LabelEncoder()

In [260]:
df['Merchant_Category'] = le.fit_transform(df['Merchant_Category'])

In [261]:
df['User_Country'] = le.fit_transform(df['User_Country'])

In [262]:
df['Transaction_Country'] = le.fit_transform(df['Transaction_Country'])

In [263]:
df['Device_Type'] = le.fit_transform(df['Device_Type'])

In [264]:
df['Transaction_Day'] = le.fit_transform(df['Transaction_Day'])

In [265]:
df['Is_Fraud'] = le.fit_transform(df['Is_Fraud'])

In [266]:
df['Is_Fraud'].value_counts(normalize=True)

Is_Fraud
0    0.634
1    0.366
Name: proportion, dtype: float64

In [267]:
X = df.drop(columns=['Is_Fraud'])
y = df['Is_Fraud']

In [268]:
y.shape

(2000,)

In [269]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [270]:
smote = SMOTE()

In [271]:
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [215]:
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

In [216]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

In [162]:
params = {
    'rf__n_estimators': range(25, 100, 10),
    'rf__max_depth': range(10, 50, 10)
}

In [163]:
model = GridSearchCV(
    pipeline,
    param_grid = params,
    cv = cv,
    n_jobs = 5,
    verbose =1
)

In [164]:
model.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [165]:
model.score(X_train_smote, y_train_smote)

1.0

In [167]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       280
           1       0.82      0.95      0.88       120

    accuracy                           0.92       400
   macro avg       0.90      0.93      0.91       400
weighted avg       0.93      0.92      0.92       400



In [170]:
from sklearn.tree import DecisionTreeClassifier

In [174]:
 pipeline1 = Pipeline([
    ('ss', StandardScaler()),
    ('dtc', DecisionTreeClassifier())
])

In [175]:
params1 = {
    'dtc__max_depth': range(10, 50, 10),
    'dtc__min_samples_split': range(25, 100, 10),
    'dtc__min_samples_leaf': range(25, 100, 10)
}

In [176]:
model1 = GridSearchCV(
    pipeline1,
    param_grid = params1,
    cv = cv,
    n_jobs = 5,
    verbose = 1
)

In [178]:
model1.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [179]:
model1.score(X_train_smote, y_train_smote)

0.9089068825910931

In [180]:
print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92       280
           1       0.76      0.93      0.84       120

    accuracy                           0.89       400
   macro avg       0.86      0.90      0.88       400
weighted avg       0.90      0.89      0.89       400



In [186]:
pipeline2 = Pipeline([
    ('ss', StandardScaler()),
    ('svc', SVC())
])

In [187]:
params2 = {
    'svc__C': [0.1, 1, 10, 50],
    'svc__kernel': ['rbf']
}

In [188]:
model2 = GridSearchCV(
    pipeline2,
    param_grid = params2,
    cv = cv,
    n_jobs = 5,
    verbose = 1
)

In [189]:
model2.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [190]:
model2.score(X_train_smote, y_train_smote)

0.9281376518218624

In [191]:
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91       280
           1       0.74      0.97      0.84       120

    accuracy                           0.89       400
   macro avg       0.86      0.91      0.88       400
weighted avg       0.91      0.89      0.89       400



In [192]:
best_model = model.best_estimator_

In [247]:
import joblib

In [195]:
joblib.dump(best_model, "fraud_model_1.pkl")

['fraud_model_1.pkl']

In [218]:
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']

In [239]:
print(le.classes_)

['Fraud' 'No Fraud']


['fraud_model_1_encoder.pkl']