In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import requests


# Define paths
drive_path = '/content/drive/My Drive/'
dataset_path = os.path.join(drive_path, 'Fraud.csv')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv(dataset_path)

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [8]:
# One-hot encode 'type'
df_encoded = pd.get_dummies(df, columns=['type'], drop_first=True)

# Label encode 'nameOrig' and 'nameDest'
label_encoder = LabelEncoder()
df_encoded['nameOrig'] = label_encoder.fit_transform(df_encoded['nameOrig'])
df_encoded['nameDest'] = label_encoder.fit_transform(df_encoded['nameDest'])

In [9]:
df_encoded.corr()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
step,1.0,-0.025316,-0.000223,-0.005684,-0.006077,0.013204,-0.001815,-0.019174,0.04509,,-0.012585,-0.007266,0.014719,0.004437
amount,-0.025316,1.0,0.001702,0.004804,-0.001199,-0.340821,0.215394,0.311775,0.129342,,0.071154,-0.047818,-0.397385,0.539283
nameOrig,-0.000223,0.001702,1.0,8.9e-05,0.00013,-0.001191,0.00178,0.001921,-9.1e-05,,-0.000136,0.001576,-0.001261,0.000947
oldbalanceOrg,-0.005684,0.004804,8.9e-05,1.0,0.999047,-0.16018,0.09332,0.064048,0.003878,,-0.204698,-0.02208,-0.186211,-0.082012
newbalanceOrig,-0.006077,-0.001199,0.00013,0.999047,1.0,-0.163493,0.095198,0.063724,-0.009421,,-0.2147,-0.02246,-0.190069,-0.087849
nameDest,0.013204,-0.340821,-0.001191,-0.16018,-0.163493,1.0,-0.260543,-0.282058,-0.019706,,-0.455833,-0.051128,0.858566,-0.183411
oldbalanceDest,-0.001815,0.215394,0.00178,0.09332,0.095198,-0.260543,1.0,0.978386,-0.007506,,0.129993,0.010525,-0.303838,0.130246
newbalanceDest,-0.019174,0.311775,0.001921,0.064048,0.063724,-0.282058,0.978386,1.0,-0.000424,,0.160024,0.007654,-0.328944,0.197603
isFraud,0.04509,0.129342,-9.1e-05,0.003878,-0.009421,-0.019706,-0.007506,-0.000424,1.0,,0.010298,-0.002731,-0.023504,0.049176
isFlaggedFraud,,,,,,,,,,,,,,


In [10]:
X = df_encoded.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df_encoded['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.9997374200324645
Confusion Matrix:
[[209233      6]
 [    49    172]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209239
           1       0.97      0.78      0.86       221

    accuracy                           1.00    209460
   macro avg       0.98      0.89      0.93    209460
weighted avg       1.00      1.00      1.00    209460



In [None]:
#Such a high accuracy makes me think model is overfitting on text data so i check for that and then i attempt to fix it via oversampling of minority class

In [11]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores)}")
print(f"Standard Deviation of CV Accuracy: {np.std(cv_scores)}")

Cross-Validation Accuracy Scores: [0.9991311  0.99956078 0.99971355 0.9999809  0.27474936]
Mean CV Accuracy: 0.8546271364460996
Standard Deviation of CV Accuracy: 0.28993902131495286


In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.1)  # Over-sample minority class
under = RandomUnderSampler(sampling_strategy=0.5)  # Under-sample majority class

pipeline = Pipeline(steps=[('o', over), ('u', under), ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
cv_scores_resampled = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores with Resampling: {cv_scores_resampled}")
print(f"Mean CV Accuracy with Resampling: {np.mean(cv_scores_resampled)}")
print(f"Standard Deviation of CV Accuracy with Resampling: {np.std(cv_scores_resampled)}")


Cross-Validation Accuracy Scores with Resampling: [0.9990022  0.99856775 0.96869092 0.98906712 0.10521341]
Mean CV Accuracy with Resampling: 0.8121082784302492
Standard Deviation of CV Accuracy with Resampling: 0.353618265096627


In [14]:
import xgboost as xgb

# Initialize the model
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Evaluate using cross-validation
cv_scores_xgb = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

print(f"XGBoost Cross-Validation Accuracy Scores: {cv_scores_xgb}")
print(f"Mean CV Accuracy for XGBoost: {np.mean(cv_scores_xgb)}")
print(f"Standard Deviation of CV Accuracy for XGBoost: {np.std(cv_scores_xgb)}")


XGBoost Cross-Validation Accuracy Scores: [0.99910723 0.99155447 0.99009357 0.9844314  0.70032942]
Mean CV Accuracy for XGBoost: 0.9331032177981478
Standard Deviation of CV Accuracy for XGBoost: 0.1164810738001414


In [None]:
#the model was initially overfittin to the data so after applying xgboost and applying sampling techniques to oversample and undersample we are able to achieve much higher performance than before