In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
# https://www.kaggle.com/datasets/sriharshaeedala/financial-fraud-detection-dataset/data
!kaggle datasets download -d sriharshaeedala/financial-fraud-detection-dataset

Dataset URL: https://www.kaggle.com/datasets/sriharshaeedala/financial-fraud-detection-dataset
License(s): CC-BY-SA-4.0
financial-fraud-detection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!unzip financial-fraud-detection-dataset.zip

Archive:  financial-fraud-detection-dataset.zip
replace Synthetic_Financial_datasets_log.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Import libraries

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv("/content/Synthetic_Financial_datasets_log.csv")


print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

Number of samples: 6362620
Number of features: 11


In [5]:
# Check the dataset structure
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Prepare Data

In [6]:
# selecting the columns of numerical type
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

numeric_data = df[numeric_columns]

# pearson corrleation matrix of the numerical data
correlation = numeric_data.corr()

In [7]:
print(f"correlation between all features and Class \n{(correlation['isFraud'].sort_values(ascending=False))}")

correlation between all features and Class 
isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [8]:
target_correlation = correlation['isFraud'].drop('isFraud')


In [9]:
threshold= 0.02
features=target_correlation[abs(target_correlation) >= threshold].index

In [10]:
features=features[:2]

In [11]:
copy_df=df[features].join(df['type'])
copy_df=copy_df.join(df['isFraud'])
df=copy_df

In [12]:
df.head()

Unnamed: 0,step,amount,type,isFraud
0,1,9839.64,PAYMENT,0
1,1,1864.28,PAYMENT,0
2,1,181.0,TRANSFER,1
3,1,181.0,CASH_OUT,1
4,1,11668.14,PAYMENT,0


In [13]:
# Encode the 'type' column
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

In [14]:
df = df.dropna(subset=['isFraud'])

In [15]:
# Separate feature variables and target variable
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [16]:
# Standardize the data
scaler = MinMaxScaler()
df['amount'] = scaler.fit_transform(df[['amount']])

In [17]:
# Initialize SMOTE and apply it to the training data only
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [18]:
df2 = pd.DataFrame(X_smote)
df2['isFraud'] = y_smote
df2

Unnamed: 0,step,amount,type,isFraud
0,1,9.839640e+03,3,0
1,1,1.864280e+03,3,0
2,1,1.810000e+02,4,1
3,1,1.810000e+02,1,1
4,1,1.166814e+04,3,0
...,...,...,...,...
12708809,253,1.620903e+06,1,1
12708810,617,1.000000e+07,1,1
12708811,110,9.041246e+05,1,1
12708812,48,1.367124e+05,2,1


In [19]:
df2.shape

(12708814, 4)

In [20]:
df2.drop_duplicates(inplace=True)
df2.shape

(11260389, 4)

In [21]:
import pickle

# Save datasets
with open('df2.pkl', 'wb') as f:
    pickle.dump(df2, f)

## Split Data

In [22]:
# Separate feature variables and target variable
X = df2.drop('isFraud', axis=1)
y = df2['isFraud']

In [23]:
# Split the data into train (80%), validation (10%), and test sets (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Build Models

# Logitic regression

In [24]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()

In [25]:
LR.fit(X_train, y_train)

In [26]:
# Validate the model on the validation set
y_val_pred = LR.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Validation Classification Report:\n {classification_report(y_val, y_val_pred)}")

# Test the model on the test set
y_test_pred = LR.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Classification Report:\n {classification_report(y_test, y_test_pred)}")

Validation Accuracy: 0.7919077403180529
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.91      0.83    634692
           1       0.84      0.64      0.73    491347

    accuracy                           0.79   1126039
   macro avg       0.80      0.77      0.78   1126039
weighted avg       0.80      0.79      0.79   1126039

Test Accuracy: 0.791160874534541
Test Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.91      0.83    634692
           1       0.84      0.64      0.73    491347

    accuracy                           0.79   1126039
   macro avg       0.80      0.77      0.78   1126039
weighted avg       0.80      0.79      0.79   1126039



In [27]:
# Train
y_train_pred = LR.predict(X_train)
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Train Classification Report:\n {classification_report(y_train, y_train_pred)}")

Train Accuracy: 0.7919794287741619
Train Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.91      0.83   5077536
           1       0.84      0.64      0.73   3930775

    accuracy                           0.79   9008311
   macro avg       0.80      0.78      0.78   9008311
weighted avg       0.80      0.79      0.79   9008311



# Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()

In [29]:
DT.fit(X_train, y_train)

In [30]:
# Validate the model on the validation set
y_val_pred = DT.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Validation Classification Report:\n {classification_report(y_val, y_val_pred)}")

# Test the model on the test set
y_test_pred = DT.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Classification Report:\n {classification_report(y_test, y_test_pred)}")

Validation Accuracy: 0.9757654930246643
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    634692
           1       0.96      0.98      0.97    491347

    accuracy                           0.98   1126039
   macro avg       0.97      0.98      0.98   1126039
weighted avg       0.98      0.98      0.98   1126039

Test Accuracy: 0.9758729493383445
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    634692
           1       0.96      0.98      0.97    491347

    accuracy                           0.98   1126039
   macro avg       0.97      0.98      0.98   1126039
weighted avg       0.98      0.98      0.98   1126039



In [31]:
# Train
y_train_pred = DT.predict(X_train)
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Train Classification Report:\n {classification_report(y_train, y_train_pred)}")

Train Accuracy: 0.9999924514151431
Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   5077536
           1       1.00      1.00      1.00   3930775

    accuracy                           1.00   9008311
   macro avg       1.00      1.00      1.00   9008311
weighted avg       1.00      1.00      1.00   9008311



# Random Forest

In [32]:
# Initialize Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

In [None]:
# Train the model on the resampled training data
clf.fit(X_train, y_train)

In [None]:
# Validate the model on the validation set
y_val_pred = clf.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Validation Classification Report:\n {classification_report(y_val, y_val_pred)}")

# Test the model on the test set
y_test_pred = clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Classification Report:\n {classification_report(y_test, y_test_pred)}")

Validation Accuracy: 0.9678545769729112
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97    634692
           1       0.95      0.98      0.96    491347

    accuracy                           0.97   1126039
   macro avg       0.97      0.97      0.97   1126039
weighted avg       0.97      0.97      0.97   1126039

Test Accuracy: 0.9680153174090773
Test Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97    634692
           1       0.95      0.98      0.96    491347

    accuracy                           0.97   1126039
   macro avg       0.97      0.97      0.97   1126039
weighted avg       0.97      0.97      0.97   1126039



In [None]:
# Train
y_train_pred = clf.predict(X_train)
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Train Classification Report:\n {classification_report(y_train, y_train_pred)}")

Train Accuracy: 0.9999832377012738
Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   5077536
           1       1.00      1.00      1.00   3930775

    accuracy                           1.00   9008311
   macro avg       1.00      1.00      1.00   9008311
weighted avg       1.00      1.00      1.00   9008311



In [None]:
clf.score(X_train, y_train)

In [None]:
clf.predict([[1	,181.00	,1	]])

In [None]:
y_val_pred

In [None]:
clf.predict([[1	,11668.14	,3	]])

In [None]:
import pickle

# Save the model
with open('RandomForest_model.pkl', 'wb') as f:
    pickle.dump(clf, f)


In [None]:

with open('DecisionTree_model.pkl', 'wb') as f:
    pickle.dump(clf, f)


In [None]:
S = np.array([[ 29321424.,  12418817.,  -2901417.],
 [-43431152. , 14825383.  ,34996340.],
 [ 35928588. ,-41231700., -39343300.]])

In [None]:
DT.predict(S)

In [None]:
s = np.array([[-19448110.,  51600428., -56082636.],
 [-27089762. , 14075190.  , 2285261.],
 [-47130472. , 25616126. ,-11983711.]])

In [None]:
DT.predict(s)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Logistic Regression
y_train_pred_LR = LR.predict(X_train)
y_val_pred_LR = LR.predict(X_val)
y_test_pred_LR = LR.predict(X_test)

# Decision Tree
y_train_pred_DT = DT.predict(X_train)
y_val_pred_DT = DT.predict(X_val)
y_test_pred_DT = DT.predict(X_test)

# Random Forest
y_train_pred_RF = RF.predict(X_train)
y_val_pred_RF = RF.predict(X_val)
y_test_pred_RF = RF.predict(X_test)

# Create a comparison table
results = {
    'Metric': ['Train Accuracy', 'Validation Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1-Score'],
    'Logistic Regression': [
        accuracy_score(y_train, y_train_pred_LR),
        accuracy_score(y_val, y_val_pred_LR),
        accuracy_score(y_test, y_test_pred_LR),
        # Add precision, recall, and F1-score using classification_report
    ],
    'Decision Tree': [
        accuracy_score(y_train, y_train_pred_DT),
        accuracy_score(y_val, y_val_pred_DT),
        accuracy_score(y_test, y_test_pred_DT),
        # Add precision, recall, and F1-score
    ],
    'Random Forest': [
        accuracy_score(y_train, y_train_pred_RF),
        accuracy_score(y_val, y_val_pred_RF),
        accuracy_score(y_test, y_test_pred_RF),
        # Add precision, recall, and F1-score
    ]
}

import pandas as pd

# Create a DataFrame for better visualization
comparison_df = pd.DataFrame(results)
print(comparison_df)
