In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/online-payment-fraud-detection/onlinefraud.csv


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Location of the Kaggle online-payment-fraud CSV.
dataset_path = '/kaggle/input/online-payment-fraud-detection/onlinefraud.csv'

# Parse the CSV into a DataFrame; later cells read and rebind `dataset`.
dataset = pd.read_csv(filepath_or_buffer=dataset_path)

# Print a plain-text preview of the first five rows.
print(dataset.head(n=5))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [6]:
# Check for missing values (prints the per-column null counts)
print(dataset.isnull().sum())

# If there were missing values, you could forward-fill them like this
# (DataFrame.fillna(method='ffill') is deprecated in recent pandas releases):
# dataset = dataset.ffill()

# Feature Engineering
# Guard on the raw 'type' column: pd.get_dummies consumes it, so re-running
# this cell on the already-encoded frame would otherwise raise a KeyError.
# The derived features live inside the same guard so they are only ever
# computed from the raw (unscaled) balances.
if 'type' in dataset.columns:
    # One-hot encoding for 'type'; drop_first=True omits the indicator column
    # of the first category.
    dataset = pd.get_dummies(dataset, columns=['type'], drop_first=True)

    # Creating additional features: the residual between the recorded balance
    # change and the transaction amount, on the origin and destination side.
    dataset['errorBalanceOrig'] = dataset['newbalanceOrig'] + dataset['amount'] - dataset['oldbalanceOrg']
    dataset['errorBalanceDest'] = dataset['oldbalanceDest'] + dataset['amount'] - dataset['newbalanceDest']


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [7]:
# Dropping unnecessary columns
# Reassignment with errors='ignore' (instead of inplace=True) makes this cell
# safe to re-run: if the columns are already gone it is a no-op, not a KeyError.
dataset = dataset.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

# Normalizing numerical features
# The column list is named once so the selection passed to the scaler and the
# assignment target cannot drift apart.
numeric_columns = [
    'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'errorBalanceOrig', 'errorBalanceDest',
]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split performed in a later cell, so held-out rows
# contribute to the min/max used for scaling. If a scale-sensitive model is
# added, fit the scaler on the training split only and transform the others.
scaler = MinMaxScaler()
dataset[numeric_columns] = scaler.fit_transform(dataset[numeric_columns])

# Display the first few rows of the processed dataset
print(dataset.head())

   step    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     1  0.000106       0.002855        0.003233        0.000000   
1     1  0.000020       0.000357        0.000391        0.000000   
2     1  0.000002       0.000003        0.000000        0.000000   
3     1  0.000002       0.000003        0.000000        0.000059   
4     1  0.000126       0.000697        0.000603        0.000000   

   newbalanceDest  isFraud  isFlaggedFraud  type_CASH_OUT  type_DEBIT  \
0             0.0        0               0          False       False   
1             0.0        0               0          False       False   
2             0.0        1               0          False       False   
3             0.0        1               0           True       False   
4             0.0        0               0          False       False   

   type_PAYMENT  type_TRANSFER  errorBalanceOrig  errorBalanceDest  
0          True          False      1.081718e-10          0.852022  
1          Tru

In [8]:
# Separate the label column from the predictors.
y = dataset['isFraud']
X = dataset.drop('isFraud', axis=1)

# Two-stage stratified split: hold out 30% first, then halve that hold-out
# into validation and test (70% / 15% / 15% overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
print(
    f'Training set: {X_train.shape}, '
    f'Validation set: {X_val.shape}, '
    f'Test set: {X_test.shape}'
)

Training set: (4453834, 13), Validation set: (954393, 13), Test set: (954393, 13)


In [9]:
# Decision tree baseline: fit with default hyperparameters and a fixed seed.
# (fit() returns the estimator itself, so it can be chained onto construction.)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Validation-set outputs: second predict_proba column (used as the fraud
# score for ROC-AUC) and the hard class labels.
y_val_pred_proba_dt = dt_model.predict_proba(X_val)[:, 1]
y_val_pred_dt = dt_model.predict(X_val)

# Score-based metric, then the label-based metrics.
roc_auc_dt = roc_auc_score(y_val, y_val_pred_proba_dt)
f1_dt = f1_score(y_val, y_val_pred_dt)
recall_dt = recall_score(y_val, y_val_pred_dt)
precision_dt = precision_score(y_val, y_val_pred_dt)

print(
    f'Decision Tree - Precision: {precision_dt:.4f}, '
    f'Recall: {recall_dt:.4f}, '
    f'F1-Score: {f1_dt:.4f}, '
    f'ROC-AUC: {roc_auc_dt:.4f}'
)


Decision Tree - Precision: 0.9935, Recall: 0.9878, F1-Score: 0.9906, ROC-AUC: 0.9939


In [10]:
# accuracy_score is not among the sklearn.metrics names imported earlier in the
# notebook (only precision/recall/f1/roc_auc are), so import it here; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import accuracy_score

# Fraction of validation rows whose predicted label matches the true label.
# NOTE(review): presumably the classes are heavily imbalanced, in which case
# accuracy says little about fraud detection quality; prefer the
# precision/recall/F1 figures from the previous cell. TODO confirm class ratio.
accuracy = accuracy_score(y_val, y_val_pred_dt)
print(f'Accuracy on Validation Set: {accuracy:.4f}')

Accuracy on Validation Set: 1.0000


In [7]:
# # Random Forest Model Training and Evaluation (currently disabled: every line in this cell is commented out)
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X_train, y_train)

# # Make predictions
# y_val_pred_rf = rf_model.predict(X_val)
# y_val_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

In [None]:
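# # Evaluate the Random Forest model (currently disabled: needs y_val_pred_rf / y_val_pred_proba_rf, which are only defined in the commented-out training cell)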
# precision_rf = precision_score(y_val, y_val_pred_rf)
# recall_rf = recall_score(y_val, y_val_pred_rf)
# f1_rf = f1_score(y_val, y_val_pred_rf)
# roc_auc_rf = roc_auc_score(y_val, y_val_pred_proba_rf)

# print(f'Random Forest - Precision: {precision_rf:.4f}, Recall: {recall_rf:.4f}, F1-Score: {f1_rf:.4f}, ROC-AUC: {roc_auc_rf:.4f}')
