In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


In [41]:
df = pd.read_csv('transaction.csv')
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [42]:
import numpy as np
# 2. Preprocessing

# 2.1. Retain only relevant transaction types (CASH_OUT and TRANSFER)
df = df[df['type'].isin(['CASH_OUT', 'TRANSFER'])]

# 2.2. Remove zero or negative amounts
df = df[df['amount'] > 0]

# 2.3. Check for zero balances and handle them
zero_originator = df[df['oldbalanceOrg'] == 0]
zero_recipient = df[df['oldbalanceDest'] == 0]

print(f'Number of transactions with originator balance of 0: {len(zero_originator)}')
print(f'Number of transactions with recipient balance of 0: {len(zero_recipient)}')

# 2.4. Calculate balance inaccuracies
df['origBalance_inacc'] = (df['oldbalanceOrg'] - df['amount']) - df['newbalanceOrig']
df['destBalance_inacc'] = (df['oldbalanceDest'] + df['amount']) - df['newbalanceDest']

# 2.5. Drop unnecessary columns (nameOrig and nameDest)
df = df.drop(columns=['nameOrig', 'nameDest'])

# 2.6. One-hot encode the 'type' column
df = pd.get_dummies(df, columns=['type'], prefix='type')

# 2.7. Time Step Processing
# Here you could include additional processing based on the timestamp if needed.
# For example, extracting hour from timestamp if 'step' is the timestamp
# 4. Feature Engineering: Time-Based Features
# Define constants
MINUTES_IN_HOUR = 60
MINUTES_IN_DAY = 24 * MINUTES_IN_HOUR

# Calculate minute within the day
df['minute_of_day'] = df['step'] % MINUTES_IN_DAY

# Extract hour and minute
df['hour'] = (df['minute_of_day'] // MINUTES_IN_HOUR).astype(int)
df['minute'] = (df['minute_of_day'] % MINUTES_IN_HOUR).astype(int)

# Create cyclical features
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)

# (Optional) Extract day features if applicable
DAYS_IN_WEEK = 7
df['day'] = (df['step'] // MINUTES_IN_DAY).astype(int)
df['day_of_week'] = df['day'] % DAYS_IN_WEEK
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / DAYS_IN_WEEK)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / DAYS_IN_WEEK)

# Drop intermediate time-related columns
df.drop(['minute_of_day', 'hour', 'minute', 'day', 'day_of_week'], axis=1, inplace=True, errors='ignore')



Number of transactions with originator balance of 0: 1308566
Number of transactions with recipient balance of 0: 389314


In [43]:
# 3. Splitting the dataset
X = df.drop(columns=['isFraud'])
y = df['isFraud']



In [44]:
X.columns

Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFlaggedFraud', 'origBalance_inacc',
       'destBalance_inacc', 'type_CASH_OUT', 'type_TRANSFER', 'hour_sin',
       'hour_cos', 'minute_sin', 'minute_cos', 'day_of_week_sin',
       'day_of_week_cos'],
      dtype='object')

In [45]:
# 4. Handling Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)



In [46]:
X_resampled.head(10)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,origBalance_inacc,destBalance_inacc,type_CASH_OUT,type_TRANSFER,hour_sin,hour_cos,minute_sin,minute_cos,day_of_week_sin,day_of_week_cos
0,1,181.0,181.0,0.0,0.0,0.0,0,0.0,181.0,False,True,0.0,1.0,0.104528,0.994522,0.0,1.0
1,1,181.0,181.0,0.0,21182.0,0.0,0,0.0,21363.0,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
2,1,229133.94,15325.0,0.0,5083.0,51513.44,0,-213808.9,182703.5,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
3,1,215310.3,705.0,0.0,22425.0,0.0,0,-214605.3,237735.3,False,True,0.0,1.0,0.104528,0.994522,0.0,1.0
4,1,311685.89,10835.0,0.0,6267.0,2719172.89,0,-300850.9,-2401220.0,False,True,0.0,1.0,0.104528,0.994522,0.0,1.0
5,1,110414.71,26845.41,0.0,288800.0,2415.16,0,-83569.3,396799.55,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
6,1,56953.9,1942.02,0.0,70253.0,64106.18,0,-55011.88,63100.72,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
7,1,5346.89,0.0,0.0,652637.0,6453430.91,0,-5346.89,-5795447.02,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
8,1,23261.3,20411.53,0.0,25742.0,0.0,0,-2849.77,49003.3,True,False,0.0,1.0,0.104528,0.994522,0.0,1.0
9,1,62610.8,79114.0,16503.2,517.0,8383.29,0,-3.637979e-12,54744.51,False,True,0.0,1.0,0.104528,0.994522,0.0,1.0


In [30]:
# 5. Standardize the dataset
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)



In [47]:
X_resampled

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,origBalance_inacc,destBalance_inacc,type_CASH_OUT,type_TRANSFER,hour_sin,hour_cos,minute_sin,minute_cos,day_of_week_sin,day_of_week_cos
0,1,1.810000e+02,1.810000e+02,0.0,0.0,0.000000e+00,0,0.00,1.810000e+02,False,True,0.000000,1.000000,0.104528,0.994522,0.0,1.0
1,1,1.810000e+02,1.810000e+02,0.0,21182.0,0.000000e+00,0,0.00,2.136300e+04,True,False,0.000000,1.000000,0.104528,0.994522,0.0,1.0
2,1,2.291339e+05,1.532500e+04,0.0,5083.0,5.151344e+04,0,-213808.94,1.827035e+05,True,False,0.000000,1.000000,0.104528,0.994522,0.0,1.0
3,1,2.153103e+05,7.050000e+02,0.0,22425.0,0.000000e+00,0,-214605.30,2.377353e+05,False,True,0.000000,1.000000,0.104528,0.994522,0.0,1.0
4,1,3.116859e+05,1.083500e+04,0.0,6267.0,2.719173e+06,0,-300850.89,-2.401220e+06,False,True,0.000000,1.000000,0.104528,0.994522,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5524387,306,1.751431e+06,1.751431e+06,0.0,0.0,0.000000e+00,0,0.00,1.751431e+06,False,True,0.965926,0.258819,0.601577,0.792225,0.0,1.0
5524388,297,5.699096e+06,5.699096e+06,0.0,0.0,0.000000e+00,0,0.00,5.699096e+06,False,True,0.900787,0.416077,0.253671,-0.224316,0.0,1.0
5524389,361,2.795646e+04,2.795646e+04,0.0,0.0,2.795646e+04,0,0.00,0.000000e+00,True,False,0.376118,0.074953,-0.516239,-0.360544,0.0,1.0
5524390,286,1.885185e+05,1.885185e+05,0.0,0.0,0.000000e+00,0,0.00,1.885185e+05,False,True,0.628284,0.412935,0.572714,-0.642234,0.0,1.0


In [31]:
# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)



In [49]:
X_train

array([[ 0.6143663 ,  4.79350973,  3.74460984, ...,  1.49750878,
         0.        ,  0.        ],
       [ 2.19606378, -0.3977534 , -0.32260869, ..., -1.42828867,
         0.        ,  0.        ],
       [ 0.45273298, -0.29667799, -0.19747694, ..., -0.6090207 ,
         0.        ,  0.        ],
       ...,
       [-1.01928474, -0.08848932, -0.04685326, ...,  1.65526569,
         0.        ,  0.        ],
       [ 1.75157215,  3.05495297,  2.2274149 , ...,  1.06119884,
         0.        ,  0.        ],
       [-0.13030149, -0.36717227, -0.08400155, ..., -0.28310646,
         0.        ,  0.        ]])

In [33]:
# 7. Model Training - XGBoost Classifier
xgb_model = XGBClassifier(
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=200, 
    subsample=0.9,
    use_label_encoder=False,  # Disable label encoding if you're using newer XGBoost versions
    random_state=42
)

# 8. Training the Model
xgb_model.fit(X_train, y_train)



Parameters: { "use_label_encoder" } are not used.



In [34]:
# 9. Predictions
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

In [35]:
# 10. Evaluate the Model

# ROC AUC Score
roc_auc_train = roc_auc_score(y_train, xgb_model.predict_proba(X_train)[:, 1])
roc_auc_test = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

# Precision and Recall
train_precision = precision_score(y_train, y_pred_train)
train_recall = recall_score(y_train, y_pred_train)

test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)

# Output the results
print(f"Train ROC AUC: {roc_auc_train}")
print(f"Test ROC AUC: {roc_auc_test}")
print(f"Train Precision: {train_precision}, Train Recall: {train_recall}")
print(f"Test Precision: {test_precision}, Test Recall: {test_recall}")


Train ROC AUC: 0.9999999975520018
Test ROC AUC: 0.9999996078182893
Train Precision: 0.9999927569522911, Train Recall: 0.9999932743093621
Test Precision: 0.9998661634734799, Test Recall: 0.9999397088419395


In [36]:
from sklearn.metrics import classification_report
import pandas as pd

# Generate the classification report
report = classification_report(y_test, y_pred_test, output_dict=True)

# Convert the report into a DataFrame
report_df = pd.DataFrame(report).transpose()

# Improve the format: round to 3 decimal places and set the index name
report_df = report_df.round(3)
report_df.index.name = 'Metrics'

# Display the improved classification report
print(report_df)


              precision  recall  f1-score    support
Metrics                                             
0                   1.0     1.0       1.0   828009.0
1                   1.0     1.0       1.0   829309.0
accuracy            1.0     1.0       1.0        1.0
macro avg           1.0     1.0       1.0  1657318.0
weighted avg        1.0     1.0       1.0  1657318.0


In [37]:
import joblib

# Save the model
joblib.dump(xgb_model, 'xgb_model.joblib')

['xgb_model.joblib']

In [38]:
# Load the model (when needed)
loaded_model = joblib.load('xgb_model.joblib')

In [40]:
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']