In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
# Load the dataset
data = pd.read_csv('C:/Users/bam/Desktop/Week-6/notebooks/processed_bati_bank_data.csv')  

In [7]:
# Check the DataFrame
print(data.head())  
print(data.columns)  

# Define Features and Target Variable
X = data[['Amount', 'Value', 'normalized_amount', 'standardized_amount', 'PricingStrategy', 
           'transaction_hour', 'transaction_day', 'transaction_month']]  
y = data['FraudResult']  

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId    Amount     Value       TransactionStartTime  \
0  CustomerId_4406 -0.046371 -0.072291  2018-11-15 02:18:49+00:00   
1  CustomerId_4406 -0.054643 -0.080251  2018-11-15 02:19:08+00:00   
2  CustomerId_4683 -0.050426 -0.076352  2018-11-15 02:44:21+00:00   
3   CustomerId_988  0.107717  0.096648  2018-11-15 03:32:55+00:00   
4   CustomerId_988 -0.059704 -0.075183  2018-11-15 03:34:21+00:00   

   PricingStrategy  FraudResult  ...  ProductCategory_other  \
0                2            0  ...           

In [8]:
# Train Logistic Regression Model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Train Random Forest Model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Make Predictions
y_pred_logistic = logistic_model.predict(X_test)
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the Models
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logistic)}")
print(f"Precision: {precision_score(y_test, y_pred_logistic)}")
print(f"Recall: {recall_score(y_test, y_pred_logistic)}")
print(f"F1 Score: {f1_score(y_test, y_pred_logistic)}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_logistic)}")

print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_rf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf)}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_rf)}")


Logistic Regression Performance:
Accuracy: 0.9983797627136362
Precision: 0.7777777777777778
Recall: 0.19444444444444445
F1 Score: 0.3111111111111111
ROC-AUC: 0.5971698579765292

Random Forest Performance:
Accuracy: 0.9997386714054252
Precision: 0.8974358974358975
Recall: 0.9722222222222222
F1 Score: 0.9333333333333333
ROC-AUC: 0.9860063826197251


In [9]:
# Hyperparameter Tuning with Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, 
                           scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Best estimator performance
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

print("Tuned Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_best_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_best_rf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_best_rf)}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_best_rf)}")


Best parameters found:  {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Tuned Random Forest Performance:
Accuracy: 0.9997386714054252
Precision: 0.918918918918919
Recall: 0.9444444444444444
F1 Score: 0.9315068493150684
ROC-AUC: 0.9721436758536827
