In [3]:
import pandas as pd
import numpy as np

In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# 1. **Load the dataset**
df = pd.read_csv('transaction.csv')

# 2. **Preprocessing**

# Retain only relevant transaction types (CASH_OUT and TRANSFER)
df = df[df['type'].isin(['CASH_OUT', 'TRANSFER'])]

# Remove zero or negative amounts
df = df[df['amount'] > 0]

# Create balance inaccuracies
df['origBalance_inacc'] = (df['oldbalanceOrg'] - df['amount']) - df['newbalanceOrig']
df['destBalance_inacc'] = (df['oldbalanceDest'] + df['amount']) - df['newbalanceDest']

# Drop unnecessary columns (nameOrig and nameDest)
df = df.drop(columns=['nameOrig', 'nameDest'])

# One-hot encode the 'type' column
df = pd.get_dummies(df, columns=['type'], prefix='type')

# 3. **Splitting the dataset**
X = df.drop(columns=['isFraud'])
y = df['isFraud']

# 4. **Train-test split**
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 5. **Handling Class Imbalance using SMOTE**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 6. **Standardize the dataset**
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# 7. **Random Forest Model Creation**
rf_model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    random_state=42
)

# 8. **Training the Model**
rf_model.fit(X_train_resampled, y_train_resampled)

# 9. **Make Predictions**
y_pred_train = rf_model.predict(X_train_resampled)
y_pred_test = rf_model.predict(X_test)

# 10. **Evaluate the Model**

# ROC AUC Score
roc_auc_train = roc_auc_score(y_train_resampled, rf_model.predict_proba(X_train_resampled)[:, 1])
roc_auc_test = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# Precision and Recall
train_precision = precision_score(y_train_resampled, y_pred_train)
train_recall = recall_score(y_train_resampled, y_pred_train)

test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)

# Output the results
print(f"Train ROC AUC: {roc_auc_train}")
print(f"Test ROC AUC: {roc_auc_test}")
print(f"Train Precision: {train_precision}, Train Recall: {train_recall}")
print(f"Test Precision: {test_precision}, Test Recall: {test_recall}")


Train ROC AUC: 1.0
Test ROC AUC: 0.9983366016709728
Train Precision: 1.0, Train Recall: 1.0
Test Precision: 0.9737574552683896, Test Recall: 0.9959333062220415


In [5]:
from sklearn.metrics import classification_report
import pandas as pd

# Generate the classification report
report = classification_report(y_test, y_pred_test, output_dict=True)

# Convert the report into a DataFrame
report_df = pd.DataFrame(report).transpose()

# Improve the format: round to 3 decimal places and set the index name
report_df = report_df.round(3)
report_df.index.name = 'Metrics'

# Display the improved classification report
print(report_df)


              precision  recall  f1-score   support
Metrics                                            
0                 1.000   1.000     1.000  828659.0
1                 0.974   0.996     0.985    2459.0
accuracy          1.000   1.000     1.000       1.0
macro avg         0.987   0.998     0.992  831118.0
weighted avg      1.000   1.000     1.000  831118.0
