In [44]:
!pip install xgboost



In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, classification_report, confusion_matrix,
                            roc_auc_score, precision_recall_curve, auc)

# Set a fixed random seed; it is passed as `random_state` to the train/test
# split and to the tree-based models so runs are reproducible.
RANDOM_STATE = 42

In [39]:
# Load the transaction log; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Synthetic_Financial_datasets_log.csv")

In [40]:
# Show (rows, columns) of the loaded frame as the cell output.
df.shape

(6362620, 11)

In [41]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [42]:
# Inspect how the target label is distributed (counts per class and the
# share of rows labelled as fraud).
fraud_counts = df['isFraud'].value_counts()
fraud_share_pct = df['isFraud'].mean() * 100

print("\nFraud distribution:")
print(fraud_counts)
print(f"\nFraud percentage: {fraud_share_pct:.3f}%")


Fraud distribution:
0    6354407
1       8213
Name: isFraud, dtype: int64

Fraud percentage: 0.129%


In [43]:
# Create new features
def create_features(df):
    """Return a new frame holding ``df`` plus engineered transaction features.

    The input frame is not modified. All original columns are kept, followed
    (in this order) by:

    * ``type_encoded`` - integer code of ``type``; codes follow the
      lexicographically sorted distinct values (missing values get ``-1``).
    * ``type_<value>`` - one 0/1 indicator column per distinct ``type``.
    * ``amount_log`` - ``log1p(amount)``.
    * ``orgBalanceDiff`` - ``oldbalanceOrg - newbalanceOrig``.
    * ``destBalanceDiff`` - ``newbalanceDest - oldbalanceDest``.
    * ``orgBalanceBecomesZero`` / ``destBalanceBecomesZero`` - 1 when the
      new balance is exactly 0 and the old balance was positive.
    * ``amountToOrgBalanceRatio`` - ``amount / (oldbalanceOrg + 1)``; the
      ``+ 1`` avoids division by zero for empty origin accounts.

    Memory note: the new columns are built on their own and attached with a
    single ``pd.concat``, so the full frame is materialised once instead of
    being deep-copied and then re-copied by a second concat. Codes and
    indicator columns use one-byte integer dtypes. This lowers peak memory
    on multi-million-row inputs; it cannot guarantee the frame fits in RAM.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``type``, ``amount``, ``oldbalanceOrg``,
        ``newbalanceOrig``, ``oldbalanceDest`` and ``newbalanceDest``.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index.
    """
    # Encode the transaction type once. Categories are sorted explicitly so
    # the integer codes do not depend on row order or on any category order
    # the column may already carry.
    type_values = df['type']
    type_cat = pd.Categorical(
        type_values, categories=sorted(type_values.dropna().unique())
    )
    type_encoded = pd.DataFrame({'type_encoded': type_cat.codes}, index=df.index)

    # Create dummy variables for transaction types (uint8 keeps them at one
    # byte per row regardless of the pandas default dtype).
    type_dummies = pd.get_dummies(
        pd.Series(type_cat, index=df.index, name='type'),
        prefix='type',
        dtype=np.uint8,
    )

    amount = df['amount']
    old_org, new_org = df['oldbalanceOrg'], df['newbalanceOrig']
    old_dest, new_dest = df['oldbalanceDest'], df['newbalanceDest']

    engineered = pd.DataFrame(
        {
            # Amount features
            'amount_log': np.log1p(amount),
            # Balance features
            'orgBalanceDiff': old_org - new_org,
            'destBalanceDiff': new_dest - old_dest,
            # Zero balance indicators
            'orgBalanceBecomesZero': ((new_org == 0) & (old_org > 0)).astype(np.int8),
            'destBalanceBecomesZero': ((new_dest == 0) & (old_dest > 0)).astype(np.int8),
            # Amount to balance ratios
            'amountToOrgBalanceRatio': amount / (old_org + 1),
        },
        index=df.index,
    )

    # One concat builds the result; `df` itself is left untouched.
    return pd.concat([df, type_encoded, type_dummies, engineered], axis=1)

# Process data: build the engineered feature table from the raw frame.
# `df` is passed through create_features, which returns a separate frame.
processed_df = create_features(df)

MemoryError: 

In [None]:
# Remove the account identifiers, the raw `type` string and the
# `isFlaggedFraud` column before modelling.
cols_to_drop = ['nameOrig', 'nameDest', 'type', 'isFlaggedFraud']
processed_df = processed_df.drop(columns=cols_to_drop)

In [None]:
# Separate the label from the feature matrix
y = processed_df['isFraud']
X = processed_df.drop(columns=['isFraud'])

# Hold out 30% of the rows for testing, stratified on the label so both
# splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Standardise features: statistics are learned on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report how many training rows fall in each class
print(f"Class distribution in training set: {np.bincount(y_train)}")

In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, score it on the test split, print a report, return metrics.

    The estimator is fitted in place on ``(X_train, y_train)``. Hard
    predictions come from ``model.predict``; if the estimator exposes
    ``predict_proba``, the second column is used as the positive-class score
    for ROC AUC, otherwise ROC AUC is skipped.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and ``predict`` (and optionally ``predict_proba``).
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels used for all reported metrics.
    model_name : str
        Label used in the printed report and stored under ``'name'``.

    Returns
    -------
    dict
        Keys ``'model'``, ``'name'``, ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'``, ``'roc_auc'`` (``None`` when no probabilities
        are available), ``'predictions'`` and ``'probabilities'``.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Print results
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Compare against None explicitly: a ROC AUC of 0.0 is a valid (if bad)
    # score and must still be reported, which a truthiness test would hide.
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc:.4f}")

    # Show confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Return results
    return {
        'model': model,
        'name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'predictions': y_pred,
        'probabilities': y_prob
    }

# Candidate classifiers keyed by display name (insertion order is the order
# in which they are trained). The tree-based models up-weight the minority
# (fraud) class via class_weight / scale_pos_weight to counter the imbalance.
models = {}

models['Random Forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight={0: 1, 1: 20},  # minority class counts 20x
    random_state=RANDOM_STATE,
)

models['XGBoost'] = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    scale_pos_weight=20,  # minority class counts 20x
    random_state=RANDOM_STATE,
)

models['KNN'] = KNeighborsClassifier(n_neighbors=5, weights='distance')

# An 'SVM' entry (SVC with C=10, kernel='rbf', probability=True,
# class_weight={0:1, 1:20}, random_state=RANDOM_STATE) was removed because
# it takes too long to train.

In [None]:
# Fit every candidate on the scaled training data and collect its metrics
results = []
for name, model in models.items():
    result = evaluate_model(
        model=model,
        X_train=X_train_scaled,
        y_train=y_train,
        X_test=X_test_scaled,
        y_test=y_test,
        model_name=name,
    )
    results.append(result)