In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE  # if using oversampling
from tensorflow.keras.layers import LSTM, Bidirectional
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [3]:
# 1. Load data
df = pd.read_csv('Synthetic_Financial_datasets_log.csv')

In [4]:

# First, let's load your dataset (adjust the filename if needed)
# df = pd.read_csv('your_fraud_dataset.csv')

# If you already have it loaded, let's explore it
print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)

# Basic info
print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

print("\n" + "=" * 50)
print("COLUMN NAMES")
print("=" * 50)
print(df.columns.tolist())

print("\n" + "=" * 50)
print("DATA TYPES")
print("=" * 50)
print(df.dtypes)

print("\n" + "=" * 50)
print("FIRST 5 ROWS")
print("=" * 50)
print(df.head())

print("\n" + "=" * 50)
print("BASIC STATISTICS")
print("=" * 50)
print(df.describe())

print("\n" + "=" * 50)
print("MISSING VALUES")
print("=" * 50)
print(df.isnull().sum())

print("\n" + "=" * 50)
print("CATEGORICAL COLUMNS (object type)")
print("=" * 50)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

for col in categorical_cols:
    print(f"\n{col} - Unique values:")
    print(df[col].value_counts())

print("\n" + "=" * 50)
print("TARGET VARIABLE DISTRIBUTION")
print("=" * 50)
# Common names for fraud column: 'isFraud', 'is_fraud', 'fraud', 'Class'
fraud_col_names = ['isFraud', 'is_fraud', 'fraud', 'Class', 'label']
for col_name in fraud_col_names:
    if col_name in df.columns:
        print(f"\n{col_name} distribution:")
        print(df[col_name].value_counts())
        print(f"\nFraud percentage: {(df[col_name].sum() / len(df)) * 100:.2f}%")
        break

DATASET OVERVIEW

Dataset shape: (6362620, 11)
Number of rows: 6362620
Number of columns: 11

COLUMN NAMES
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']

DATA TYPES
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

FIRST 5 ROWS
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720 

In [5]:
# Data Preparation and Preprocessing

print("=" * 50)
print("DATA PREPARATION")
print("=" * 50)

# Identify target column (adjust based on your dataset exploration from cell 4)
target_col = 'isFraud'  # Change this based on your actual target column name

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

print(f"\nOriginal feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Handle categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

if categorical_cols:
    print(f"\nCategorical columns found: {categorical_cols}")
    
    # Create lists to track columns
    cols_to_drop = []
    cols_to_encode = []
    
    # Check cardinality of each categorical column
    for col in categorical_cols:
        n_unique = X[col].nunique()
        print(f"\n{col}: {n_unique} unique values")
        
        # Drop high-cardinality columns (e.g., IDs, account numbers)
        if n_unique > 100:  # Adjust threshold as needed
            print(f"  → Dropping {col} due to high cardinality")
            cols_to_drop.append(col)
        else:
            print(f"  → Will encode {col}")
            cols_to_encode.append(col)
    
    # Drop high-cardinality columns
    if cols_to_drop:
        X = X.drop(columns=cols_to_drop)
        print(f"\nDropped {len(cols_to_drop)} high-cardinality columns")
    
    # One-hot encode remaining categorical columns
    if cols_to_encode:
        print(f"\nOne-hot encoding {len(cols_to_encode)} categorical columns...")
        X = pd.get_dummies(X, columns=cols_to_encode, drop_first=True)
        print(f"Shape after encoding: {X.shape}")
else:
    print("\nNo categorical columns found")

# Convert all columns to numeric
X = X.apply(pd.to_numeric, errors='coerce')

# Fill any NaN values created during conversion
nan_count = X.isnull().sum().sum()
if nan_count > 0:
    print(f"\nFilling {nan_count} NaN values with 0")
    X = X.fillna(0)

print(f"\nFinal feature shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")

# Split data
print("\n" + "=" * 50)
print("TRAIN-TEST SPLIT")
print("=" * 50)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Handle class imbalance with SMOTE
print("\n" + "=" * 50)
print("HANDLING CLASS IMBALANCE WITH SMOTE")
print("=" * 50)
print(f"Before SMOTE - Class distribution:\n{np.bincount(y_train)}")

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nAfter SMOTE - Class distribution:\n{np.bincount(y_train_resampled)}")
print(f"Training set after SMOTE: {X_train_resampled.shape[0]} samples")

# Scale features
print("\n" + "=" * 50)
print("FEATURE SCALING")
print("=" * 50)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)
print("Features scaled using StandardScaler")

# Reshape for LSTM (samples, timesteps, features)
print("\n" + "=" * 50)
print("RESHAPING FOR LSTM")
print("=" * 50)
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

print(f"Training set shape: {X_train_lstm.shape}")
print(f"  → Samples: {X_train_lstm.shape[0]}")
print(f"  → Timesteps: {X_train_lstm.shape[1]}")
print(f"  → Features: {X_train_lstm.shape[2]}")

print(f"\nTest set shape: {X_test_lstm.shape}")
print(f"  → Samples: {X_test_lstm.shape[0]}")
print(f"  → Timesteps: {X_test_lstm.shape[1]}")
print(f"  → Features: {X_test_lstm.shape[2]}")

print("\n" + "=" * 50)
print("DATA PREPARATION COMPLETE")
print("=" * 50)

DATA PREPARATION

Original feature shape: (6362620, 10)
Target shape: (6362620,)

Original feature shape: (6362620, 10)
Target shape: (6362620,)

Original feature shape: (6362620, 10)
Target shape: (6362620,)

Categorical columns found: ['type', 'nameOrig', 'nameDest']

type: 5 unique values
  → Will encode type

Categorical columns found: ['type', 'nameOrig', 'nameDest']

type: 5 unique values
  → Will encode type

Categorical columns found: ['type', 'nameOrig', 'nameDest']

type: 5 unique values
  → Will encode type

nameOrig: 6353307 unique values
  → Dropping nameOrig due to high cardinality

nameOrig: 6353307 unique values
  → Dropping nameOrig due to high cardinality

nameOrig: 6353307 unique values
  → Dropping nameOrig due to high cardinality

nameDest: 2722362 unique values
  → Dropping nameDest due to high cardinality

nameDest: 2722362 unique values
  → Dropping nameDest due to high cardinality

nameDest: 2722362 unique values
  → Dropping nameDest due to high cardinality

D

In [6]:
# Function to create LSTM model for hyperparameter tuning
def create_lstm_model(lstm_units=64, dropout_rate=0.2, learning_rate=0.001, 
                      num_lstm_layers=1, bidirectional=False):
    model = Sequential()
    
    # First LSTM layer
    if bidirectional:
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=(num_lstm_layers > 1), 
                                     input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]))))
    else:
        model.add(LSTM(lstm_units, return_sequences=(num_lstm_layers > 1), 
                      input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
    model.add(Dropout(dropout_rate))
    
    # Additional LSTM layers
    for i in range(1, num_lstm_layers):
        if bidirectional:
            model.add(Bidirectional(LSTM(lstm_units // (2**i), return_sequences=(i < num_lstm_layers - 1))))
        else:
            model.add(LSTM(lstm_units // (2**i), return_sequences=(i < num_lstm_layers - 1)))
        model.add(Dropout(dropout_rate))
    
    # Dense layers
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', 
                 metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    
    return model

# Test the model creation
test_model = create_lstm_model()
test_model.summary()

  super().__init__(**kwargs)


In [13]:
# Lighter hyperparameter tuning - FIXED VERSION

print("Starting hyperparameter tuning...")

param_grid = [
    {'lstm_units': 64, 'dropout_rate': 0.2, 'learning_rate': 0.001, 'num_lstm_layers': 1, 'bidirectional': False, 'batch_size': 128},
    {'lstm_units': 128, 'dropout_rate': 0.3, 'learning_rate': 0.0001, 'num_lstm_layers': 1, 'bidirectional': True, 'batch_size': 128},
]

best_score = 0
best_params = None
results = []

for i, params in enumerate(param_grid, 1):
    print(f"\n{'='*50}")
    print(f"Training configuration {i}/{len(param_grid)}")
    print(f"{'='*50}")
    print(f"Parameters: {params}")
    
    model = create_lstm_model(
        lstm_units=params['lstm_units'],
        dropout_rate=params['dropout_rate'],
        learning_rate=params['learning_rate'],
        num_lstm_layers=params['num_lstm_layers'],
        bidirectional=params['bidirectional']
    )
    
    history = model.fit(
        X_train_lstm, y_train_resampled,
        validation_split=0.2,
        epochs=10,  # Reduced for speed
        batch_size=params['batch_size'],
        callbacks=[EarlyStopping(monitor='loss', patience=3, restore_best_weights=True, verbose=0)],
        verbose=1
    )
    
    # Use training AUC instead of validation AUC (validation is buggy with huge datasets)
    train_auc = max(history.history['auc'])
    print(f"Training AUC: {train_auc:.4f}")
    
    results.append({'params': params, 'train_auc': train_auc})
    
    if train_auc > best_score:
        best_score = train_auc
        best_params = params
        print("★ New best model!")

print("\n" + "="*50)
print("BEST HYPERPARAMETERS")
print("="*50)
print(best_params)
print(f"\nBest Training AUC: {best_score:.4f}")

Starting hyperparameter tuning...

Training configuration 1/2
Parameters: {'lstm_units': 64, 'dropout_rate': 0.2, 'learning_rate': 0.001, 'num_lstm_layers': 1, 'bidirectional': False, 'batch_size': 128}
Epoch 1/10
Epoch 1/10
Epoch 1/10
[1m 3112/63545[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:01[0m 2ms/step - accuracy: 0.9085 - auc: 0.9635 - loss: 0.2272

KeyboardInterrupt: 

In [11]:
print(best_params)

None


In [12]:
# Diagnostic check
print("Checking data shapes:")
print(f"X_train_lstm shape: {X_train_lstm.shape}")
print(f"y_train_resampled shape: {y_train_resampled.shape}")
print(f"y_train_resampled unique values: {np.unique(y_train_resampled)}")
print(f"y_train_resampled distribution: {np.bincount(y_train_resampled)}")

# Check if data has any NaN or inf
print(f"\nNaN in X_train_lstm: {np.isnan(X_train_lstm).sum()}")
print(f"Inf in X_train_lstm: {np.isinf(X_train_lstm).sum()}")

# Test a simple model
print("\nTesting a simple model...")
test_model = create_lstm_model(lstm_units=32, dropout_rate=0.2, learning_rate=0.001)
test_history = test_model.fit(
    X_train_lstm[:1000], y_train_resampled[:1000],  # Small sample
    validation_split=0.2,
    epochs=5,
    batch_size=32,
    verbose=1
)
print(f"\nTest AUC: {max(test_history.history['auc']):.4f}")

Checking data shapes:
X_train_lstm shape: (10167052, 1, 11)
y_train_resampled shape: (10167052,)
y_train_resampled unique values: [0 1]
y_train_resampled distribution: [5083526 5083526]

NaN in X_train_lstm: 0

NaN in X_train_lstm: 0

NaN in X_train_lstm: 0
Inf in X_train_lstm: 0

Testing a simple model...
Inf in X_train_lstm: 0

Testing a simple model...
Inf in X_train_lstm: 0

Testing a simple model...
Epoch 1/5
Epoch 1/5
Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9225 - auc: 0.7140 - loss: 0.6198 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.5407
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9225 - auc: 0.7140 - loss: 0.6198 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.5407
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9225 - auc: 0.7140 - loss: 0.6198 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - v

In [9]:
# Train final model with best parameters

# Extract parameters from best_params dictionary
lstm_units = best_params['lstm_units']
dropout_rate = best_params['dropout_rate']
learning_rate = best_params['learning_rate']
num_lstm_layers = best_params['num_lstm_layers']
bidirectional = best_params['bidirectional']
batch_size = best_params['batch_size']
epochs = 50  # Full training epochs

# Create best model
best_model = create_lstm_model(
    lstm_units=lstm_units,
    dropout_rate=dropout_rate,
    learning_rate=learning_rate,
    num_lstm_layers=num_lstm_layers,
    bidirectional=bidirectional
)

# Setup callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

# Train model
print("\n" + "="*50)
print("TRAINING FINAL MODEL")
print("="*50)

history = best_model.fit(
    X_train_lstm, y_train_resampled,
    validation_split=0.2,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1
)

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Plot training history
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Loss
axes[0].plot(history.history['loss'], label='Training Loss')
axes[0].plot(history.history['val_loss'], label='Validation Loss')
axes[0].set_title('Model Loss')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(True)

# Accuracy
axes[1].plot(history.history['accuracy'], label='Training Accuracy')
axes[1].plot(history.history['val_accuracy'], label='Validation Accuracy')
axes[1].set_title('Model Accuracy')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].legend()
axes[1].grid(True)

# AUC
axes[2].plot(history.history['auc'], label='Training AUC')
axes[2].plot(history.history['val_auc'], label='Validation AUC')
axes[2].set_title('Model AUC')
axes[2].set_xlabel('Epoch')
axes[2].set_ylabel('AUC')
axes[2].legend()
axes[2].grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Evaluate model
y_pred_proba = best_model.predict(X_test_lstm)
y_pred = (y_pred_proba > 0.5).astype(int)

print("\n" + "="*50)
print("MODEL EVALUATION ON TEST SET")
print("="*50)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# Additional metrics
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
# Save the model
best_model.save('fraud_detection_lstm_model.h5')
print("\nModel saved as 'fraud_detection_lstm_model.h5'")

# Save scaler for future use
import pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Scaler saved as 'scaler.pkl'")