In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Define paths to the datasets
data1_path = '../data/TransactionDataset1.csv'
data2_path = '../data/credit-debit dataset.csv'

# --- Load Data with Error Handling ---
# Both files are required by every statement below. If one is missing we print
# the hint for the reader and then re-raise, so the cell stops here instead of
# continuing and failing later with a NameError on `data1` / `data2`.
try:
    data1 = pd.read_csv(data1_path)
    data2 = pd.read_csv(data2_path)
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please make sure you have run the notebooks in the '01_data_generation' folder first.")
    raise
print("Datasets loaded successfully.")

# --- Robust Merging of Datasets ---
# Give the key column the same name ('user_id') in both frames.
data2 = data2.rename(columns={'User ID': 'user_id'})

# Left merge: every row of data1 is kept and the matching data2 columns are
# attached by user_id.
combined_data = pd.merge(data1, data2, on='user_id', how='left')

print(f"Merged data shape: {combined_data.shape}")

Datasets loaded successfully.
Merged data shape: (20000, 31)


In [3]:
def preprocess_data(data):
    """Build the modelling frame (selected features plus target) from merged data.

    The target is taken from 'Fraud Indicator' when that column exists, with
    missing values filled from 'fraud_indicator' if both columns are present;
    otherwise 'fraud_indicator' is used as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged transaction frame. It must contain every feature column listed
        below and at least one of the two fraud-indicator columns.
        The frame is NOT modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the 16 feature columns followed by the target
        column 'fraud_indicator'. Rows containing any null are dropped and
        every object-dtype column is replaced by integer label codes.

    Raises
    ------
    KeyError
        If a feature column or both fraud-indicator columns are missing.
    """
    # Resolve the target into a local Series instead of writing a helper
    # column into the caller's frame.
    if 'Fraud Indicator' in data.columns and 'fraud_indicator' in data.columns:
        # Prefer 'Fraud Indicator'; fall back to 'fraud_indicator' where it is NaN.
        target = data['Fraud Indicator'].fillna(data['fraud_indicator'])
    elif 'Fraud Indicator' in data.columns:
        target = data['Fraud Indicator']
    else:
        target = data['fraud_indicator']

    # Features that are relevant for the model
    feature_columns = [
        'age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount',
        'home_branch', 'transaction_location', 'transaction_method',
        'transaction_category', 'transaction_merchant', 'transaction_time',
        'average_expenditure', 'comparison_with_avg_expenditure',
        'transaction_count_7_days', 'suspicion_indicator',
        'Total Credit Amount', 'Transaction Amount',
    ]

    # Work on an independent copy; the target goes last under its standard name.
    data_subset = data[feature_columns].copy()
    data_subset['fraud_indicator'] = target

    # Handle missing values by dropping rows with any nulls
    data_subset.dropna(inplace=True)

    # Label encode all categorical (object) columns. Each column gets its own
    # encoder; values are cast to str first so mixed-type columns sort cleanly.
    for col in data_subset.select_dtypes(include=['object']).columns:
        data_subset[col] = LabelEncoder().fit_transform(data_subset[col].astype(str))

    return data_subset

# Run the preprocessing function on the merged frame.
preprocessed_data = preprocess_data(combined_data)

# Target vector (y) and feature matrix (X = every column except the target).
y = preprocessed_data['fraud_indicator']
X = preprocessed_data.drop(columns='fraud_indicator')

# Summarise what will be fed to the model.
feature_names = X.columns.to_list()
class_counts = y.value_counts()
print("Preprocessing complete.")
print(f"Features for model: {feature_names}")
print(f"Target distribution:\n{class_counts}")

Preprocessing complete.
Features for model: ['age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount', 'home_branch', 'transaction_location', 'transaction_method', 'transaction_category', 'transaction_merchant', 'transaction_time', 'average_expenditure', 'comparison_with_avg_expenditure', 'transaction_count_7_days', 'suspicion_indicator', 'Total Credit Amount', 'Transaction Amount']
Target distribution:
fraud_indicator
0    15853
1     4147
Name: count, dtype: int64


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Mean imputation: the fill values are learned from the training rows and then
# reused unchanged on the test rows.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation (zero mean, unit variance), also fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# --- Model Training ---
# 100 trees keeps local training fast, n_jobs=-1 uses every CPU core, and
# class_weight='balanced' reweights the classes by their frequency.
forest_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_settings)

print("Training the RandomForest model...")
model.fit(X_train_scaled, y_train)
print("Model training complete.")

Training the RandomForest model...
Model training complete.


In [5]:
# Score the held-out rows with the fitted forest.
y_pred = model.predict(X_test_scaled)

# Print the confusion matrix, then the per-class precision / recall / F1 table.
print("\n--- Model Evaluation ---")
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))


--- Model Evaluation ---

Confusion Matrix:
[[2960  191]
 [ 530  319]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      3151
           1       0.63      0.38      0.47       849

    accuracy                           0.82      4000
   macro avg       0.74      0.66      0.68      4000
weighted avg       0.80      0.82      0.80      4000



In [6]:
# --- Create and Save the Model and SHAP Explainer (USP Integration) ---

# This is the key step of Phase 2. We create a SHAP explainer object
# and save it along with our trained model and scaler.
# NOTE(review): the fitted `imputer` from the training cell is not part of the
# bundle; confirm that whatever loads this file does not need it.

print("\n--- Starting Artifact Creation ---")
print("This step can take a few minutes depending on your computer's performance.")

try:
    # 1. Create the SHAP Explainer
    # We use TreeExplainer for tree-based models like Random Forest.
    print("\nStep 1: Creating SHAP explainer... (This may take a moment)")
    explainer = shap.TreeExplainer(model)
    print("SHAP explainer created successfully.")

    # 2. Bundle all artifacts into a single dictionary
    print("\nStep 2: Bundling model, scaler, and explainer...")
    artifacts = {
        'model': model,
        'scaler': scaler,
        'explainer': explainer,
        'features': X.columns.tolist()
    }
    print("Artifacts bundled successfully.")

    # 3. Define the file path for saving
    model_path = '../models/fraud_detection_artifacts.pkl'
    print(f"\nStep 3: Preparing to save artifacts to {model_path}")

    # Make sure the target folder exists; joblib.dump does not create it.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # 4. Save the dictionary to a .pkl file using joblib
    joblib.dump(artifacts, model_path)
    print("\nSUCCESS: Model, scaler, and SHAP explainer have been saved.")
    print("---------------------------------------------------------")

except Exception as e:
    # Report the problem, then re-raise so the cell is marked as failed and
    # the full traceback is available instead of being silently swallowed.
    print(f"\nAn error occurred during artifact creation: {e}")
    raise


--- Starting Artifact Creation ---
This step can take a few minutes depending on your computer's performance.

Step 1: Creating SHAP explainer... (This may take a moment)
SHAP explainer created successfully.

Step 2: Bundling model, scaler, and explainer...
Artifacts bundled successfully.

Step 3: Preparing to save artifacts to ../models/fraud_detection_artifacts.pkl

SUCCESS: Model, scaler, and SHAP explainer have been saved.
---------------------------------------------------------


In [1]:
# ==============================================================================
# Cell 1: Imports
# ==============================================================================
import pandas as pd
import numpy as np
import joblib
import os
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# ==============================================================================
# Cell 2: Load and Merge Data
# ==============================================================================
print("--- Starting Phase 2: Model Training and Debugging ---")

# Locations of the two source CSV files
data1_path = '../data/TransactionDataset1.csv'
data2_path = '../data/credit-debit dataset.csv'

# Read both files
data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path)
print("Datasets loaded successfully.")

# Align the key column name, then left-join data2 onto data1 by user_id
data2 = data2.rename(columns={'User ID': 'user_id'})
combined_data = data1.merge(data2, on='user_id', how='left')
print(f"Merged data shape: {combined_data.shape}")

# --- DEBUG CHECK 1: class counts immediately after the merge ---
print("\n[DEBUG 1] Fraud distribution in RAW MERGED data:")
if 'Fraud Indicator' not in combined_data.columns:
    raw_fraud_counts = combined_data['fraud_indicator'].value_counts(dropna=False)
else:
    # Same combination rule the preprocessing step uses: prefer
    # 'Fraud Indicator', fall back to 'fraud_indicator' where it is NaN.
    merged_target = combined_data['Fraud Indicator'].fillna(combined_data['fraud_indicator'])
    raw_fraud_counts = merged_target.value_counts(dropna=False)
print(raw_fraud_counts)
print("-" * 50)


# ==============================================================================
# Cell 3: Preprocess Data
# ==============================================================================
def preprocess_data(data):
    """Build the modelling frame from merged data, printing debug class counts.

    The target is taken from 'Fraud Indicator' when that column exists, with
    missing values filled from 'fraud_indicator' if both columns are present;
    otherwise 'fraud_indicator' is used as-is. The class distribution and
    frame shape are printed before and after rows with nulls are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged transaction frame. It must contain every feature column listed
        below and at least one of the two fraud-indicator columns.
        The frame is NOT modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the 16 feature columns followed by the target
        column 'fraud_indicator', without null rows and with every
        object-dtype column replaced by integer label codes.

    Raises
    ------
    KeyError
        If a feature column or both fraud-indicator columns are missing.
    """
    # Resolve the target into a local Series instead of writing a helper
    # column into the caller's frame.
    if 'Fraud Indicator' in data.columns and 'fraud_indicator' in data.columns:
        target = data['Fraud Indicator'].fillna(data['fraud_indicator'])
    elif 'Fraud Indicator' in data.columns:
        target = data['Fraud Indicator']
    else:
        target = data['fraud_indicator']

    # Features that are relevant for the model
    feature_columns = [
        'age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount',
        'home_branch', 'transaction_location', 'transaction_method',
        'transaction_category', 'transaction_merchant', 'transaction_time',
        'average_expenditure', 'comparison_with_avg_expenditure',
        'transaction_count_7_days', 'suspicion_indicator',
        'Total Credit Amount', 'Transaction Amount',
    ]

    # Independent copy of the features; the target goes last under its standard name.
    data_subset = data[feature_columns].copy()
    data_subset['fraud_indicator'] = target

    # --- DEBUG CHECK 2: Check fraud distribution BEFORE dropping missing values ---
    print("\n[DEBUG 2] Fraud distribution BEFORE dropna():")
    print(data_subset['fraud_indicator'].value_counts(dropna=False))
    print(f"Shape before dropna: {data_subset.shape}")

    # Handle missing values by dropping rows with any nulls
    data_subset.dropna(inplace=True)

    # --- DEBUG CHECK 3: Check fraud distribution AFTER dropping missing values ---
    print("\n[DEBUG 3] Fraud distribution AFTER dropna():")
    print(data_subset['fraud_indicator'].value_counts(dropna=False))
    print(f"Shape after dropna: {data_subset.shape}")
    print("-" * 50)

    # Label encode all categorical (object) columns, one fresh encoder per column.
    for col in data_subset.select_dtypes(include=['object']).columns:
        data_subset[col] = LabelEncoder().fit_transform(data_subset[col].astype(str))

    return data_subset

preprocessed_data = preprocess_data(combined_data)

# Target vector first, then the feature matrix (every column except the target)
y = preprocessed_data['fraud_indicator']
X = preprocessed_data.drop(columns='fraud_indicator')

# ==============================================================================
# Cell 4: Train the Machine Learning Model
# ==============================================================================
# 80/20 hold-out split with a fixed seed; stratify=y keeps the class ratio of y
# in both the training and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- DEBUG CHECK 4: class counts the model will actually be fitted on ---
print("\n[DEBUG 4] Fraud distribution in the FINAL TRAINING data (y_train):")
print(y_train.value_counts())
print("-" * 50)

# Mean imputation, learned on the training rows and reused on the test rows
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation, likewise fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Random forest: 100 trees, all CPU cores, classes reweighted by frequency
forest_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_settings)

print("\nTraining the RandomForest model...")
model.fit(X_train_scaled, y_train)
print("Model training complete.")

# ==============================================================================
# Cell 5: Evaluate the Model
# ==============================================================================
# Predict on the held-out rows and print the per-class metrics table.
y_pred = model.predict(X_test_scaled)
report_text = classification_report(y_test, y_pred)
print("\n--- Model Evaluation ---")
print("\nClassification Report:")
print(report_text)

# ==============================================================================
# Cell 6: Create and Save Artifacts
# ==============================================================================
print("\n--- Starting Artifact Creation ---")

# TreeExplainer is SHAP's explainer for tree-based models such as this forest.
explainer = shap.TreeExplainer(model)

# Everything that gets persisted together in one file.
# NOTE(review): the fitted `imputer` is not part of the bundle; confirm that
# whatever loads this file does not need it.
artifacts = { 'model': model, 'scaler': scaler, 'explainer': explainer, 'features': X.columns.tolist() }

model_path = '../models/fraud_detection_artifacts.pkl'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(artifacts, model_path)
print(f"SUCCESS: Artifacts saved to: {model_path}")
print("-" * 50)

--- Starting Phase 2: Model Training and Debugging ---
Datasets loaded successfully.
Merged data shape: (20000, 31)

[DEBUG 1] Fraud distribution in RAW MERGED data:
Fraud Indicator
0    15853
1     4147
Name: count, dtype: int64
--------------------------------------------------

[DEBUG 2] Fraud distribution BEFORE dropna():
fraud_indicator
0    15853
1     4147
Name: count, dtype: int64
Shape before dropna: (20000, 17)

[DEBUG 3] Fraud distribution AFTER dropna():
fraud_indicator
0    15853
1     4147
Name: count, dtype: int64
Shape after dropna: (20000, 17)
--------------------------------------------------

[DEBUG 4] Fraud distribution in the FINAL TRAINING data (y_train):
fraud_indicator
0    12682
1     3318
Name: count, dtype: int64
--------------------------------------------------

Training the RandomForest model...
Model training complete.

--- Model Evaluation ---

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.94

In [None]:

import pandas as pd
import numpy as np
import joblib
import os
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# ==============================================================================
# Cell 2: Load and Merge Data
# ==============================================================================
print("--- Starting Robust Model Training ---")

# Read both source files.
data1 = pd.read_csv('../data/TransactionDataset1.csv')
data2 = pd.read_csv('../data/credit-debit dataset.csv')

# Align the key column name, then left-join data2 onto data1 by user_id.
data2 = data2.rename(columns={'User ID': 'user_id'})
combined_data = data1.merge(data2, on='user_id', how='left')
print(f"Merged data shape: {combined_data.shape}")

# ==============================================================================
# Cell 3: Preprocess Data (The Robust Way)
# ==============================================================================
# Resolve the target into the single column 'fraud_indicator', using the same
# three-way rule as the earlier preprocess_data() versions:
#   both columns present   -> 'Fraud Indicator', NaNs filled from 'fraud_indicator'
#   only 'Fraud Indicator' -> use it directly (previously a KeyError)
#   only 'fraud_indicator' -> already the target, nothing to do
if 'Fraud Indicator' in combined_data.columns:
    if 'fraud_indicator' in combined_data.columns:
        combined_data['fraud_indicator'] = combined_data['Fraud Indicator'].fillna(combined_data['fraud_indicator'])
    else:
        combined_data['fraud_indicator'] = combined_data['Fraud Indicator']

# Select all features and the final target
all_features = [
    'age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount',
    'home_branch', 'transaction_location', 'transaction_method',
    'transaction_category', 'transaction_merchant', 'transaction_time',
    'average_expenditure', 'comparison_with_avg_expenditure',
    'transaction_count_7_days', 'suspicion_indicator',
    'Total Credit Amount', 'Transaction Amount', 'fraud_indicator'
]
data_subset = combined_data[all_features].copy()

# --- Separate categorical and numerical features BEFORE imputing ---
# The target is removed from the numerical set so it is never imputed.
categorical_cols = data_subset.select_dtypes(include=['object']).columns
numerical_cols = data_subset.select_dtypes(include=np.number).drop('fraud_indicator', axis=1).columns

# --- IMPUTATION ---
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split performed in the next section, so test-set statistics
# influence the fill values. Consider fitting them on the training rows only.
# Each imputer is skipped when its column group is empty, because
# SimpleImputer cannot be fitted on zero columns.
num_imputer = SimpleImputer(strategy='mean')
if len(numerical_cols) > 0:
    # Numerical columns: fill with the column mean
    data_subset[numerical_cols] = num_imputer.fit_transform(data_subset[numerical_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
if len(categorical_cols) > 0:
    # Categorical columns: fill with the most frequent value
    data_subset[categorical_cols] = cat_imputer.fit_transform(data_subset[categorical_cols])

# --- Label Encode Categorical Features ---
# The single encoder is refitted on every column, so after the loop it only
# remembers the classes of the last column.
label_encoder = LabelEncoder()
for col in categorical_cols:
    data_subset[col] = label_encoder.fit_transform(data_subset[col])

print("Data preprocessing and imputation complete.")
print("\n--- SANITY CHECK: Final Fraud Distribution ---")
print(data_subset['fraud_indicator'].value_counts())
print("-" * 50)


# Separate final features (X) and target (y)
X = data_subset.drop('fraud_indicator', axis=1)
y = data_subset['fraud_indicator']

# ==============================================================================
# Cell 4: Train and Evaluate the Model
# ==============================================================================
# Stratified 80/20 hold-out split with a fixed seed, so both parts keep the
# class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest: 100 trees, all CPU cores, classes reweighted by frequency
forest_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_settings)
print("Training the RandomForest model...")
model.fit(X_train_scaled, y_train)
print("Model training complete.")

# Per-class metrics on the held-out rows
y_pred = model.predict(X_test_scaled)
report_text = classification_report(y_test, y_pred)
print("\n--- Model Evaluation ---")
print(report_text)

# ==============================================================================
# Cell 5: Create and Save Final Artifacts
# ==============================================================================
print("\n--- Creating and Saving Final Artifacts ---")

# TreeExplainer is SHAP's explainer for tree-based models such as this forest.
explainer = shap.TreeExplainer(model)

# Everything that gets persisted together in one file.
# NOTE(review): the imputers and label encoder fitted during preprocessing are
# not part of the bundle; confirm that whatever loads this file can reproduce
# that preprocessing without them.
artifacts = { 'model': model, 'scaler': scaler, 'explainer': explainer, 'features': X.columns.tolist() }

model_path = '../models/fraud_detection_artifacts.pkl'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(artifacts, model_path)
print(f"SUCCESS: New artifacts saved to: {model_path}")
print("-" * 50)

--- Starting Robust Model Training ---
Merged data shape: (20000, 31)
Data preprocessing and imputation complete.

--- SANITY CHECK: Final Fraud Distribution ---
fraud_indicator
0    15853
1     4147
Name: count, dtype: int64
--------------------------------------------------
Training the RandomForest model...
Model training complete.

--- Model Evaluation ---
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      3171
           1       0.62      0.36      0.46       829

    accuracy                           0.82      4000
   macro avg       0.74      0.65      0.68      4000
weighted avg       0.80      0.82      0.80      4000


--- Creating and Saving Final Artifacts ---
SUCCESS: New artifacts saved to: ../models/fraud_detection_artifacts.pkl
--------------------------------------------------


In [3]:
# ==============================================================================
# Cell 1: Imports
# ==============================================================================
import pandas as pd
import numpy as np
import joblib
import os
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# ==============================================================================
# Cell 2: Load ONE Dataset
# ==============================================================================
print("--- Starting SIMPLIFIED Model Training ---")

# Only the first transaction file is used in this simplified run.
data_path = '../data/TransactionDataset1.csv'
data = pd.read_csv(data_path)
print("Dataset 'TransactionDataset1.csv' loaded successfully.")

# Class counts of the raw target, before any preprocessing.
raw_class_counts = data['fraud_indicator'].value_counts()
print("\n--- SANITY CHECK: Fraud Distribution in Raw Data ---")
print(raw_class_counts)
print("-" * 50)

# ==============================================================================
# Cell 3: Preprocess Data
# ==============================================================================
# Columns left out of this simplified model
cols_to_drop = [
    'user_id', 'name', 'addresses', 'email_address', 'transaction_id',
    'transaction_date', 'home_branch', 'transaction_location', 'transaction_merchant',
]
data.drop(columns=cols_to_drop, inplace=True)

# Integer-encode every remaining object-dtype column, one fresh encoder per column
object_cols = data.select_dtypes(include=['object']).columns
for col in object_cols:
    data[col] = LabelEncoder().fit_transform(data[col])

# Target vector, then the feature matrix (every column except the target)
y = data['fraud_indicator']
X = data.drop(columns='fraud_indicator')

# Mean-impute any missing feature values; the imputer returns an array, so it
# is wrapped back into a DataFrame carrying the original column names.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

print("Preprocessing Complete.")

# ==============================================================================
# Cell 4: Train and Evaluate
# ==============================================================================
# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest: 100 trees, classes reweighted by frequency
forest_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train_scaled, y_train)

# Show which class labels the fitted model knows about before it is saved.
print("\n--- PROOF BEFORE SAVING ---")
print("Classes the new model was trained on:", model.classes_)
print("-" * 50)

# ==============================================================================
# Cell 5: Save the Correct Artifacts
# ==============================================================================
# TreeExplainer is SHAP's explainer for tree-based models such as this forest.
explainer = shap.TreeExplainer(model)

# Everything that gets persisted together in one file.
# NOTE(review): the fitted `imputer` and the per-column label encoders are not
# part of the bundle; confirm that whatever loads this file does not need them.
artifacts = { 'model': model, 'scaler': scaler, 'explainer': explainer, 'features': X.columns.tolist() }

model_path = '../models/fraud_detection_artifacts.pkl'
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(artifacts, model_path)
print(f"\nSUCCESS: New, simplified artifacts saved to: {model_path}")

--- Starting SIMPLIFIED Model Training ---
Dataset 'TransactionDataset1.csv' loaded successfully.

--- SANITY CHECK: Fraud Distribution in Raw Data ---
fraud_indicator
0    12692
1     7308
Name: count, dtype: int64
--------------------------------------------------
Preprocessing Complete.

--- PROOF BEFORE SAVING ---
Classes the new model was trained on: [0 1]
--------------------------------------------------

SUCCESS: New, simplified artifacts saved to: ../models/fraud_detection_artifacts.pkl
