In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import zipfile
import os
import pickle
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Fix for pandas pickle compatibility
#
# The pickled frames loaded later in this notebook may reference the class
# names Int64Index / UInt64Index / Float64Index under the module path
# 'pandas.core.indexes.numeric'.  When the installed pandas no longer provides
# that module, unpickling fails on the lookup.  This cell registers stand-in
# classes under the expected names so the lookup succeeds and a plain
# ``pd.Index`` is built instead.
import importlib
import sys
import types
import pandas.core.indexes.base as base


class _LegacyNumericIndex(pd.Index):
    """Shared base for the stand-in index classes.

    Constructing any subclass ignores the subclass itself and returns an
    ordinary ``pd.Index`` built from the same arguments.
    """

    def __new__(cls, data=None, dtype=None, copy=False, name=None):
        return pd.Index(data, dtype=dtype, copy=copy, name=name)


class Int64Index(_LegacyNumericIndex):
    """Stand-in registered under the legacy name ``Int64Index``."""


class UInt64Index(_LegacyNumericIndex):
    """Stand-in registered under the legacy name ``UInt64Index``."""


class Float64Index(_LegacyNumericIndex):
    """Stand-in registered under the legacy name ``Float64Index``."""


# Only install the shim when the real module is unavailable; otherwise the
# installed pandas can already resolve these names and must not be overridden.
try:
    importlib.import_module('pandas.core.indexes.numeric')
except ImportError:
    # Use a genuine module object so anything inspecting sys.modules sees a
    # module rather than an arbitrary class instance.
    numeric_shim = types.ModuleType('pandas.core.indexes.numeric')
    for legacy_cls in (Int64Index, UInt64Index, Float64Index):
        setattr(numeric_shim, legacy_cls.__name__, legacy_cls)
        # Also expose the names on the base index module.
        setattr(base, legacy_cls.__name__, legacy_cls)
    sys.modules['pandas.core.indexes.numeric'] = numeric_shim


In [None]:
# Extract dataset
with zipfile.ZipFile('dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load all PKL files (sorted by file name so the load order is deterministic)
data_folder = 'data'
all_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.pkl'))

print(f"Found {len(all_files)} PKL files")

# Fail early with an actionable message instead of letting pd.concat raise
# its generic "No objects to concatenate" error further down.
if not all_files:
    raise ValueError(
        f"No .pkl files found in '{data_folder}' after extracting dataset.zip"
    )

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only run this cell on archives obtained from a trusted source.
df_list = []
for file in all_files:
    file_path = os.path.join(data_folder, file)
    with open(file_path, 'rb') as f:
        df_list.append(pickle.load(f))

# Stack the per-file frames into one table with a fresh 0..n-1 index
df = pd.concat(df_list, ignore_index=True)
print(f"Total transactions: {len(df)}")
print(f"Fraud transactions: {df['TX_FRAUD'].sum()} ({df['TX_FRAUD'].mean()*100:.2f}%)")


In [None]:
# Parse the transaction timestamp, then order the rows by it and renumber
# the index from zero.
tx_timestamps = pd.to_datetime(df['TX_DATETIME'])
df = (
    df.assign(TX_DATETIME=tx_timestamps)
      .sort_values('TX_DATETIME')
      .reset_index(drop=True)
)

# Derive calendar/time-of-day columns from the parsed timestamp.
tx_time = df['TX_DATETIME'].dt
time_components = (
    ('TX_HOUR', 'hour'),
    ('TX_DAY', 'day'),
    ('TX_WEEKDAY', 'weekday'),
    ('TX_MONTH', 'month'),
)
for column, component in time_components:
    df[column] = getattr(tx_time, component)


In [None]:
# Amount above which a transaction is flagged as "high" (scenario 1)
HIGH_AMOUNT_THRESHOLD = 220

# Make the cell safe to re-run: drop columns produced by a previous execution
# so the merges below do not create '_x' / '_y' suffixed duplicates.
derived_columns = [
    'CUST_AVG', 'CUST_STD', 'CUST_MAX', 'CUST_MIN', 'CUST_TX_COUNT',
    'TERMINAL_FRAUD_RATE', 'TERMINAL_AVG_AMOUNT', 'TERMINAL_TX_COUNT',
]
df = df.drop(columns=derived_columns, errors='ignore')

# Customer behavior features
# NOTE(review): these aggregates use each customer's full history, including
# transactions later than the row being scored. They do not involve the
# target, but consider time-aware versions if strict causality is required.
customer_stats = (
    df.groupby('CUSTOMER_ID')
      .agg(
          CUST_AVG=('TX_AMOUNT', 'mean'),
          CUST_STD=('TX_AMOUNT', 'std'),
          CUST_MAX=('TX_AMOUNT', 'max'),
          CUST_MIN=('TX_AMOUNT', 'min'),
          CUST_TX_COUNT=('TX_AMOUNT', 'count'),
      )
      .reset_index()
)
df = df.merge(customer_stats, on='CUSTOMER_ID', how='left')

# The +1 in the denominators avoids division by zero
df['AMOUNT_VS_AVG'] = df['TX_AMOUNT'] / (df['CUST_AVG'] + 1)
df['AMOUNT_VS_MAX'] = df['TX_AMOUNT'] / (df['CUST_MAX'] + 1)

# Terminal volume features (no target involved)
terminal_stats = (
    df.groupby('TERMINAL_ID')
      .agg(
          TERMINAL_AVG_AMOUNT=('TX_AMOUNT', 'mean'),
          TERMINAL_TX_COUNT=('CUSTOMER_ID', 'count'),
      )
      .reset_index()
)
df = df.merge(terminal_stats, on='TERMINAL_ID', how='left')

# Terminal risk feature.
# Averaging TX_FRAUD over the whole dataset would leak the target: every row
# would see its own label and the labels of rows that end up in the test set.
# Instead, use only transactions that happened EARLIER on the same terminal.
# The stable sort guarantees chronological order without reshuffling rows
# that are already ordered.
# NOTE(review): this assumes fraud labels of earlier transactions are known
# when a later transaction is scored - confirm the labelling delay.
df = df.sort_values('TX_DATETIME', kind='stable').reset_index(drop=True)
terminal_fraud = df.groupby('TERMINAL_ID')['TX_FRAUD']
prior_tx_count = terminal_fraud.cumcount()                    # earlier tx on this terminal
prior_fraud_count = terminal_fraud.cumsum() - df['TX_FRAUD']  # exclude the current row
df['TERMINAL_FRAUD_RATE'] = (
    prior_fraud_count / prior_tx_count.where(prior_tx_count > 0)
).fillna(0.0)  # no history yet -> rate 0

# High amount flag (scenario 1)
df['HIGH_AMOUNT_FLAG'] = (df['TX_AMOUNT'] > HIGH_AMOUNT_THRESHOLD).astype(int)


In [None]:
# Model inputs, grouped by where each column comes from. The concatenation
# order below is the column order of the training matrix.
amount_and_time_features = ['TX_AMOUNT', 'TX_HOUR', 'TX_DAY', 'TX_WEEKDAY', 'TX_MONTH']
customer_features = ['CUST_AVG', 'CUST_STD', 'CUST_MAX', 'CUST_MIN', 'CUST_TX_COUNT']
ratio_features = ['AMOUNT_VS_AVG', 'AMOUNT_VS_MAX']
terminal_features = ['TERMINAL_FRAUD_RATE', 'TERMINAL_AVG_AMOUNT', 'TERMINAL_TX_COUNT']
flag_features = ['HIGH_AMOUNT_FLAG']

features = (
    amount_and_time_features
    + customer_features
    + ratio_features
    + terminal_features
    + flag_features
)

# Missing feature values are replaced by 0 before modelling.
X = df[features].fillna(0)
y = df['TX_FRAUD']

# Hold out 20% of the rows, keeping the class ratio equal in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Fit the random forest with class weighting set to 'balanced'.
print("\nTraining model...")
rf_params = {
    'n_estimators': 100,
    'max_depth': 12,
    'min_samples_split': 10,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Hard class predictions, plus the probability taken from column 1 of
# predict_proba.
y_pred = model.predict(X_test)
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]


In [None]:
def _print_banner(title):
    """Print a blank line, then ``title`` framed by two 50-character '=' rules."""
    rule = "=" * 50
    print("\n" + rule)
    print(title)
    print(rule)


# Headline metrics on the held-out set
_print_banner("MODEL PERFORMANCE")

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
class_names = ['Legitimate', 'Fraud']
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"\nTrue Negatives: {cm[0,0]}")
print(f"False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]}")
print(f"True Positives: {cm[1,1]}")

# Importance of each input column, largest first
_print_banner("FEATURE IMPORTANCE")
importance_table = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('Importance', ascending=False)
print(feature_importance.to_string(index=False))