# Fraud Detection Model Training

This notebook loads the dataset, preprocesses it, trains a Random Forest model, analyzes feature importance, and saves the model and feature columns for deployment. Visualizations are included for data exploration and feature importance.

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
import os

## 1. Load Data

In [2]:
df = pd.read_csv('Dataset.csv')
print('Initial shape:', df.shape)
df.head()

Initial shape: (535178, 10)


Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,11,'C1760492708','1','F','28007','M1823072687','28007','es_transportation',52.35,0
1,172,'C1222389110','3','M','28007','M1823072687','28007','es_transportation',14.75,0
2,106,'C879949041','3','F','28007','M1823072687','28007','es_transportation',3.73,0
3,86,'C409740988','5','M','28007','M1823072687','28007','es_transportation',61.42,0
4,152,'C671449181','3','F','28007','M692898500','28007','es_health',9.05,0


## 2. Data Cleaning & Preprocessing

In [3]:
# Clean columns
df.columns = df.columns.str.strip()
df.drop(['zipcodeOri', 'zipMerchant', 'customer'], axis=1, inplace=True)

# Fill missing values
df['gender'] = df['gender'].replace(['E', 'U'], np.nan)
df['gender'] = df['gender'].map({'M': 1, 'F': 0})
gender_mode = df['gender'].mode()[0] if not df['gender'].mode().empty else 1
df['gender'] = df['gender'].fillna(gender_mode)

for col in ['step', 'age', 'amount']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    median = df[col].median()
    df[col] = df[col].fillna(median)

df['merchant'] = df['merchant'].fillna('unknown')
df['category'] = df['category'].fillna('unknown')

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### Class Distribution

In [None]:
plt.figure(figsize=(5,3))
sns.countplot(x='fraud', data=df)
plt.title('Class Distribution (fraud)')
plt.show()

## 3. One-hot Encoding

In [None]:
df = pd.get_dummies(df, columns=['merchant', 'category'], drop_first=True)
df.head()

## 4. Train/Test Split

In [None]:
X = df.drop('fraud', axis=1)
y = df['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

## 5. Train Random Forest Model

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

## 6. Feature Importance

In [None]:
importances = rf.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)
importance_df.head(10)

In [None]:
# Plot feature importances
plt.figure(figsize=(10,5))
sns.barplot(x='importance', y='feature', data=importance_df.head(15))
plt.title('Top 15 Feature Importances')
plt.tight_layout()
plt.show()

## 7. Select Important Features and Retrain

In [None]:
# Keep features with >1% importance, or all if all are important
threshold = 0.01
selected_features = importance_df[importance_df['importance'] > threshold]['feature'].tolist()
if len(selected_features) == 0:
    selected_features = feature_names.tolist()
print('Selected features:', selected_features)

X_selected = X[selected_features]
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_sel, y_train_sel)
print(classification_report(y_test_sel, model.predict(X_test_sel)))

## 8. Save Model and Feature Columns

In [None]:
os.makedirs('model', exist_ok=True)
with open('model/fraud_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('model/feature_columns.pkl', 'wb') as f:
    pickle.dump(selected_features, f)
print('Model and feature columns saved!')