In [None]:
# Install the third-party packages this notebook relies on.
# `%pip` (rather than `!pip`) installs into the environment backing the current kernel.
# NOTE(review): no versions are pinned, so a fresh install may pull different releases
# over time — consider pinning (e.g. `pkg==x.y.z`) once the working versions are known.
%pip install pandas numpy matplotlib seaborn xgboost scikit-learn joblib

# Primary Dataset Analysis: Online Payments Fraud

## Objectives
- Load the online payments dataset
- Engineer dataset-specific features (balance difference)
- Visualize the fraud distribution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None  # no cap on the number of columns shown for a DataFrame

# Apply the seaborn theme first, then set the default figure size on top of it
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Read the transactions CSV into `df`, the frame the later cells work from
csv_path = '../data/onlinefraud.csv'
df = pd.read_csv(csv_path)

# Print the (rows, columns) tuple, then show the first five rows as the cell output
print(df.shape)
df.head(n=5)

In [None]:
# Class balance of the `isFraud` column: absolute counts first, then proportions
fraud_flag = df['isFraud']
for as_share in (False, True):
    print(fraud_flag.value_counts(normalize=as_share))

In [None]:
# Transaction type analysis: fraud vs. non-fraud counts per type
# Keep four transaction types for the chart; rows with any other `type` value are left out.
# (The original note expects fraud mostly in TRANSFER and CASH_OUT; PAYMENT and CASH_IN
# are also kept by this filter, so they appear in the chart as well.)
subset = df[df['type'].isin(['CASH_OUT', 'TRANSFER', 'PAYMENT', 'CASH_IN'])]

# Convert the flag to readable labels *before* plotting so the legend is built from the
# data itself. Relabelling afterwards with plt.legend(labels=[...]) ties the labels to
# whichever artists matplotlib finds first on the axes, which can pair "Safe"/"Fraud"
# with the wrong colours.
# NOTE(review): treats isFraud == 1 as fraud and everything else as safe — confirm coding.
status = subset['isFraud'].eq(1).map({False: 'Safe', True: 'Fraud'})
plot_data = subset.assign(Status=status)

fig, ax = plt.subplots(figsize=(10, 6))

# Explicit hue order and a label-keyed palette: Safe=Blue, Fraud=Red
sns.countplot(
    data=plot_data,
    x='type',
    hue='Status',
    hue_order=['Safe', 'Fraud'],
    palette={'Safe': '#3b82f6', 'Fraud': '#ef4444'},
    ax=ax,
)

ax.set_yscale('log')  # Important to see the small Red bars
ax.set_title('Distribution of Fraud across Transaction Types (Log Scale)', fontsize=16)
ax.set_xlabel("Transaction Type", fontsize=12)
ax.set_ylabel("Count (Log Scale)", fontsize=12)

# Retitle the legend seaborn already created instead of rebuilding it
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Status')

plt.show()

In [None]:
# Insights: sum the `isFraud` column within each transaction type
# (answers, among other things, whether any fraud shows up under PAYMENT)
fraud_by_type = df['isFraud'].groupby(df['type']).sum()

print("Fraud counts by Type:")
print(fraud_by_type)
# The per-type totals indicate which transaction types the model should focus on