In [None]:
# Install the third-party packages for this notebook. The %pip magic (as opposed
# to !pip) installs into the environment of the kernel that is currently running.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases —
# consider pinning (e.g. pkg==x.y.z) for reproducibility.
# NOTE(review): xgboost, scikit-learn and joblib are not imported in the cells
# visible here; presumably they are needed by later cells — confirm.
%pip install pandas numpy matplotlib seaborn xgboost scikit-learn joblib

# Secondary Dataset Analysis: Credit Card Fraud (Sparkov)

## Objectives
- Load Credit Card Transaction data
- Geospatial Analysis (Lat/Long)
- Demographic Analysis (Age/Job)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import radians, cos, sin, asin, sqrt

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None      # never truncate columns when showing a frame
sns.set_theme(style="whitegrid")           # seaborn theme with a light grid
plt.rc('figure', figsize=(10, 6))          # default figure size (inches) for later plots

In [None]:
# Read the transactions CSV. The test split is used here because it is the
# smaller file; point csv_path at the train split instead if preferred.
csv_path = '../data/fraudTest.csv'
df = pd.read_csv(csv_path)

# Optional speed-up for plotting: uncomment to work on a fixed 100k-row sample.
# df = df.sample(100000, random_state=42)

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Geospatial Feature Engineering
def haversine(lon1, lat1, lon2, lat2, radius=6371.0):
    """Great-circle distance between two points given as (lon, lat) in decimal degrees.

    The arguments may be plain numbers, NumPy arrays or pandas Series; array-like
    inputs are combined element-wise following NumPy broadcasting rules, so whole
    coordinate columns can be passed in a single call.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. The default
        6371 is the Earth's mean radius in kilometres (use 3956 for miles).

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, in the unit of ``radius``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin fail; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius

# Distance from the cardholder's location to the merchant for every transaction.
# Iterating over the four coordinate columns with zip avoids DataFrame.apply(axis=1),
# which has to build a Series object for each row before calling the function.
df['dist_to_merch'] = [
    haversine(lon, lat, merch_lon, merch_lat)
    for lon, lat, merch_lon, merch_lat in zip(
        df['long'], df['lat'], df['merch_long'], df['merch_lat']
    )
]

fig, ax = plt.subplots(figsize=(10, 6))
# showfliers=False removes the extreme outliers to make the box/median visible.
# hue mirrors x so the two palette colours map to the two classes; seaborn >= 0.13
# deprecates passing `palette` without `hue`, and legend=False hides the redundant legend.
sns.boxplot(data=df, x='is_fraud', y='dist_to_merch', hue='is_fraud', legend=False,
            showfliers=False, palette=['#3b82f6', '#ef4444'], ax=ax)
ax.set_title('Distance to Merchant (Zoomed - No Outliers)', fontsize=16)
ax.set_ylabel('Distance (km)')
# Replace the raw 0/1 tick labels with readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Normal', 'Fraud'])
plt.show()

In [None]:
# Age Analysis
# Age is approximated from the birth year only (month/day are ignored), relative
# to a fixed reference year.
REFERENCE_YEAR = 2020
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = REFERENCE_YEAR - df['dob'].dt.year  # Approx age in REFERENCE_YEAR

fig, ax = plt.subplots(figsize=(12, 6))
# KDE + Hist. common_norm=False only has an effect with a normalised statistic:
# with the default stat='count' the bars scale with class size, so the smaller
# class would be flattened. stat='density' normalises each class independently,
# which is what allows the two shapes to be compared.
sns.histplot(data=df, x='age', hue='is_fraud', stat='density', common_norm=False, kde=True,
             palette=['#3b82f6', '#ef4444'], element="step", alpha=0.3, ax=ax)

ax.set_title('Age Distribution: Fraud vs Normal Transaction', fontsize=16)
ax.set_xlabel('Age')
plt.show()