from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.preprocess import clean_fraud_data, merge_geolocation, ip_to_int

# Read the two raw inputs: the transaction records and the
# IP-address-to-country lookup table (paths are relative to this notebook).
df = pd.read_csv('../data/raw/Fraud_Data.csv')
ip_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')

# Quick look at the transactions: dimensions first, then the leading rows.
print(df.shape, df.head(), sep="\n")

In [1]:
# Run the project's cleaning routine on a copy, so the call cannot modify `df`.
df_clean = clean_fraud_data(df.copy())

# Post-cleaning sanity checks: shape, dtypes, label balance, remaining NaNs.
print(f"After cleaning: {df_clean.shape}")

print("\nData types:")
print(df_clean.dtypes)

print("\nClass distribution (fraud rate):")
class_share = df_clean["class"].value_counts(normalize=True)
print(class_share)

print("\nMissing values summary:")
# Sum per column, then across columns -> one total count of missing cells.
total_missing = df_clean.isna().sum().sum()
print(total_missing)

NameError: name 'clean_fraud_data' is not defined

In [None]:
# Clean the raw frame; a copy is passed, so the call cannot modify `df`.
df_clean = clean_fraud_data(df.copy())
print(f"After cleaning: {df_clean.shape}")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df_clean.info()
print(df_clean['class'].value_counts(normalize=True))  # Class imbalance

# Distributions: histograms (with KDE) on the top row, category counts on the
# bottom row of a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
sns.histplot(df_clean['purchase_value'], kde=True, ax=axes[0,0])
axes[0,0].set_title('Purchase Value Distribution')

sns.histplot(df_clean['age'], kde=True, ax=axes[0,1])
axes[0,1].set_title('Age Distribution')

df_clean['source'].value_counts().plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('Source')

df_clean['browser'].value_counts().plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('Browser')
plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

In [None]:
# Fraud by source: per-source row counts, split by the `class` label
sns.countplot(data=df_clean, x='source', hue='class')
plt.title('Fraud by Source')
plt.show()

# Age vs Class
sns.boxplot(data=df_clean, x='class', y='age')
plt.title('Age by Class')
plt.show()

# Correlation between the two numeric features and the label
corr = df_clean[['purchase_value', 'age', 'class']].corr()
sns.heatmap(corr, annot=True)
plt.title('Correlation Heatmap')
plt.show()

# Imbalance viz
# The fraud share is derived from the data rather than hard-coded in the
# title, so the figure cannot drift out of sync with the dataset.
# Assumes `class` is a 0/1 indicator with 1 = fraud, so its mean is the
# fraud fraction -- TODO confirm against the cleaning step.
fraud_pct = df_clean['class'].mean() * 100
plt.figure(figsize=(6,4))
df_clean['class'].value_counts().plot(kind='bar')
plt.title(f'Class Distribution (Imbalance: {fraud_pct:.1f}% Fraud)')
plt.ylabel('Count')
plt.show()

In [None]:
# Enrich the cleaned transactions with geolocation via the project helper.
# The result is expected to carry a `country_code` column (used below).
df_geo = merge_geolocation(df_clean, ip_df)
print(f"Merged shape: {df_geo.shape}")
print(df_geo['country_code'].value_counts().head())

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
geo_path = Path('../data/processed/fraud_geo.parquet')
geo_path.parent.mkdir(parents=True, exist_ok=True)
df_geo.to_parquet(geo_path)

# Fraud rate by country
# Countries with only a handful of transactions produce extreme rates
# (one fraudulent purchase out of one = 100%), so the ranking is restricted
# to countries with a minimum transaction volume. Tune the threshold as needed.
MIN_TRANSACTIONS = 50
country_fraud = df_geo.groupby('country_code')['class'].agg(['mean', 'count'])
country_fraud['mean'] *= 100  # Percentage
reliable = country_fraud[country_fraud['count'] >= MIN_TRANSACTIONS]
top_fraud_countries = reliable.sort_values('mean', ascending=False).head(10)

sns.barplot(data=top_fraud_countries.reset_index(), x='country_code', y='mean')
plt.title(f'Top 10 Fraud Rates by Country (%, min {MIN_TRANSACTIONS} transactions)')
plt.xticks(rotation=45)
plt.show()

print("High-risk countries:", top_fraud_countries.index.tolist())