# Exploratory Data Analysis

This notebook explores the IEEE‑CIS Fraud dataset, looking at data quality, missingness, distributions, and fraud patterns.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

%matplotlib inline
sns.set_style("whitegrid")

BASE = Path(__file__).resolve().parents[2]
RAW_TX = BASE / "data" / "raw" / "train_transaction.csv"
RAW_ID = BASE / "data" / "raw" / "train_identity.csv"

print("Loading raw data ...")
tx = pd.read_csv(RAW_TX)
id_ = pd.read_csv(RAW_ID)
print(tx.shape, id_.shape)

df = tx.merge(id_, how='left', on='TransactionID')
print("Merged:", df.shape)
df.head()


NameError: name '__file__' is not defined

In [None]:
# Basic statistics
print("Fraud rate:", df['isFraud'].mean())
df['TransactionAmt'].describe()


In [None]:
# Missingness heatmap for first 40 features
missing = df.isna().mean().sort_values(ascending=False)
missing.head(20)


In [None]:
plt.figure(figsize=(6,4))
sns.histplot(df['TransactionAmt'], bins=100, log_scale=(False, True))
plt.title("Transaction Amount distribution (log Y)")
plt.show()


In [None]:
# Fraud rate by ProductCD
prod_rates = df.groupby('ProductCD')['isFraud'].mean().sort_values(ascending=False)
prod_rates.plot(kind='bar', figsize=(6,3), title='Fraud rate by Product')
plt.xlabel('ProductCD'); plt.ylabel('Fraud rate')
