from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the raw transactions, then echo the frame's dimensions and first rows.
df_cc = pd.read_csv('../data/raw/creditcard.csv')
print(df_cc.shape, df_cc.head(), sep='\n')

# In [None]:
# Log transform Amount
# log1p(x) computes log(1 + x) in one step and stays accurate for very small
# amounts; the implicit +1 keeps zero amounts finite (log1p(0) == 0).
df_cc['log_amount'] = np.log1p(df_cc['Amount'])

# Class distribution, printed as fractions of all rows rather than raw counts.
print(df_cc['Class'].value_counts(normalize=True))  # 0.17% fraud

# 2x2 overview grid: one histogram with a KDE overlay per selected column.
fig, axes = plt.subplots(2, 2, figsize=(12,8))

# (column, panel title) pairs, listed in row-major panel order.
# Sample V1-V28 (too many; show V1)
panels = [
    ('Time', 'Time'),
    ('Amount', 'Amount (Skewed)'),
    ('log_amount', 'Log Amount'),
    ('V1', 'V1 (PCA Feature)'),
]
# axes.flat walks the grid row by row, matching the order of `panels`.
for panel_ax, (column, panel_title) in zip(axes.flat, panels):
    sns.histplot(df_cc[column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# In [None]:
# Fraud by Amount: one box of Amount values per Class value.
amount_ax = sns.boxplot(x='Class', y='Amount', data=df_cc)
# Title the axes returned by seaborn, i.e. the ones the boxes were drawn on.
amount_ax.set_title('Amount by Class')
plt.show()

# Correlations with Class (select top)
# Rank every V* column by the absolute value of its correlation with Class.
corr_cols = [col for col in df_cc.columns if col.startswith('V')]
corr_df = (
    df_cc[corr_cols + ['Class']]
    .corr()['Class']
    # Remove Class's self-correlation by label; dropping "the first row after
    # sorting" silently removes a real feature if Class does not sort first.
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
    .head(10)
)
print(corr_df)

# Plot the strongest features from the ranking above instead of a hard-coded
# list, so the heatmap cannot drift out of sync with the data.
top_features = corr_df.index[:4].tolist()
sns.heatmap(df_cc[top_features + ['Class']].corr(), annot=True, cmap='coolwarm')
plt.title('Top Correlated Features with Class')
plt.show()

# Class imbalance pie chart.
# value_counts() orders by frequency, not by class value, so pin the order to
# (0, 1) to guarantee each count is paired with the intended label.
# NOTE(review): assumes Class is coded 0 = non-fraud, 1 = fraud -- confirm.
class_counts = df_cc['Class'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(class_counts, labels=['Non-Fraud', 'Fraud'], autopct='%1.2f%%')

# Compute the share shown in the title rather than hard-coding it, so the
# title stays correct if the underlying data changes.
fraud_share = (df_cc['Class'] == 1).mean()
plt.title(f'Class Imbalance ({fraud_share:.2%} Fraud)')
plt.show()

# In [None]:
# Persist the frame, including the derived log_amount column, as Parquet.
processed_path = Path('../data/processed/creditcard_processed.parquet')
# to_parquet does not create missing directories, so make sure the target
# folder exists before writing.
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_cc.to_parquet(processed_path)