import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw transactions CSV and take a first look at its size and rows.
raw_csv_path = '../data/raw/creditcard.csv'
df_cc = pd.read_csv(raw_csv_path)

n_rows_cols = df_cc.shape  # (rows, columns)
print(n_rows_cols)
preview = df_cc.head()  # first five rows
print(preview)

In [None]:
# Log transform Amount.
# np.log1p evaluates log(1 + x) directly: it is defined at Amount == 0 and
# keeps precision for very small amounts, unlike the hand-written
# np.log(x + 1).
df_cc['log_amount'] = np.log1p(df_cc['Amount'])

# Class distribution (relative frequencies of each target value)
print(df_cc['Class'].value_counts(normalize=True))  # 0.17% fraud

# 2x2 grid of histograms (with KDE overlay), one panel per selected column.
fig, axes = plt.subplots(2, 2, figsize=(12,8))

# (column, subplot title) in row-major panel order.
# Only V1 is shown from V1-V28: there are too many PCA features to plot here.
hist_panels = [
    ('Time', 'Time'),
    ('Amount', 'Amount (Skewed)'),
    ('log_amount', 'Log Amount'),
    ('V1', 'V1 (PCA Feature)'),
]
for panel_ax, (column, panel_title) in zip(axes.flat, hist_panels):
    sns.histplot(df_cc[column], kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Compare the distribution of Amount between the two target classes.
amount_ax = sns.boxplot(x='Class', y='Amount', data=df_cc)
amount_ax.set_title('Amount by Class')
plt.show()

# Correlations with Class (select top)
# Compute the correlation matrix of the V* columns plus the target once and
# reuse it for both the ranking and the heatmap.
corr_cols = [col for col in df_cc.columns if col.startswith('V')]
corr_matrix = df_cc[corr_cols + ['Class']].corr()

# Rank features by absolute correlation with Class. The target is dropped by
# label rather than by position, so the result does not depend on 'Class'
# happening to sort first.
corr_df = (
    corr_matrix['Class']
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
    .head(10)
)
print(corr_df)

# Heatmap of the four strongest features (taken from the ranking above
# instead of a hard-coded list) together with the target; values are the
# signed correlations.
heatmap_cols = corr_df.index[:4].tolist() + ['Class']
sns.heatmap(corr_matrix.loc[heatmap_cols, heatmap_cols], annot=True, cmap='coolwarm')
plt.title('Top Correlated Features with Class')
plt.show()

# Pie chart of the target counts. value_counts() orders by frequency, so the
# fixed labels assume the non-fraud class is the more frequent one.
class_totals = df_cc['Class'].value_counts()
plt.pie(class_totals, labels=['Non-Fraud', 'Fraud'], autopct='%1.2f%%')
plt.title('Class Imbalance (0.17% Fraud)')
plt.show()

In [None]:
# Write the current frame (including any derived columns) to Parquet.
processed_path = '../data/processed/creditcard_processed.parquet'
df_cc.to_parquet(processed_path)

# EDA: Credit Card Fraud Dataset
This notebook analyzes anonymized bank transactions for fraud patterns. Focus areas: class imbalance, the skew of `Amount`, and correlations between the PCA features and the target.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Plot styling applied to every figure created after this point.
plt.style.use('default')
sns.set_palette("husl")

# Load data, then report its size, a preview and the target balance.
df_cc = pd.read_csv('../data/raw/creditcard.csv')
dataset_shape = df_cc.shape
print(f"Dataset shape: {dataset_shape}")
print(df_cc.head())

class_share = df_cc['Class'].value_counts(normalize=True)  # relative frequencies
print("\nTarget distribution:")
print(class_share)

In [None]:
# Add derived columns. Missing values are checked at the end of this cell;
# duplicates are not checked here.
amount = df_cc['Amount']
df_cc['log_amount'] = np.log1p(amount)  # log(1 + x), defined for zero amounts

# Time is divided by 3600 (i.e. treated as seconds) and wrapped onto a
# 24-hour cycle; a cyclic encoding is left for later.
elapsed_hours = df_cc['Time'] / 3600
df_cc['time_hours'] = elapsed_hours % 24  # 0-24 hours

print("Transformed columns added: log_amount, time_hours")

skew_raw = df_cc['Amount'].skew()
skew_log = df_cc['log_amount'].skew()
print(f"Amount skew before: {skew_raw:.2f}, after log: {skew_log:.2f}")

# Total count of null cells across the whole frame
total_missing = df_cc.isnull().sum().sum()
print("No missing values:", total_missing == 0)

In [None]:
# 2x3 grid: five histograms (50 bins, KDE overlay) plus a pie chart of the
# target in the last panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Univariate Distributions (Credit Card Data)', fontsize=16)

# (column, subplot title) in row-major panel order.
# V1 and V3 stand in as a sample of the PCA features.
hist_panels = [
    ('Time', 'Time (Seconds from Start)'),
    ('Amount', 'Transaction Amount (Skewed)'),
    ('log_amount', 'Log(Amount + 1)'),
    ('V1', 'V1 (PCA Feature)'),
    ('V3', 'V3 (PCA Feature)'),
]
# zip stops after the five histogram panels, leaving axes[1, 2] for the pie.
for panel_ax, (column, panel_title) in zip(axes.flat, hist_panels):
    sns.histplot(df_cc[column], bins=50, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Class (pie for imbalance)
pie_ax = axes[1, 2]
class_pie = df_cc['Class'].value_counts()
pie_ax.pie(
    class_pie.values,
    labels=['Non-Fraud', 'Fraud'],
    autopct='%1.1f%%',
    colors=['lightblue', 'salmon'],
)
pie_ax.set_title('Class Imbalance')

plt.tight_layout()
plt.show()

# Bivariate Analysis: Relationships with Fraud Class
This section examines how each feature correlates with `Class` (1 = fraud). Expect only subtle patterns in the PCA variables; fraudulent transactions often have a lower `Amount` but a higher variance.

In [None]:
# Compute correlations with Class (focus on numerics).
# The full correlation matrix is computed once and reused for the heatmap.
corr_matrix = df_cc.corr()
corr_with_class = corr_matrix['Class'].abs().sort_values(ascending=False)

# 'Class' correlates perfectly with itself, so leave it out of the ranking
# that is printed under the "Top 10" heading.
top10 = corr_with_class.drop('Class').head(10)
top_features = ['Class'] + top10.index.tolist()  # Top 10 + Class
print("Top 10 features by absolute correlation with Class:")
print(top10)

# Heatmap of top features: signed correlations between the ten top-ranked
# features and the target, sliced from the matrix computed above.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix.loc[top_features, top_features],
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
)
plt.title('Correlation Heatmap: Top Features with Class')
plt.tight_layout()
plt.show()

# Fraud vs. Amount (boxplot)
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_cc, x='Class', y='Amount')
plt.title('Transaction Amount by Class\n(Fraud: Higher Variance, Not Mean)')
# A plain log axis cannot place zero-valued amounts (they are clipped to a
# tiny positive number). 'symlog' is linear around zero and logarithmic
# beyond, so zeros stay on the plot while large amounts are still compressed.
plt.yscale('symlog')
plt.show()

# Distribution of the derived hour-of-day column for each target class.
hour_fig, hour_ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x='Class', y='time_hours', data=df_cc, ax=hour_ax)
hour_ax.set_title('Transaction Hour by Class')
hour_ax.set_xlabel('Class (0=Legit, 1=Fraud)')
hour_ax.set_ylabel('Hour of Day')
plt.show()

# Extreme Imbalance Confirmation
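With only 0.17% fraud, a model that always predicts "non-fraud" already reaches about 99.8% accuracy, so accuracy is uninformative here; evaluate models with PR-AUC or F1 instead.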

In [None]:
# Side-by-side view of the class imbalance: absolute counts (bar chart) and
# proportions (pie chart).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

class_counts = df_cc['Class'].value_counts()
class_colors = ['lightblue', 'salmon']

# Left panel: bars, each annotated with its thousands-separated count
bar_labels = ['Non-Fraud (0)', 'Fraud (1)']
ax1.bar(bar_labels, class_counts.values, color=class_colors)
ax1.set_title('Class Counts')
ax1.set_ylabel('Transactions')
for bar_pos, count in enumerate(class_counts.values):
    # Offset the label by 500 so it sits just above the top of the bar
    ax1.text(bar_pos, count + 500, f'{count:,}', ha='center', fontweight='bold')

# Right panel: pie labelled with the raw class values
pie_labels = class_counts.index.astype(str)
ax2.pie(
    class_counts.values,
    labels=pie_labels,
    autopct='%1.2f%%',
    colors=class_colors,
    startangle=90,
)
ax2.set_title('Class Proportion')

plt.suptitle('Extreme Imbalance: 0.17% Fraud (492 cases)', fontsize=14)
plt.tight_layout()
plt.show()

# Dummy baseline AUC-PR (majority class predictor)
from sklearn.dummy import DummyClassifier
from sklearn.metrics import average_precision_score, precision_recall_curve, auc

# The dummy model ignores its features, so a single all-zero column suffices.
X_dummy = np.zeros((len(df_cc), 1))
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_dummy, df_cc['Class'])  # Dummy fit
dummy_pred = dummy.predict(X_dummy)

# Use average precision as the PR-AUC summary. The previous
# auc(*precision_recall_curve(...)[0:2]) passed (precision, recall) as (x, y)
# and linearly interpolated between the only two curve points a constant
# predictor produces, which yields ~0.5 regardless of the data. Average
# precision is not interpolated; for a constant score it equals the share of
# positive labels, which is the real no-skill baseline.
pr_auc_dummy = average_precision_score(df_cc['Class'], dummy_pred)
print(f"Baseline AUC-PR (predict all non-fraud): {pr_auc_dummy:.3f} (very low—need better models)")

# Save Processed Data

In [None]:
# Persist the transformed dataset. The raw Time column is dropped (the derived
# time_hours column stays); Amount is kept alongside log_amount for now.
df_cc_processed = df_cc.drop(columns=['Time']).copy()

output_path = '../data/processed/creditcard_processed.parquet'
df_cc_processed.to_parquet(output_path, index=False)  # do not write the index

processed_shape = df_cc_processed.shape
print(f"Saved: creditcard_processed.parquet (shape: {processed_shape})")
print("Ready for feature engineering (e.g., cyclic time, subsample for modeling).")