# 01_EDA: Exploratory Data Analysis

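Explore the LendingClub loan data ahead of credit-risk prediction: the distribution of the target (`loan_status`), missing values, correlations between numeric features, and how income, FICO score, loan amount, and categorical features relate to loan status.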

In [ ]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw LendingClub CSV into a DataFrame. The path is relative, so it
# is resolved against the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='../data/raw/lending_club.csv')

## 1. Target Variable Distribution

In [ ]:
# Keep only the rows whose status is one of the two outcomes used as the
# target, then count how many loans fall into each.
target_statuses = ['Fully Paid', 'Charged Off']
is_target_row = df['loan_status'].isin(target_statuses)
sns.countplot(data=df[is_target_row], x='loan_status')
plt.title('Loan Status Distribution')
plt.show()

## 2. Missing Values

In [ ]:
# Fraction of null entries per column, sorted from most to least missing.
missing = df.isnull().mean().sort_values(ascending=False)
missing_cols = missing[missing > 0]

# Bar-plotting an empty Series raises inside pandas, so only draw the chart
# when at least one column actually contains missing values.
if missing_cols.empty:
    print('No missing values found in any feature.')
else:
    missing_cols.plot(kind='bar', figsize=(10, 4))
    plt.title('Missing Value Ratio per Feature')
    plt.show()

## 3. Feature Correlation Heatmap

In [ ]:
# Pairwise correlation (pandas default method) between the three numeric
# features, rendered as an annotated heatmap.
numeric_features = ['annual_inc', 'loan_amnt', 'fico_score']
corr = df[numeric_features].corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

## 4. Relationships: Income, Credit Score, Loan Amount

In [ ]:
# Compare each numeric feature across the two target outcomes only. The raw
# `loan_status` column is filtered the same way as in the target-distribution
# plot (section 1), so any other status values present in the raw data do not
# appear as extra boxes.
resolved_loans = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]

# (column to plot on the y-axis, figure title)
boxplot_specs = [
    ('annual_inc', 'Annual Income vs Loan Status'),
    ('fico_score', 'FICO Score vs Loan Status'),
    ('loan_amnt', 'Loan Amount vs Loan Status'),
]
for feature, title in boxplot_specs:
    sns.boxplot(x='loan_status', y=feature, data=resolved_loans)
    plt.title(title)
    plt.show()

## 5. Class Imbalance

In [ ]:
# Measure imbalance between the two target classes only. Normalising over
# every raw `loan_status` value would dilute the shares with statuses that
# are excluded from the target in the section 1 distribution plot.
target_status = df.loc[
    df['loan_status'].isin(['Fully Paid', 'Charged Off']), 'loan_status'
]
class_share = target_status.value_counts(normalize=True)

class_share.plot(kind='pie', autopct='%1.1f%%')
plt.title('Class Imbalance')
plt.ylabel('')  # drop the default axis label pandas adds to pie charts
plt.show()

## 6. Categorical Features Analysis

In [ ]:
# Categorical features to break down by loan outcome.
cat_cols = ['purpose', 'home_ownership']

# Restrict the hue to the two target outcomes, matching the filter used for
# the target-distribution plot in section 1; otherwise every raw status value
# gets its own bar in each group.
target_rows = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]

for col in cat_cols:
    sns.countplot(x=col, hue='loan_status', data=target_rows)
    plt.title(f'{col} vs Loan Status')
    plt.xticks(rotation=45)  # category labels are long; tilt to avoid overlap
    plt.show()