In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' displayed-column limit to 100 for DataFrame output in this notebook.
pd.set_option("display.max_columns", 100)


In [None]:
# Load only the first N_ROWS rows of the CSV.
# `nrows` makes pandas stop parsing after exactly that many rows, so the sample
# size is exact (no overshoot by a trailing chunk) and no chunk iterator is left
# open. `low_memory=False` parses each column in one pass so dtype inference is
# consistent across the whole sample.
DATA_PATH = "../data/accepted_2007_to_2018.csv"
N_ROWS = 200_000

df = pd.read_csv(DATA_PATH, nrows=N_ROWS, low_memory=False)
df.head()


In [None]:
# Print pandas' summary of `df` (the output of DataFrame.info).
df.info()


In [None]:
# Fraction of missing values per column; display the 20 columns with the most gaps.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).head(20)


In [None]:
def is_default(x):
    """Return 1 if the status text marks a bad loan, otherwise 0.

    The value is converted with ``str()`` and lower-cased, then checked for
    the substrings "charged off", "default" or "late". Any value that
    contains none of them (including non-string values such as NaN or None,
    which stringify to "nan" / "none") yields 0.
    """
    status = str(x).lower()
    bad_markers = ("charged off", "default", "late")
    return int(any(marker in status for marker in bad_markers))

# Derive the binary target: run every loan_status value through is_default,
# store the 0/1 result as a new column, then show the class counts.
default_flags = df["loan_status"].map(is_default)
df["default"] = default_flags
df["default"].value_counts()


In [None]:
# Bar chart of how many rows carry each value of the 0/1 `default` flag.
fig, ax = plt.subplots()
sns.countplot(x=df["default"], ax=ax)
ax.set_title("Fully Paid vs Default Count")
plt.show()


In [None]:
cols = ["loan_amnt", "int_rate", "annual_inc", "dti"]

# One histogram with a KDE overlay per listed column, each drawn in its own 6x4 figure.
for c in cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[c], kde=True, ax=ax)
    ax.set_title(f"Distribution of {c}")
    plt.show()


In [None]:
# Mean of the 0/1 `default` flag within each grade, i.e. the per-grade default rate.
grade_default = df.groupby("grade")["default"].mean()

fig, ax = plt.subplots()
sns.barplot(x=grade_default.index, y=grade_default.values, ax=ax)
ax.set_title("Default Rate by Grade")
ax.set_ylabel("Default Rate")
plt.show()


In [None]:
# Per-purpose default rate (mean of the 0/1 flag), ordered from highest to lowest.
default_by_purpose = df.groupby("purpose")["default"].mean()
purpose_default = default_by_purpose.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=purpose_default.index, y=purpose_default.values, ax=ax)
# Tilt the category labels so the purpose names do not overlap.
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Default Rate by Purpose")
plt.show()


In [None]:
# Columns included in the correlation matrix, with the 0/1 `default` target last.
corr_columns = [
    "loan_amnt",
    "int_rate",
    "annual_inc",
    "dti",
    "fico_range_low",
    "fico_range_high",
    "default",
]
numeric_df = df[corr_columns]
corr = numeric_df.corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


# EDA Summary

- The dataset is highly imbalanced: the majority of loans are fully paid.
- Riskier loan grades (moving from A toward G) correlate with higher default rates.
- A high interest rate, a high DTI, and a low FICO score correlate with a higher default rate.
- Purpose categories like "small_business" and "renewable_energy" show higher risk.
- The income distribution is heavily skewed; borrower incomes vary widely.
- Numeric correlation heatmap shows expected relationships:
  - `fico_range_low` and `fico_range_high` are strongly correlated.
  - dti, annual_inc, loan_amnt moderately informative.
