In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import joblib
import os

In [None]:
# Notebook-wide display defaults: show every DataFrame column, use the
# ggplot matplotlib style, then apply seaborn's "notebook" context with
# slightly enlarged fonts.
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')
sns.set_context("notebook", font_scale=1.2)

In [None]:
# Create the folders that later cells write into (cleaned CSV and pickled
# artefacts); existing folders are left untouched.
for output_dir in ('../data', '../models'):
    os.makedirs(output_dir, exist_ok=True)

print("DS Environment Ready.")

In [None]:

# Read the raw loan history, report its (rows, columns) shape, and preview
# the first five records as the cell output.
df = pd.read_csv('../historical_loans.csv')
print(f"Initial Dataset Shape: {df.shape}")
df.head(n=5)

In [None]:
# Bar chart of how many rows carry each `loan_status` value in the raw data.
status_fig, status_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='loan_status', data=df, palette='viridis', ax=status_ax)
status_ax.set_title("Target Distribution: Loan Status (0=Repaid, 1=Default)")
plt.show()

In [None]:
# Default rate on the RAW frame. The data has not been cleaned yet (the
# cleaning stage later drops missing / non-binary labels), so defaults are
# counted explicitly as `loan_status == 1` and the rate is taken over rows that
# carry a valid 0/1 label. Summing the column directly would count a stray
# label such as 2 as two defaults, and dividing by every row would let
# unlabeled rows dilute the rate.
status = df['loan_status']
labelled_count = int(status.isin([0, 1]).sum())
default_count = int((status == 1).sum())
total_count = len(df)  # all rows, including any without a usable label
# Guard against an empty / fully unlabeled frame instead of dividing by zero.
default_rate = (default_count / labelled_count) * 100 if labelled_count else 0.0

In [None]:
# Summarise class balance; a default share under 20% is treated as imbalanced.
print(
    f"Total Loans: {total_count}",
    f"Defaults: {default_count}",
    f"Default Rate: {default_rate:.2f}%",
    sep="\n",
)

is_imbalanced = default_rate < 20
if is_imbalanced:
    print(" CONCLUSION: Dataset is Imbalanced. We will use 'scale_pos_weight' in XGBoost.")

In [None]:
# Empty 2x2 grid of axes for the raw-feature distribution panels.
# NOTE(review): the consolidated figure cell further down creates the same grid
# and fills all four panels in one go; this standalone cell looks like a
# leftover fragment — confirm and consider removing.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))

In [None]:
# Top-left panel: box plot of raw income.
# NOTE(review): draws onto `axes` created by an earlier cell, and the
# consolidated figure cell further down repeats this exact panel. With the
# inline backend a figure is normally rendered only by the cell that creates
# it, so this cell likely shows nothing — confirm and consider removing.
income_ax = axes[0, 0]
sns.boxplot(x=df['income'], ax=income_ax, color='skyblue')
income_ax.set_title("Income (Raw) - Note Outliers")

In [None]:
# Top-right panel: histogram (30 bins, KDE overlay) of raw credit score.
# NOTE(review): draws onto `axes` created by an earlier cell, and the
# consolidated figure cell further down repeats this exact panel. With the
# inline backend a figure is normally rendered only by the cell that creates
# it, so this cell likely shows nothing — confirm and consider removing.
score_ax = axes[0, 1]
sns.histplot(df['credit_score'], bins=30, kde=True, ax=score_ax, color='orange')
score_ax.set_title("Credit Score (Raw)")

In [None]:
# Bottom-left panel: histogram (30 bins, KDE overlay) of raw loan amount.
# NOTE(review): draws onto `axes` created by an earlier cell, and the
# consolidated figure cell further down repeats this exact panel. With the
# inline backend a figure is normally rendered only by the cell that creates
# it, so this cell likely shows nothing — confirm and consider removing.
amount_ax = axes[1, 0]
sns.histplot(df['loan_amount'], bins=30, kde=True, ax=amount_ax, color='green')
amount_ax.set_title("Loan Amount (Raw)")

In [None]:

# 2x2 overview of the raw (uncleaned) numeric features:
#   top row    -> income box plot, credit-score histogram
#   bottom row -> loan-amount histogram, employment-length box plot
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(income_ax, score_ax), (amount_ax, tenure_ax) = axes

sns.boxplot(x=df['income'], ax=income_ax, color='skyblue')
income_ax.set_title("Income (Raw) - Note Outliers")

sns.histplot(df['credit_score'], bins=30, kde=True, ax=score_ax, color='orange')
score_ax.set_title("Credit Score (Raw)")

sns.histplot(df['loan_amount'], bins=30, kde=True, ax=amount_ax, color='green')
amount_ax.set_title("Loan Amount (Raw)")

sns.boxplot(x=df['employment_length_years'], ax=tenure_ax, color='red')
tenure_ax.set_title("Employment Length (Raw)")

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the boolean "is missing" mask of the raw frame (rows on the
# y-axis, columns on the x-axis); row tick labels and the colour bar are hidden.
missing_fig, missing_ax = plt.subplots(figsize=(10, 4))
missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False, ax=missing_ax)
missing_ax.set_title("Missing Values Map (Yellow = Missing)")
plt.show()

In [None]:
# Per-column count of missing entries in the raw frame.
missing_per_column = df.isna().sum()
print("Missing Values Count:")
print(missing_per_column)

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched by the
# cleaning steps that follow.
df_clean = df.copy(deep=True)

In [None]:
# Keep only rows whose target is present and is exactly 0 or 1.
label_present = df_clean['loan_status'].notna()
label_binary = df_clean['loan_status'].isin([0, 1])
df_clean = df_clean[label_present & label_binary]

In [None]:
# Repair sign errors in the numeric features:
#   * income / loan_amount: negative values are replaced by their magnitude.
#   * employment_length_years: negative values are floored at 0.
# `clip(lower=0)` is the vectorised equivalent of the former row-wise
# `apply(lambda ...)`; it leaves NaN untouched, so missing values are still
# handled by the later imputation step.
df_clean['income'] = df_clean['income'].abs()
df_clean['loan_amount'] = df_clean['loan_amount'].abs()
df_clean['employment_length_years'] = df_clean['employment_length_years'].clip(lower=0)

In [None]:
# Median imputation. The income median is kept in its own variable because a
# later cell persists it; the other two columns are filled with their own
# medians in the same way.
median_income = df_clean['income'].median()
df_clean['income'] = df_clean['income'].fillna(median_income)

for impute_col in ('credit_score', 'loan_amount'):
    df_clean[impute_col] = df_clean[impute_col].fillna(df_clean[impute_col].median())

In [None]:
# Missing employment length is filled with 0.
employment_filled = df_clean['employment_length_years'].fillna(value=0)
df_clean['employment_length_years'] = employment_filled

In [None]:
# Cap income at its 95th percentile: values above the cap are replaced by the
# cap, everything else is kept as is. `income_cap` is persisted by a later cell.
income_cap = df_clean['income'].quantile(0.95)
income_above_cap = df_clean['income'] > income_cap
df_clean['income'] = np.where(income_above_cap, income_cap, df_clean['income'])

In [None]:
# Cap loan amount at its 99th percentile: values above the cap are replaced by
# the cap, everything else is kept as is.
loan_cap = df_clean['loan_amount'].quantile(0.99)
loan_above_cap = df_clean['loan_amount'] > loan_cap
df_clean['loan_amount'] = np.where(loan_above_cap, loan_cap, df_clean['loan_amount'])

In [None]:
# Force credit scores into the closed interval [300, 850].
df_clean['credit_score'] = df_clean['credit_score'].clip(lower=300, upper=850)

In [None]:
# Force employment length into the closed interval [0, 50].
df_clean['employment_length_years'] = df_clean['employment_length_years'].clip(lower=0, upper=50)

In [None]:
# Remove fully identical rows, keeping the first occurrence of each.
df_clean = df_clean.drop_duplicates(keep='first')

print("Cleaning Complete.")

In [None]:
# Engineered feature: loan amount relative to income. The +1 in the
# denominator avoids division by zero for zero-income rows.
income_plus_one = df_clean['income'].add(1)
df_clean['loan_to_income'] = df_clean['loan_amount'].div(income_plus_one)

In [None]:
# Box plot of the loan-to-income ratio, one box per `loan_status` value.
ratio_fig, ratio_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='loan_status', y='loan_to_income', data=df_clean, palette='Set2', ax=ratio_ax)
ratio_ax.set_title("Loan-to-Income Ratio vs Default Risk")
plt.show()

In [None]:
# Annotated correlation heatmap of the cleaned features.
# Only numeric / boolean columns are passed to `.corr()`: on pandas >= 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently and raises
# if the frame contains any (e.g. an ID or categorical string column).
plt.figure(figsize=(10, 8))
numeric_features = df_clean.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()
sns.heatmap(corr, annot=True, cmap='RdYlGn', fmt=".2f")
plt.title("Cleaned Feature Correlation Matrix")
plt.show()

In [None]:
# Write the cleaned frame to disk without the index column.
cleaned_csv_path = '../data/cleaned_loans.csv'
df_clean.to_csv(cleaned_csv_path, index=False)

In [None]:
# Persist the fitted cleaning parameters for income (imputation median and
# outlier cap).
# NOTE(review): the credit-score / loan-amount medians and `loan_cap` used
# earlier in this notebook are not saved here — confirm whether downstream
# code needs them to reproduce the same cleaning.
artifacts_to_save = (
    (median_income, '../models/median_income.pkl'),
    (income_cap, '../models/income_cap.pkl'),
)
for artifact_value, artifact_path in artifacts_to_save:
    joblib.dump(artifact_value, artifact_path)

print("Files saved successfully in /data and /models.")

In [None]:
# Data Cleaning Pipeline