<a href="https://colab.research.google.com/github/Utkarshmishra2k2/LoanLens-HMDA-Loan-Data-Analysis-and-Modeling/blob/main/HMDA_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from pyspark.sql import SparkSession
import joblib
import json
import shap
import shutil
import warnings

In [None]:
warnings.filterwarnings("ignore")

In [None]:
import kagglehub

path = kagglehub.dataset_download("utkarshmishra2k2/hyderabad-metropolitan-development-authority")

print("Path to dataset files:", path)

In [None]:
shutil.copy(f"/root/.cache/kagglehub/datasets/utkarshmishra2k2/hyderabad-metropolitan-development-authority/versions/1/HMDA Data Set For Use.csv", "/content/HMDA Data Set For Use.csv")

In [None]:
spark = SparkSession.builder.appName("Sample CSV Export").getOrCreate()

In [None]:
file_path = "/content/HMDA Data Set For Use.csv"

In [None]:
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [None]:
sample_df = df.sample(fraction=0.20, seed=42)

In [None]:
output_path = "/content"

In [None]:
sample_df.coalesce(1).write.csv(output_path, header=True, mode="overwrite")

In [None]:
print("20% sample saved successfully!")

In [None]:
shutil.copy(f"/content/HMDA.csv/HMDA01.csv", "/content/HMDA01.csv")

In [None]:
df = pd.read_csv("/content/HMDA01.csv")

In [None]:
df.sample(5)

In [None]:
print("DataFrame Shape:", df.shape)

In [None]:
print("\nData Types:\n", df.dtypes)

In [None]:
print("\nDescriptive Statistics:\n", df.describe().T)

In [None]:
# Print how often each distinct value occurs in every object-dtype column.
categorical_columns = df.select_dtypes(include=['object']).columns
for col_name in categorical_columns:
    print(f"\nFrequency Distribution of '{col_name}':")
    value_frequencies = df[col_name].value_counts()
    print(value_frequencies)

In [None]:
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

In [None]:
print("Columns with Missing Values:")
print(missing_percentage[missing_percentage > 0])

In [None]:
# Fill missing values in every numeric column with that column's median.
# The result is assigned back to df[column]: calling fillna(inplace=True) on
# the df[column] selection is chained assignment, which is deprecated and does
# not modify `df` when pandas Copy-on-Write is active.
numerical_columns = df.select_dtypes(include=['number']).columns
for column in numerical_columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].median())

In [None]:
# Fill missing values in every object-dtype column with its most frequent value.
# The result is assigned back to df[column]: calling fillna(inplace=True) on
# the df[column] selection is chained assignment, which is deprecated and does
# not modify `df` when pandas Copy-on-Write is active.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    if df[column].isnull().any():
        modes = df[column].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        df[column] = df[column].fillna(modes.iloc[0])

In [None]:
# Columns examined in the exploratory plots that follow.
numerical_features = [
    'loan_amount_000s',
    'applicant_income_000s',
    'hud_median_family_income',
    'population',
    'minority_population',
]
categorical_features = [
    'agency_name',
    'loan_type_name',
    'property_type_name',
    'loan_purpose_name',
    'owner_occupancy_name',
]

In [None]:
# Layout options shared by both figure grids in this cell.
shared_layout = dict(
    height=800,
    width=1000,
    showlegend=False,
    template="plotly_dark",
    title_x=0.5,
)

# Histogram + KDE grid: one panel per numeric feature on a 2x3 layout.
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[f'Distribution of {feature}' for feature in numerical_features],
)
for i, feature in enumerate(numerical_features):
    # Row-major placement on the 3-column grid; plotly positions are 1-based.
    grid_row, grid_col = divmod(i, 3)
    row, col = grid_row + 1, grid_col + 1

    # Histogram normalised to a probability density so it can be overlaid
    # with the KDE curve.
    histogram_trace = go.Histogram(
        x=df[feature].dropna(),
        name=f'{feature} Histogram',
        histnorm='probability density',
        opacity=0.75,
    )

    # Gaussian KDE evaluated on 1000 evenly spaced points between the
    # feature's minimum and maximum.
    density = stats.gaussian_kde(df[feature].dropna())
    x_vals = np.linspace(df[feature].min(), df[feature].max(), 1000)
    density_trace = go.Scatter(
        x=x_vals,
        y=density(x_vals),
        mode='lines',
        name=f'{feature} KDE',
        line=dict(color='black', dash='dot'),
    )

    for trace in (histogram_trace, density_trace):
        fig.add_trace(trace, row=row, col=col)

fig.update_layout(title_text="Distributions and KDEs of Numerical Features", **shared_layout)
fig.show()

# Boxplot grid: same panel placement as above.
fig_box = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[f'Boxplot of {feature}' for feature in numerical_features],
)
for i, feature in enumerate(numerical_features):
    grid_row, grid_col = divmod(i, 3)
    fig_box.add_trace(
        go.Box(
            y=df[feature].dropna(),
            name=feature,
            boxmean='sd',
            jitter=0.05,
            whiskerwidth=0.5,
            marker=dict(color='lightblue'),
            line=dict(width=1),
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig_box.update_layout(title_text="Boxplots of Numerical Features", **shared_layout)
fig_box.show()

In [None]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap.
numeric_view = df[numerical_features]
corr_matrix = numeric_view.corr()
fig_corr = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    title="Correlation Heatmap of Numerical Features",
    template="plotly_dark",
)
fig_corr.show()

In [None]:
target = 'loan_amount_000s'

In [None]:
features = df.columns.drop(target).tolist()

In [None]:
df['loan_income_ratio'] = df['loan_amount_000s'] / df['applicant_income_000s']
df['loan_property_ratio'] = df['loan_amount_000s'] / (df['hud_median_family_income'] / 1000)

In [None]:
numerical_features = ['applicant_income_000s', 'hud_median_family_income', 'population',
                      'minority_population', 'loan_income_ratio', 'loan_property_ratio']
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

In [None]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [None]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

In [None]:
X = df.drop(columns=[target])
y = df[target]

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
])

In [None]:
model_pipeline.fit(X_train, y_train)

In [None]:
y_pred_val = model_pipeline.predict(X_val)

In [None]:
print("Mean Squared Error (MSE):", mean_squared_error(y_val, y_pred_val))

In [None]:
print("Root Mean Squared Error (RMSE):",  np.sqrt(mean_squared_error(y_val, y_pred_val)))

In [None]:
print("Mean Absolute Error (MAE):", mean_absolute_error(y_val, y_pred_val))

In [None]:
r2 = r2_score(y_val, y_pred_val)
print("R-squared (R2):",r2)

In [None]:
# Adjusted R-squared for the validation predictions.
# n = number of validation rows; p = number of columns produced by the
# preprocessing step (i.e. what the regressor receives).
n = X_val.shape[0]
# Use the preprocessor that was fitted as part of `model_pipeline` and only
# transform: calling fit_transform on `preprocessor` would refit the same
# object the trained pipeline holds, purely to count columns.
X_train_transformed = model_pipeline.named_steps['preprocessor'].transform(X_train)
p = X_train_transformed.shape[1]
degrees_of_freedom = n - p - 1
if degrees_of_freedom > 0:
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / degrees_of_freedom
else:
    # With at least as many predictors as validation rows the correction
    # divides by zero or flips sign, so report "not available" instead.
    adjusted_r2 = float("nan")
    print(f"Adjusted R-squared is undefined for n={n}, p={p} (n - p - 1 <= 0).")

print("Adjusted R-squared:", adjusted_r2)

In [None]:
# Residuals (actual minus predicted) against the predictions on the
# validation set, with a dashed reference line at zero.
residuals = y_val - y_pred_val
axis_labels = {'x': 'Predicted', 'y': 'Residuals'}
fig_resid = px.scatter(
    x=y_pred_val,
    y=residuals,
    labels=axis_labels,
    title="Residual Plot on Validation Set",
    template="plotly_dark",
)
fig_resid.add_hline(y=0, line_dash="dash")
fig_resid.show()

In [None]:
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20, None]
}

In [None]:
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_

In [None]:
best_cv_mse = -grid_search.best_score_

In [None]:
print("\nBest Parameters:", best_params)
print("Best Cross-Validation MSE:", best_cv_mse)

In [None]:
metrics_log = {
    "best_params": best_params,
    "best_cv_mse": best_cv_mse,
    "validation_mse": mean_squared_error(y_val, y_pred_val), # Calculate and assign to 'validation_mse'
    "validation_rmse": np.sqrt(mean_squared_error(y_val, y_pred_val)), # Calculate and assign to 'validation_rmse'
    "validation_mae": mean_absolute_error(y_val, y_pred_val), # Calculate and assign to 'validation_mae'
    "validation_r2": r2,
    "validation_adjusted_r2": adjusted_r2
}

In [None]:
with open("model_metrics.json", "w") as f:
    json.dump(metrics_log, f, indent=4)
print("Metrics logged to model_metrics.json")

In [None]:
X_train_trans = preprocessor.fit_transform(X_train)

In [None]:
regressor = model_pipeline.named_steps['regressor']

In [None]:
explainer = shap.TreeExplainer(regressor)

In [None]:
shap_values = explainer.shap_values(X_train_trans)

In [None]:
feature_names_num = numerical_features

In [None]:
feature_names_cat = model_pipeline.named_steps['preprocessor'].transformers_[1][1] \
    .named_steps['onehot'].get_feature_names_out(categorical_features)
feature_names = np.concatenate([feature_names_num, feature_names_cat])

In [None]:
shap_values = explainer.shap_values(X_train_trans.toarray()) # Convert to dense NumPy array

In [None]:
top_feature = feature_names[np.argmax(regressor.feature_importances_)]

In [None]:
shap.dependence_plot(top_feature, shap_values, X_train_trans.toarray(), feature_names=feature_names)
# Convert X_train_trans to a dense array using toarray()

In [None]:
y_pred_test = model_pipeline.predict(X_test)

In [None]:
print("\nTest Metrics:")
print("Test MSE:", mean_squared_error(y_test, y_pred_test))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Test R2:", r2_score(y_test, y_pred_test))