In [None]:
import os
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import scipy.stats as stats
from scipy.stats import zscore
import psycopg2
import pandas as pd
from psycopg2.extras import execute_values

from sklearn.preprocessing import  OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
import pickle

: 

In [None]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks reproduce the original
# local-development values, so an unconfigured environment behaves as before.
host = os.environ.get("DB_HOST", "localhost")
port = os.environ.get("DB_PORT", "5432")
dbname = os.environ.get("DB_NAME", "loan_database")
user = os.environ.get("DB_USER", "myuser")
password = os.environ.get("DB_PASSWORD", "mypassword")

In [None]:
# Open the PostgreSQL connection. `conn` and `cur` are left open on purpose:
# a later cell reuses them to write the training split back to the database.
connection_params = {
    "host": host,
    "port": port,
    "dbname": dbname,
    "user": user,
    "password": password,
}
conn = psycopg2.connect(**connection_params)
cur = conn.cursor()

# Pull every row of the source table into memory.
cur.execute("SELECT * FROM loan_data;")
results = cur.fetchall()

# Labels are applied positionally to the tuples returned by the query.
loan_columns = [
    "person_age", "person_gender", "person_education", "person_income",
    "person_emp_exp", "person_home_ownership", "loan_amnt", "loan_intent",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length",
    "credit_score", "previous_loan_defaults_on_file", "loan_status",
]
df = pd.DataFrame(results, columns=loan_columns)


In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Store person_age as integers; astype with a mapping returns a new frame
# with only that column converted.
df = df.astype({"person_age": int})

In [None]:
# Preview the first rows again now that person_age has been cast to int.
df.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show the dtype of each column.
df.dtypes

In [None]:
# List the distinct values of every object-typed column.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
for col_name in object_columns:
    print(f"Column Name: {col_name} - Distinct values: {df[col_name].unique()}")


In [None]:
def create_subplots(columns, col_type, num_col = 4, data = None):
    """Draw one distribution plot per column on a shared grid of axes.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to plot.
    col_type : str
        "number" draws a 30-bin histogram with a KDE overlay; "category" draws
        a pie chart of the value counts. Any other value leaves the axes empty.
    num_col : int, default 4
        Number of plots per row.
    data : DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    frame = df if data is None else data
    columns = list(columns)
    num_plots = len(columns)
    if num_plots == 0:
        # plt.subplots cannot build a grid with zero rows, so there is nothing to draw.
        return

    num_rows = -(-num_plots // num_col)  # ceiling division

    # squeeze=False always returns a 2-D array of axes; without it a 1x1 grid
    # comes back as a bare Axes object that has no flatten().
    fig, axes = plt.subplots(
        num_rows, num_col, figsize=(num_col * 8, num_rows * 5), squeeze=False
    )
    axes = axes.flatten()

    for ax, col_name in zip(axes, columns):
        if col_type == "number":
            sns.histplot(frame[col_name], kde=True, bins=30, ax=ax)
            ax.set_title(col_name)
        elif col_type == "category":
            frame[col_name].value_counts().plot.pie(autopct='%1.1f%%', startangle=90, ax=ax)
            ax.set_title(f"Pie Chart of {col_name}")
            ax.set_ylabel("")  # Remove default y-label

    # Hide the grid cells left over when the plots do not fill the last row.
    for ax in axes[num_plots:]:
        ax.axis('off')

    plt.show()

In [None]:
# Histogram + KDE for every numeric column, three plots per row.
numeric_columns = df.select_dtypes(include=np.number).columns
create_subplots(numeric_columns, "number", num_col=3)

In [None]:
# Pie chart of value counts for every object-typed column, three plots per row.
categorical_columns = df.select_dtypes(include=['object']).columns
create_subplots(categorical_columns, "category", num_col=3)

In [None]:
# Display the current state of the frame.
df

In [None]:
# Correlation heatmap across the numeric columns. np.number is the same
# selector used to build `numeric_columns`; [float, int] would silently skip
# numeric columns stored with other widths (for example float32).
numeric_df = df.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()

# Explicit figure/axes so the heatmap and its title target the same Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f', cbar=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Remove person_age and person_emp_exp from the modelling frame.
df = df.drop(columns=["person_age", "person_emp_exp"])

In [None]:
# Keep the numeric column list in step with the columns just dropped from df.
dropped_features = ("person_age", "person_emp_exp")
numeric_columns = [col for col in numeric_columns if col not in dropped_features]

In [None]:

# Separate the target (loan_status) from the feature columns.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

In [None]:
# The target is numeric too, so take it out of the numeric feature list.
numeric_columns = [col for col in numeric_columns if col != "loan_status"]

In [None]:
# Remember the raw (pre-encoding) feature names in their original order.
train_columns = list(X.columns)

In [None]:
# Hold out 20% of the rows as the test split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Persist the training split to PostgreSQL. Rows that collide with the
# unique constraint over the feature columns are skipped (ON CONFLICT DO NOTHING).
create_table_query = """
CREATE TABLE IF NOT EXISTS training_data (
    person_gender VARCHAR(50),
    person_education VARCHAR(50),
    person_income FLOAT,
    person_home_ownership VARCHAR(50),
    loan_amnt FLOAT,
    loan_intent VARCHAR(50),
    loan_int_rate FLOAT,
    loan_percent_income FLOAT,
    cb_person_cred_hist_length FLOAT,
    credit_score INT,
    previous_loan_defaults_on_file VARCHAR(10),
    loan_status FLOAT,
    CONSTRAINT unique_training_columns UNIQUE (person_gender, person_education, person_income, person_home_ownership,
        loan_amnt, loan_intent, loan_int_rate, loan_percent_income, cb_person_cred_hist_length,
        credit_score, previous_loan_defaults_on_file)
);
"""

train_data = X_train.copy()
train_data['loan_status'] = y_train

# Take the column list from the frame that is actually inserted, so the INSERT
# column order is guaranteed to match the order of the values in every row.
column_list = train_data.columns.tolist()
column_list_str = ', '.join(column_list)
data_to_insert = train_data.values.tolist()

insert_query = f"""
    INSERT INTO training_data ({column_list_str})
    VALUES %s
    ON CONFLICT ON CONSTRAINT unique_training_columns
    DO NOTHING;
"""

try:
    cur.execute(create_table_query)
    execute_values(cur, insert_query, data_to_insert)
    conn.commit()
except Exception:
    # Undo the partial transaction before propagating the original error.
    conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when a statement fails.
    cur.close()
    conn.close()

In [None]:
# Write the held-out split (features plus target) to CSV without the index.
test_data = X_test.assign(loan_status=y_test)
test_data.to_csv('../data/test_data.csv', index=False)

In [None]:
# Preprocessing is fitted on the training split only and then applied to the test split.
scaler = RobustScaler(quantile_range=(20, 80))
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Transform into new arrays instead of writing back into X_train / X_test.
# Re-running this cell therefore never re-fits the scaler on already-scaled
# values, and no float values are assigned into integer columns.
scaled_train_numeric = scaler.fit_transform(X_train[numeric_columns])
scaled_test_numeric = scaler.transform(X_test[numeric_columns])

encoded_train_categories = encoder.fit_transform(X_train[categorical_columns])
encoded_test_categories = encoder.transform(X_test[categorical_columns])

encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
encoded_train_df = pd.DataFrame(encoded_train_categories, columns=encoded_feature_names)
encoded_test_df = pd.DataFrame(encoded_test_categories, columns=encoded_feature_names)

# Column layout of the model matrices: scaled numeric features first,
# followed by the one-hot columns.
X_train_encoded = np.hstack([scaled_train_numeric, encoded_train_categories])
X_test_encoded = np.hstack([scaled_test_numeric, encoded_test_categories])

y_train = np.array(y_train)
y_test = np.array(y_test)

print(f"X_train_encoded shape: {X_train_encoded.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test_encoded shape: {X_test_encoded.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import numpy as np
import mlflow
import mlflow.keras

In [None]:
# Drop training rows in which any feature lies `threshold` or more standard
# deviations away from its column mean.
z_scores = np.abs(zscore(X_train_encoded))

threshold = 3

# A zero-variance column produces NaN z-scores. `NaN < threshold` is False, so
# the previous all(<) test would discard every row; rejecting only on a
# definite `>= threshold` hit treats NaN as "not an outlier".
# NOTE(review): the one-hot columns are part of X_train_encoded, so rows that
# belong to rare categories can be removed here too — confirm this is intended.
mask = ~(z_scores >= threshold).any(axis=1)
X_train_encoded = X_train_encoded[mask]
y_train = y_train[mask]

In [None]:
def build_model():
    """Build and compile the binary classifier.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 0.001), binary cross-entropy loss and
    accuracy as the metric. The input width is read from the notebook-level
    ``X_train_encoded`` at call time.
    """
    n_features = X_train_encoded.shape[1]

    network = Sequential()
    network.add(Dense(64, activation="relu", input_shape=(n_features,)))
    network.add(Dense(32, activation="relu"))
    network.add(Dense(1, activation="sigmoid"))

    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# 5-fold stratified cross-validation of the Keras model on the training matrix.
# SciKeras takes the builder through `model=`; `build_fn=` is the deprecated
# spelling of the same argument.
estimator = KerasClassifier(model=build_model, epochs=10, batch_size=10, verbose=0)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_val_score(estimator, X_train_encoded, y_train, cv=stratified_kfold, n_jobs=2)
print(f"Cross-Validation Mean Accuracy: {cv_results.mean():.4f} ± {cv_results.std():.4f}")

In [None]:
def calculate_psi(expected, observed, buckets=10):
    """Calculate the Population Stability Index (PSI) between two samples.

    Bin edges are derived from ``expected`` with ``pd.cut`` and reused for
    ``observed``. Observed values outside those edges fall in no bin but
    still count towards the denominator.

    NOTE: a later cell in this notebook defines another ``calculate_psi`` with
    the signature ``(train_data, test_data, bins=10)``, which rebinds this name.

    Parameters
    ----------
    expected : array-like of numbers
        Reference sample that defines the bins.
    observed : array-like of numbers
        Sample compared against the reference.
    buckets : int or sequence of scalars, default 10
        Passed to ``pd.cut`` as ``bins``.

    Returns
    -------
    float
        Sum over bins of ``(obs - exp) * log(obs / exp)``, always finite.

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    # Plain arrays make pd.cut return a Categorical for list, ndarray and Series input alike.
    expected = np.asarray(expected, dtype=float)
    observed = np.asarray(observed, dtype=float)
    if expected.size == 0 or observed.size == 0:
        raise ValueError("calculate_psi requires non-empty expected and observed samples")

    expected_bins, bin_edges = pd.cut(expected, bins=buckets, include_lowest=True, retbins=True)
    observed_bins = pd.cut(observed, bins=bin_edges, include_lowest=True)

    n_bins = len(bin_edges) - 1

    def _bin_share(binned, total):
        # Code -1 marks values that fell outside every bin (or were NaN).
        codes = binned.codes
        return np.bincount(codes[codes >= 0], minlength=n_bins) / total

    expected_pct = _bin_share(expected_bins, expected.size)
    observed_pct = _bin_share(observed_bins, observed.size)

    # Without smoothing an empty bucket gives log(0) or 0/0 and the PSI becomes inf/NaN.
    epsilon = 1e-8
    psi_value = np.sum(
        (observed_pct - expected_pct)
        * np.log((observed_pct + epsilon) / (expected_pct + epsilon))
    )

    return psi_value

In [None]:
def calculate_psi(train_data, test_data, bins=10):
    """Return the Population Stability Index between two numeric samples.

    Both samples are histogrammed over ``bins`` equal-width buckets spanning
    their combined min/max. A small epsilon is added to each proportion inside
    the logarithm so empty buckets do not produce infinities.
    """
    lower = min(np.min(train_data), np.min(test_data))
    upper = max(np.max(train_data), np.max(test_data))
    edges = np.linspace(lower, upper, bins + 1)

    def _proportions(values):
        # Share of `values` that falls in each shared bucket.
        counts, _ = np.histogram(values, bins=edges)
        return counts / len(values)

    share_train = _proportions(train_data)
    share_test = _proportions(test_data)

    eps = 1e-8
    log_ratio = np.log((share_train + eps) / (share_test + eps))
    return np.sum((share_train - share_test) * log_ratio)

In [None]:
def calculate_density_stats(data):
    """Summarise a sample as ``(mean, std_dev, skewness, kurtosis)``.

    Mean and standard deviation come from NumPy; skewness and kurtosis come
    from the ``stats`` module alias in scope (``scipy.stats`` in this notebook).
    """
    summary = (
        np.mean(data),
        np.std(data),
        stats.skew(data),
        stats.kurtosis(data),
    )
    return summary

In [None]:
# Compare the train and test distribution of every model input column.
dist_analysis_dict = {}

# X_train_encoded is laid out as the scaled numeric columns followed by the
# one-hot columns, so labels are rebuilt in that order. The raw `train_columns`
# list has a different order and fewer entries than the encoded matrix.
encoded_feature_names = list(numeric_columns) + list(encoder.get_feature_names_out(categorical_columns))
if len(encoded_feature_names) != X_train_encoded.shape[1]:
    raise ValueError(
        f"Expected {X_train_encoded.shape[1]} feature names, got {len(encoded_feature_names)}"
    )

for i, feature_key in enumerate(encoded_feature_names):
    train_feature = X_train_encoded[:, i]
    test_feature = X_test_encoded[:, i]

    psi = calculate_psi(train_feature, test_feature)
    mean_train, std_dev_train, skewness_train, kurtosis_train = calculate_density_stats(train_feature)
    mean_test, std_dev_test, skewness_test, kurtosis_test = calculate_density_stats(test_feature)

    dist_analysis_dict[feature_key] = {
        'PSI': psi,
        'Train_Mean': mean_train,
        'Train_Std_Dev': std_dev_train,
        'Train_Skewness': skewness_train,
        'Train_Kurtosis': kurtosis_train,
        'Test_Mean': mean_test,
        'Test_Std_Dev': std_dev_test,
        'Test_Skewness': skewness_test,
        'Test_Kurtosis': kurtosis_test
    }

print("\nPSI Values Between Training and Test Data:")
# The loop variable must not be called `stats`: that would rebind the
# scipy.stats alias that calculate_density_stats relies on.
for feature, feature_stats in dist_analysis_dict.items():
    psi = feature_stats['PSI']
    message = "No significant shift" if psi < 0.1 else "Minor shift" if psi < 0.25 else "Significant shift"

    print(f"{feature}: PSI = {psi:.4f} -> {message}")
    print(f"  Train Stats -> Mean: {feature_stats['Train_Mean']:.4f}, Std Dev: {feature_stats['Train_Std_Dev']:.4f}, "
          f"Skewness: {feature_stats['Train_Skewness']:.4f}, Kurtosis: {feature_stats['Train_Kurtosis']:.4f}")
    print(f"  Test Stats  -> Mean: {feature_stats['Test_Mean']:.4f}, Std Dev: {feature_stats['Test_Std_Dev']:.4f}, "
          f"Skewness: {feature_stats['Test_Skewness']:.4f}, Kurtosis: {feature_stats['Test_Kurtosis']:.4f}")
    print("-" * 50)


In [None]:
# Train the final model on the filtered training matrix.
final_model = build_model()

# Halve the learning rate when val_loss has not improved for 5 epochs, down to 1e-4.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)
# validation_split=0.2 supplies the val_loss that the callback monitors.
final_model.fit(X_train_encoded, y_train, epochs=20, batch_size=132, validation_split=0.2, callbacks=[reduce_lr])

# Predict on the same matrix the model was fitted on and threshold at 0.5.
y_train_pred_prob = final_model.predict(X_train_encoded)
y_train_pred = (y_train_pred_prob > 0.5).astype(int)  

# These are training-set metrics: they are computed on X_train_encoded, not on X_test_encoded.
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

print("\nModel Evaluation on Training Set:")
print(f"Accuracy:  {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall:    {recall_train:.4f}")
print(f"F1-score:  {f1_train:.4f}")

In [None]:
# Make sure the export directory exists. exist_ok=True creates it only when
# missing, with no separate check-then-create step.
os.makedirs("../fastapi/model/", exist_ok=True)

In [None]:
mlflow.set_experiment("Loan Prediction Retraining")

# log_artifact needs files that already exist on disk, so the fitted
# preprocessors are serialised here, before the run, and logged from the
# paths they were actually written to.
artifact_dir = "../fastapi/model/"
os.makedirs(artifact_dir, exist_ok=True)
scaler_path = os.path.join(artifact_dir, "scaler.pkl")
encoder_path = os.path.join(artifact_dir, "encoder.pkl")

with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Start an MLflow run: log and register the model, then attach the
# training-set metrics and the preprocessing artifacts.
with mlflow.start_run() as run:
    mlflow.keras.log_model(final_model, "model")

    result = mlflow.register_model("runs:/{}/model".format(run.info.run_id), "LoanPredictionModel")

    mlflow.log_metric("accuracy", accuracy_train)
    mlflow.log_metric("precision", precision_train)
    mlflow.log_metric("recall", recall_train)
    mlflow.log_metric("f1_score", f1_train)
    mlflow.log_artifact(scaler_path)
    mlflow.log_artifact(encoder_path)

In [None]:
# Serialise the fitted preprocessors next to the model used by the FastAPI service.
pickle_targets = (
    (scaler, '../fastapi/model/scaler.pkl'),
    (encoder, '../fastapi/model/encoder.pkl'),
)
for fitted_object, target_path in pickle_targets:
    with open(target_path, 'wb') as handle:
        pickle.dump(fitted_object, handle)

# Save the trained Keras model in HDF5 format.
final_model.save("../fastapi/model/model.h5")

print("Model files are saved.")