In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor



sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", None)
pd.option_context('mode.use_inf_as_na', True)

<pandas._config.config.option_context at 0x150ad2850>

In [41]:
df = pd.read_csv('Loan.csv')
df.drop('ApplicationDate', inplace=True, axis=1)

In [42]:
df.columns

Index(['Age', 'AnnualIncome', 'CreditScore', 'EmploymentStatus',
       'EducationLevel', 'Experience', 'LoanAmount', 'LoanDuration',
       'MaritalStatus', 'NumberOfDependents', 'HomeOwnershipStatus',
       'MonthlyDebtPayments', 'CreditCardUtilizationRate',
       'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
       'DebtToIncomeRatio', 'BankruptcyHistory', 'LoanPurpose',
       'PreviousLoanDefaults', 'PaymentHistory', 'LengthOfCreditHistory',
       'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets',
       'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory',
       'JobTenure', 'NetWorth', 'BaseInterestRate', 'InterestRate',
       'MonthlyLoanPayment', 'TotalDebtToIncomeRatio', 'LoanApproved',
       'RiskScore'],
      dtype='object')

In [43]:
df.drop('LoanApproved',axis=1, inplace=True)

In [44]:
num_cols = df.select_dtypes(include=['int', 'float']).columns.difference(['RiskScore'])
cat_cols = df.select_dtypes(include='object').columns

In [45]:
for col in cat_cols:
    print(df[col].value_counts())
    print('-'*10)

EmploymentStatus
Employed         17036
Self-Employed     1573
Unemployed        1391
Name: count, dtype: int64
----------
EducationLevel
Bachelor       6054
High School    5908
Associate      4034
Master         3050
Doctorate       954
Name: count, dtype: int64
----------
MaritalStatus
Married     10041
Single       6078
Divorced     2882
Widowed       999
Name: count, dtype: int64
----------
HomeOwnershipStatus
Mortgage    7939
Rent        6087
Own         3938
Other       2036
Name: count, dtype: int64
----------
LoanPurpose
Home                  5925
Debt Consolidation    5027
Auto                  4034
Education             3008
Other                 2006
Name: count, dtype: int64
----------


In [46]:
Q1 = df[num_cols].quantile(0.15)
Q3 = df[num_cols].quantile(0.85)
IQR = Q3 - Q1

for col in num_cols:
    df = df[~((df[col] < (Q1[col] - 1.5 * IQR[col])) | (df[col] > (Q3[col] + 1.5 * IQR[col])))]

In [47]:
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) 
])

In [48]:
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) 
])

In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

In [50]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  
    ('pca', PCA(n_components=15))   
])

In [51]:
X = df.drop('RiskScore', axis=1)
y = df['RiskScore'] 

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
pipeline.fit(X_train)

In [54]:
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print(X_train_transformed.shape) 

(11366, 15)


In [55]:
voting_regressor = VotingRegressor(estimators=[
    ('rf', RandomForestRegressor(random_state=42)),
    ('svr', SVR()),
    ('xgb', XGBRegressor(random_state=42)),
    ('hgb', HistGradientBoostingRegressor(random_state=42))
])

In [56]:
voting_regressor.fit(X_train_transformed, y_train)
y_pred_voting = voting_regressor.predict(X_test_transformed)
mse_voting = mean_squared_error(y_test, y_pred_voting)

print(f"Voting Regressor MSE: {mse_voting}")

Voting Regressor MSE: 10.801717141417175


In [57]:
import joblib

joblib.dump(pipeline, 'preprocessing_pipeline.pkl')

joblib.dump(voting_regressor, 'voting_regressor_model.pkl')

['voting_regressor_model.pkl']