In [1]:
# importing all the models and libs required for the preicidation
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [2]:
# view of the data set 
df = pd.read_csv("loan_data.csv")
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [3]:
df.shape


(45000, 14)

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [5]:
df.isnull().sum()


person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [6]:
# Basic data cleaning 
#Drop classification column (not required for regression)
df = df.drop(columns=['loan_status'])

# Fill missing numeric values with median
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill missing categorical values with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

df.isnull().sum()


person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
dtype: int64

In [7]:
X = df.drop(columns=['loan_amnt'])  # Features
y = df['loan_amnt']                # Target

In [8]:
# creating pipelines for transformer 
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())]) # numeric transformer

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))]) # categorical transformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [9]:
# splitting of the data for model traning and evalution 
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


In [10]:
# dictionry creation for all the model making code for implemantation
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42) }


In [11]:
results = []# for storing the  results
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model) ]) # pipeline for the model 
    
    pipeline.fit(X_train, y_train) # fitting of test data
    y_pred = pipeline.predict(X_test) # pipeline for prediction
    
    r2 = r2_score(y_test, y_pred) # R2 score 
    mae = mean_absolute_error(y_test, y_pred) #mae error 
    rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # rsme score
    
    results.append([name, r2, mae, rmse])

results_df = pd.DataFrame(
    results, columns=['Model', 'R2 Score', 'MAE', 'RMSE'])
results_df.sort_values(by='R2 Score', ascending=False)


Unnamed: 0,Model,R2 Score,MAE,RMSE
4,Random Forest,0.99824,145.209102,266.192709
3,Decision Tree,0.996415,174.588,379.845851
5,Gradient Boosting,0.994109,331.146914,486.938781
2,Lasso Regression,0.59198,2871.984678,4052.556286
0,Linear Regression,0.591979,2871.991942,4052.563378
1,Ridge Regression,0.591978,2871.999178,4052.567834


In [12]:
best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=100, random_state=42, n_jobs=-1))
])

best_model.fit(X_train, y_train)


In [13]:
# Sample real-time input
sample_input = pd.DataFrame([{
    'person_age': 30,
    'person_gender': 'male',
    'person_education': 'Bachelor',
    'person_income': 75000,
    'person_emp_exp': 5,
    'person_home_ownership': 'RENT',
    'loan_intent': 'EDUCATION',
    'loan_int_rate': 12.5,
    'loan_percent_income': 0.25,
    'cb_person_cred_hist_length': 6,
    'credit_score': 690,
    'previous_loan_defaults_on_file': 'No'
}])

prediction = best_model.predict(sample_input)
print("Predicted Loan Approval Amount:", round(prediction[0], 2))


Predicted Loan Approval Amount: 18393.38
