In [33]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import joblib


In [34]:
df = pd.read_csv("../data/loan_data.csv")
df.head()


Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected
3,I need funds to buy new furniture and applianc...,190363,729,118757,10.22,unemployed,Rejected
4,I need a loan to start a small business.,61853,732,19210,44.13,employed,Approved


In [35]:
# Drop columns not used for loan amount prediction
for col in ["Text", "Approval"]:
    if col in df.columns:
        df = df.drop(col, axis=1)


In [36]:
if "Text" in df.columns:
    df = df.drop("Text", axis=1)


In [37]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


C:\Users\Dell\AppData\Local\Temp\ipykernel_23264\1923754004.py:3: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df[col].fillna(df[col].median(), inplace=True)
C:\Users\Dell\AppData\Local\Temp\ipykernel_23264\1923754004.py:3: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment usi

In [38]:
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


In [39]:
X = df.drop("Loan_Amount", axis=1)
y = df["Loan_Amount"]


In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [41]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)


In [42]:
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)

print("Linear Regression MAE:", mean_absolute_error(y_test, lr_pred))
print("Linear Regression R2:", r2_score(y_test, lr_pred))


Linear Regression MAE: 22041.968715809064
Linear Regression R2: 0.33646615801444835


In [43]:
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

print("Random Forest MAE:", mean_absolute_error(y_test, rf_pred))
print("Random Forest R2:", r2_score(y_test, rf_pred))


Random Forest MAE: 22729.332035416664
Random Forest R2: 0.27159191891238177


In [44]:
joblib.dump(rf, "../models/loan_amount_regressor.pkl")
joblib.dump(scaler, "../models/loan_amount_scaler.pkl")

print("Loan amount regression model saved.")


Loan amount regression model saved.


In [45]:
sample = X.iloc[[0]]
sample_scaled = scaler.transform(sample)

pred_amount = rf.predict(sample_scaled)
print("Predicted Loan Amount:", int(pred_amount[0]))


Predicted Loan Amount: 9966
