We start by importing essential libraries for data manipulation, visualization, and machine learning.



# Regression Tasks
**Goal: "Predict loan amounts based on borrower details"**

**Use Case**: Helps financial institutions determine optimal loan pricing and lending limits.

**Target Variable:**
loan_amnt (Loan Amount).

**Models**:
Linear Regression, Decision Trees, Random Forests, XGBoost, or Neural Networks.


**Import Required Libraries **

In [2]:
# we stat by importing essential for data manipulation, visualization, and machine learning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


*load the dataset*

In [3]:
loan_data = pd.read_csv("/content/loan_data.csv")
loan_data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [4]:
loan_data.shape

(45000, 14)

*Data Preprocessing*

In [5]:
loan_data.columns.value_counts()

Unnamed: 0,count
person_age,1
person_gender,1
person_education,1
person_income,1
person_emp_exp,1
person_home_ownership,1
loan_amnt,1
loan_intent,1
loan_int_rate,1
loan_percent_income,1


In [6]:
loan_data.isnull().sum()

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


In [7]:
#statistical measures
loan_data.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,80422.5,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47204.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67048.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95789.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,7200766.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


In [8]:
# Import pandas
import pandas as pd

# Identify categorical variables
categorical_variables = loan_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Identify numerical variables
numerical_variables = loan_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical Variables:", categorical_variables)
print("Numerical Variables:", numerical_variables)


Categorical Variables: ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
Numerical Variables: ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'loan_status']


In [9]:
# Encode Categorical Columns (if not already encoded).

# convert categorical data into Numerical data before scaling
loan_data.replace({
                    'person_gender':{'female':0, 'male':1},
                    'person_education':{'High School':0, 'Bachelor':1,'Associate':2,'Master':3, 'Doctorate':4},
                    'person_home_ownership':{'RENT':0, 'MORTGAGE': 1, 'OWN':2, 'OTHER':3},
                    'loan_intent':{'No':0, 'Yes':1},
                    'previous_loan_defaults_on_file': {'No':0, 'Yes':1},
                    'loan_intent': {'EDUCATION':0,'MEDICAL':1, 'VENTURE':2, 'PERSONAL':3, 'DEBTCONSOLIDATION': 4,
                                        'HOMEIMPROVEMENT': 5 }},inplace=True)


  loan_data.replace({


In [10]:
 loan_data['loan_intent'].value_counts()

Unnamed: 0_level_0,count
loan_intent,Unnamed: 1_level_1
0,9153
1,8548
2,7819
3,7552
4,7145
5,4783


In [11]:
loan_data['person_education'].value_counts()

Unnamed: 0_level_0,count
person_education,Unnamed: 1_level_1
1,13399
2,12028
0,11972
3,6980
4,621


In [12]:
# split Data into Features (X) and Target (y).

# Define the target variable (predict loan_int_rate )

target_column = 'loan_amnt'  # Target  # Change to 'loan_amnt' to predict loan amounts

feature_columns = ['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp',
                   'person_home_ownership', 'loan_intent','loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
                   'credit_score', 'previous_loan_defaults_on_file']  # feature

X = loan_data[feature_columns]# Features
y = loan_data[target_column]  # Target

print("feature variables:" , X.shape)
print("Target variables: ", y.shape)

# Scale the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

feature variables: (45000, 12)
Target variables:  (45000,)


In [13]:
# Standardize Numerical Data.

# Convert back to DataFrame
X = pd.DataFrame(X_scaled, columns=X.columns)
X

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,-0.953538,-1.110069,1.533652,-0.104090,-0.892284,-0.875636,0.480527,1.683039,4.016394,-0.739109,-1.419814,-1.016040
1,-1.118963,-1.110069,-1.259660,-0.846005,-0.892284,2.309524,-1.338478,0.044782,-0.684829,-0.996863,-2.549975,0.984213
2,-0.457264,-1.110069,-1.259660,-0.844065,-0.397517,0.716944,-0.732143,0.625557,3.443074,-0.739109,0.047412,-1.016040
3,-0.788113,-1.110069,-0.328556,-0.007039,-0.892284,-0.875636,-0.732143,1.417829,3.443074,-0.996863,0.840507,-1.016040
4,-0.622689,0.900845,1.533652,-0.176371,-0.727362,-0.875636,-0.732143,1.095549,4.475050,-0.481354,-0.924130,-1.016040
...,...,...,...,...,...,...,...,...,...,...,...,...
44995,-0.126414,0.900845,0.602548,-0.402231,0.097249,-0.875636,-0.732143,1.562184,1.952442,-0.739109,0.245686,-1.016040
44996,1.527834,-1.110069,0.602548,-0.180537,1.911393,-0.875636,1.693197,1.028407,0.003155,1.322928,-0.230171,-1.016040
44997,0.866135,0.900845,0.602548,-0.290681,0.262171,-0.875636,1.086862,-0.331212,-1.028821,1.065174,0.701716,-1.016040
44998,0.204436,0.900845,-0.328556,-0.586348,-0.232595,-0.875636,-1.338478,0.746412,2.525762,0.034155,-0.567237,-1.016040


In [14]:
print("Feature variable shape:" ,  X.shape)
print("Target variable shape:" , y.shape)

Feature variable shape: (45000, 12)
Target variable shape: (45000,)


*Train-Test Split*

Split the data into training and testing sets to evaluate model performance.


In [15]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)


Training Features Shape: (36000, 12)
Testing Features Shape: (9000, 12)


*Build the Regression Model*

Use Linear Regression as the baseline model.

In [16]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test)
print(y_pred)

[ 6105.35636106  9400.77027172  7717.59686002 ... 10326.44799947
 11604.51304414 12255.81555825]


*Model Evaluation*

In [17]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print("...............")
print(" In this regression model can not achive better accurecy so, Try to other regression model ")


Model Performance:
Mean Absolute Error (MAE): 2902.9273940110784
Root Mean Squared Error (RMSE): 4114.000970068255
R-squared (R²): 0.5795138750308679
...............
 In this regression model can not achive better accurecy so, Try to other regression model 


**Experiment with Other Regression Models**

You can experiment with advanced models to improve performance:

1. Decision Tree Regressor
2. Random Forest Regressor
3. XGBoost
4. Support Vector Regressor (SVR)

Example: *Random Forest Regressor*

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on training set
y_train_pred_rf = rf_model.predict(X_train)

# Predict on test set
y_test_pred_rf = rf_model.predict(X_test)

# Evaluate on training set
mae_train_rf = mean_absolute_error(y_train, y_train_pred_rf)
rmse_train_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
r2_train_rf = r2_score(y_train, y_train_pred_rf)

print("Random Forest Training Performance:")
print(f"Mean Absolute Error (MAE): {mae_train_rf}")
print(f"Root Mean Squared Error (RMSE): {rmse_train_rf}")
print(f"R-squared (R²): {r2_train_rf}")

# Evaluate on testing set
mae_test_rf = mean_absolute_error(y_test, y_test_pred_rf)
rmse_test_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
r2_test_rf = r2_score(y_test, y_test_pred_rf)

print("\nRandom Forest Testing Performance:")
print(f"Mean Absolute Error (MAE): {mae_test_rf}")
print(f"Root Mean Squared Error (RMSE): {rmse_test_rf}")
print(f"R-squared (R²): {r2_test_rf}")


Random Forest Training Performance:
Mean Absolute Error (MAE): 54.02945583333333
Root Mean Squared Error (RMSE): 108.0984992526492
R-squared (R²): 0.9997062770959296

Random Forest Testing Performance:
Mean Absolute Error (MAE): 144.84298222222225
Root Mean Squared Error (RMSE): 267.4076074482549
R-squared (R²): 0.9982234787497727


In [19]:
results = pd.DataFrame({
    'Actual': y_test.values.flatten(), # Use .values to access the NumPy array
    'Predicted': y_pred.flatten()
})

print(results.head(5))
print(results.tail(5))

    Actual     Predicted
0   7500.0   6105.356361
1   9000.0   9400.770272
2   5000.0   7717.596860
3   5000.0  10215.143215
4  10000.0  10485.263776
       Actual     Predicted
8995  14000.0   9400.505556
8996   6725.0   7910.919978
8997  14000.0  10326.447999
8998   8000.0  11604.513044
8999   8500.0  12255.815558


In [20]:
print(feature_columns)

['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp', 'person_home_ownership', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file']


*Hear the example of the Predict the Loan_amount based on this feature variables,*

In [21]:
# Example input features
input_data = {
    'person_age': 35,
    'person_gender': 1,  # 1 for Male
    'person_education': 1,  # 1 for Bachelor
    'person_income': 50000,
    'person_emp_exp': 10,
    'person_home_ownership': 0,  # 0 for RENT
    'loan_intent': 3,  # 3 for PERSONAL
    'loan_int_rate': 12.5,
    'loan_percent_income': 0.2,
    'cb_person_cred_hist_length': 4,
    'credit_score': 750,
    'previous_loan_defaults_on_file': 0,  # 0 for No
}


In [22]:
import numpy as np

# Convert input data to a format compatible with the model
input_features = np.array([[22, 0, 3, 71948.0, 0, 0, 3, 16.02, 0.49, 3.0, 561, 0]])

# Predict loan amount
predicted_loan_amount = rf_model.predict(input_features)

print(f"Predicted Loan Amount: {predicted_loan_amount[0]:.2f}")


Predicted Loan Amount: 33492.23




*Example: XGBoost*

In [34]:
# Import required libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.svm import SVR

# Split the scaled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2, y_pred


# Train Decision Tree Regressor     =========================
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
dt_pred = dt_model.predict(X_test)



# Train and Evaluate XGBoost       =========================
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_mae, xgb_rmse, xgb_r2, xgb_pred = evaluate_model(xgb_model, X_test, y_test)


# Train and Evaluate Support Vector Regressor        =========================
svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_model.fit(X_train, y_train)
svr_mae, svr_rmse, svr_r2, svr_pred = evaluate_model(svr_model, X_test, y_test)



# Evaluate Decision Tree model
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))
dt_r2 = r2_score(y_test, dt_pred)

# Print the metrics
print("Decision Tree Performance:")
print(f"Mean Absolute Error (MAE): {dt_mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {dt_rmse:.2f}")
print(f"R-squared (R²): {dt_r2:.2f}")


# Print Model Performances
print("XGBoost Performance:")
print(f"MAE: {xgb_mae:.4f}, RMSE: {xgb_rmse:.4f}, R²: {xgb_r2:.4f}")

print("\nSVR Performance:")
print(f"MAE: {svr_mae:.4f}, RMSE: {svr_rmse:.4f}, R²: {svr_r2:.4f}")


Decision Tree Performance:
Mean Absolute Error (MAE): 171.45
Root Mean Squared Error (RMSE): 378.03
R-squared (R²): 1.00
XGBoost Performance:
MAE: 214.7568, RMSE: 368.0443, R²: 0.9966

SVR Performance:
MAE: 1242.6659, RMSE: 2297.9120, R²: 0.8688


In [40]:
import pandas as pd

# Create a function to compare actual and predicted values
def compare_actual_predicted(y_test, y_pred):
    comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    return comparison_df


# Evaluate XGBoost
xgb_comparison = compare_actual_predicted(y_test.to_numpy(), xgb_pred)

# Evaluate SVR
svr_comparison = compare_actual_predicted(y_test.to_numpy(), svr_pred)

# Compare actual vs. predicted for Decision Tree
dt_comparison = compare_actual_predicted(y_test.to_numpy(), dt_pred)

# Display sample results for Decision Tree
print("Decision Tree Predictions:")
print(dt_comparison.head(10))  # Display first 10 rows

print("----------------------------")

print("Random Forest Regressor: ")
results = pd.DataFrame({
    'Actual': y_test.values.flatten(), # Use .values to access the NumPy array
    'Predicted': y_pred.flatten()
})

print(results.head(10))

print("-----------------------------")

# Display sample results for XGBoost
print("XGBoost Predictions:")
print(xgb_comparison.head(10))  # Display first 10 rows

print("-----------------------------")

# Display sample results for SVR
print("\nSVR Predictions:")
print(svr_comparison.head(10))  # Display first 10 rows


Decision Tree Predictions:
    Actual  Predicted
0   7500.0     7500.0
1   9000.0     9000.0
2   5000.0     5000.0
3   5000.0     4987.0
4  10000.0     9715.0
5   4500.0     4195.0
6  12000.0    12000.0
7   2800.0     2911.0
8  12000.0    12000.0
9  12000.0    12000.0
----------------------------
Random Forest Regressor: 
    Actual     Predicted
0   7500.0   6105.356361
1   9000.0   9400.770272
2   5000.0   7717.596860
3   5000.0  10215.143215
4  10000.0  10485.263776
5   4500.0   9697.698288
6  12000.0   9383.370045
7   2800.0   4742.792620
8  12000.0  10246.045977
9  12000.0  12155.112119
-----------------------------
XGBoost Predictions:
    Actual     Predicted
0   7500.0   7982.260742
1   9000.0   9257.614258
2   5000.0   4944.880859
3   5000.0   5087.195312
4  10000.0  10277.787109
5   4500.0   4667.875488
6  12000.0  12199.706055
7   2800.0   2797.085449
8  12000.0  12260.078125
9  12000.0  12091.782227
-----------------------------

SVR Predictions:
    Actual     Predicted
0 