Machine learning model to predict the loan amount after assessing over multiple parameters using multiple linear regression.

Here we import our modules needed and import the main data set

In [6]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import sys

train_data=pd.read_csv('train.csv')

Engineering on the data set removing the uncessesary parameters and filling the null values, minimising the amount of sample rows removed

In [7]:
#clearing unecessary columns
train_data.drop(['Gender','Married','Self_Employed','Loan_Status','Loan_ID'],inplace=True,axis=1)


#fixing data set
#null values
mean_loan=round(train_data["LoanAmount"].mean())
mode_term=round(train_data["Loan_Amount_Term"].mode())
train_data["LoanAmount"].fillna(0,inplace=True)
train_data["Loan_Amount_Term"].fillna(mode_term,inplace=True)

train_data.loc[train_data['ApplicantIncome']>=30000,"ApplicantIncome"]=None
train_data.loc[train_data['CoapplicantIncome']>=15000,"CoapplicantIncome"]=None



#non numeric values
train_data.loc[train_data['Education']=="Graduate","Education"]=1
train_data.loc[train_data['Education']=="Not Graduate","Education"]=0

train_data.loc[train_data['Property_Area']=="Urban","Property_Area"]=1
train_data.loc[train_data['Property_Area']=="Semiurban","Property_Area"]=0.5
train_data.loc[train_data['Property_Area']=="Rural","Property_Area"]=0

train_data.loc[train_data['Dependents']=="3+","Dependents"]=3

train_data.dropna(inplace=True)
train_data.reset_index(drop=True,inplace=True)

#scaling
mean_app = round(train_data["ApplicantIncome"].mean())
diff_app = train_data["ApplicantIncome"].max()-train_data["ApplicantIncome"].min()
mean_co = round(train_data["CoapplicantIncome"].mean())
diff_co = train_data["CoapplicantIncome"].max()-train_data["CoapplicantIncome"].min()
mean_lat = round(train_data["Loan_Amount_Term"].mean())
diff_lat = train_data["Loan_Amount_Term"].max()-train_data["Loan_Amount_Term"].min()

for i in range(0,len(train_data)):
    
    
    not_food=train_data.loc[i,"CoapplicantIncome"]
    train_data.loc[i,"CoapplicantIncome"]=(not_food-mean_co)/diff_co
    
    food=train_data.loc[i,"ApplicantIncome"]
    train_data.loc[i,"ApplicantIncome"]=(food-mean_app)/diff_app
    
    too_food=train_data.loc[i,"Loan_Amount_Term"]
    train_data.loc[i,"Loan_Amount_Term"]=(too_food-mean_lat)/diff_lat



train_data = train_data.astype({"Education":'int64', "LoanAmount":'int64', "Credit_History":'int64',
                                "Dependents":'int64'}) 


print(train_data.to_string())

     Dependents  Education  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History Property_Area
0             0          1         0.038304          -0.125929           0          0.040541               1             1
1             1          1        -0.015220           0.007522         128          0.040541               1             0
2             0          1        -0.082146          -0.125929          66          0.040541               1             1
3             0          0        -0.099776           0.082743         120          0.040541               1             1
4             0          1         0.044688          -0.125929         141          0.040541               1             1
5             2          1         0.020040           0.245398         267          0.040541               1             1
6             0          0        -0.110345           0.008230          95          0.040541               1             1
7             3 

The final dataset is converted into numpy array to compute further and also declaring our initial weights and biases

In [21]:
np.set_printoptions(threshold=sys.maxsize)

df=train_data.drop(['LoanAmount'],axis=1)
inp=df.to_numpy()

out=train_data["LoanAmount"].to_numpy()
weight=np.array([1,1,10,10,10,1,1])
bias=20
alpha=0.5

A function to calculate cost function, which is used to assessed how accurate the model is. the nearer it is to 0 the better our model is predictin.



In [22]:
def compute_cost():
    counter=0
    error=0
    while counter<len(inp):
        error= error + ((np.dot(inp[counter],weight)+bias-out[counter]))**2
        counter=counter+1
    return error/len(out)
print(compute_cost())

19372.225508285115


Here is the code for the gradient decent on every iteration simultaneously updating the values of the weights and biases

In [23]:
error_list=[]

for epoch in range(0,500):
    
    error_list.append(compute_cost())
    print(compute_cost())
    dj_db=0
    op=0
    for op in range(0,len(out)):
        dj_db=dj_db+(((np.dot(inp[op],weight))+bias)-out[op])
    dj_db=dj_db/len(out)
    temp_b=bias-(alpha*dj_db)
    
    
    
    l=[]
    dj=[]
    k=0
    while k<len(weight):
        dj_dw=0
        for m in range(0,len(out)):
            dj_dw += inp[m,k]*(np.dot(inp[m],weight)+bias-out[m])
        dj_dw /= len(out)
        dj.append(dj_dw)
        temp_wk=weight[k]-(alpha*dj_dw)
        l.append(temp_wk)
        k=k+1
    
    temp_w=np.array(l)
    weight=temp_w
    bias=temp_b
    
print("weight=",weight)
print("bias=",bias)

19372.225508285115
13629.794527742637
10360.229867801761
8469.775908651753
7359.826044733896
6696.000409346138
6289.017865585461
6030.812647482916
5859.338141518833
5738.826561134484
5648.5905629553545
5576.6251639125085
5515.9359783614855
5462.426086455748
5413.677595923322
5368.248079026876
5325.263788501902
5284.184131257155
5244.665054970618
5206.4796122917605
5169.471609731028
5133.528425069694
5098.564950620803
5064.514010929537
5031.3205622706055
4998.938113121839
4967.326459117031
4936.450204517144
4906.277761446724
4876.780645267773
4847.932958343498
4819.710997513497
4792.092945822429
4765.058623908624
4738.589285300593
4712.667445196106
4687.276735564459
4662.401781460443
4638.028094760809
4614.141982416576
4590.7304669264095
4567.781217177112
4545.282488126881
4523.22306806155
4501.592232355907
4480.379702836166
4459.57561197476
4439.1704712609235
4419.155143185671
4399.520816359736
4380.258983351096
4361.361420887056
4342.820172115302
4324.627530660693
4306.776026251196
42

3330.209010225077
3330.1932755124635
3330.1778159912105
3330.1626268467685
3330.1477033488522
3330.1330408499743
3330.1186347839803
3330.1044806646396
3330.0905740842277
3330.0769107121796
3330.0634862937027
3330.0502966484805
3330.0373376693533
3330.02460532104
3330.012095638885
3329.999804727613
3329.9877287601166
3329.9758639762617
3329.96420668173
3329.952753246841
3329.941500105443
3329.930443753784
3329.9195807494284
3329.908907710189
3329.898421313045
3329.888118293155
3329.8779954427855
3329.868049610345
3329.858277699387
3329.8486766676597
3329.839243526124
3329.829975338074
3329.8208692181597
3329.811922331539
3329.8031318929766
3329.794495165955
3329.786009461859
3329.7776721391115
3329.76948060234
3329.761432301604
3329.7535247315723
3329.7457554307575
3329.7381219807203
3329.7306220053592
3329.7232531701366
3329.716013181362
3329.7088997854726
3329.701910768331
3329.6950439545576
3329.688297206808
3329.6816684251603
weight= [  6.59932421  10.73151361 325.30088083 198.85565

After our model has been trained we now import our test dataset to confirm the validity of our dataset

In [467]:
test_data=pd.read_csv('test.csv')


#clearing unecessary columns
test_data.drop(['Gender','Married','Self_Employed','Loan_ID'],inplace=True,axis=1)


#fixing data set
#null values
mean_loan=round(test_data["LoanAmount"].mean())
mode_term=round(test_data["Loan_Amount_Term"].mode())
test_data["LoanAmount"].fillna(0,inplace=True)
test_data["Loan_Amount_Term"].fillna(mode_term,inplace=True)

#non numeric values
test_data.loc[test_data['Education']=="Graduate","Education"]=1
test_data.loc[test_data['Education']=="Not Graduate","Education"]=0

test_data.loc[test_data['Property_Area']=="Urban","Property_Area"]=1
test_data.loc[test_data['Property_Area']=="Semiurban","Property_Area"]=0.5
test_data.loc[test_data['Property_Area']=="Rural","Property_Area"]=0

test_data.loc[test_data['Dependents']=="3+","Dependents"]=3

test_data.dropna(inplace=True)
test_data.reset_index(drop=True,inplace=True)

#scaling
mean_app = round(test_data["ApplicantIncome"].mean())
diff_app = test_data["ApplicantIncome"].max()-test_data["ApplicantIncome"].min()
mean_co = round(test_data["CoapplicantIncome"].mean())
diff_co = test_data["CoapplicantIncome"].max()-test_data["CoapplicantIncome"].min()
mean_lat = round(test_data["Loan_Amount_Term"].mean())
diff_lat = test_data["Loan_Amount_Term"].max()-test_data["Loan_Amount_Term"].min()

for i in range(0,len(test_data)):
    
    
    not_food=test_data.loc[i,"CoapplicantIncome"]
    test_data.loc[i,"CoapplicantIncome"]=(not_food-mean_co)/diff_co
    
    food=test_data.loc[i,"ApplicantIncome"]
    test_data.loc[i,"ApplicantIncome"]=(food-mean_app)/diff_app
    
    too_food=test_data.loc[i,"Loan_Amount_Term"]
    test_data.loc[i,"Loan_Amount_Term"]=(too_food-mean_lat)/diff_lat



test_data = test_data.astype({"Education":'int64', "LoanAmount":'int64', "Credit_History":'int64',
                                "Dependents":'int64'}) 


print(test_data.to_string())

     Dependents  Education  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History Property_Area
0             0          1         0.013250          -0.063833         110          0.035865               1             1
1             1          1        -0.023205          -0.001333         126          0.035865               1             1
2             2          1         0.003323           0.011167         208          0.035865               1             1
3             0          0        -0.020447          -0.063833          78          0.035865               1             1
4             0          0        -0.035765           0.078750         152          0.035865               1             1
5             1          0        -0.034924          -0.063833          59          0.035865               1           0.5
6             2          0        -0.012106          -0.063833         147          0.035865               0             0
7             2 

converting our test dataset into numpy array.

In [457]:
df2=test_data.drop(['LoanAmount'],axis=1)
input_test=df2.to_numpy()

output_test=test_data["LoanAmount"].tolist()


Predicting values on our test data.

In [468]:
pred_val=[]
for i in range(0,len(test_data)):
    fwb=round(np.dot(input_test[i],weight)+round(bias))
    pred_val.append(fwb)
    print("predicted=",fwb,"actual=",output_test[i])

predicted= 123 actual= 110
predicted= 130 actual= 126
predicted= 148 actual= 208
predicted= 101 actual= 78
predicted= 124 actual= 152
predicted= 109 actual= 59
predicted= 128 actual= 147
predicted= 160 actual= 280
predicted= 123 actual= 123
predicted= 100 actual= 90
predicted= 132 actual= 162
predicted= 170 actual= 166
predicted= 124 actual= 124
predicted= 129 actual= 131
predicted= 155 actual= 200
predicted= 142 actual= 126
predicted= 204 actual= 300
predicted= 120 actual= 100
predicted= 115 actual= 48
predicted= 98 actual= 28
predicted= 115 actual= 101
predicted= 103 actual= 125
predicted= 158 actual= 290
predicted= 307 actual= 148
predicted= 162 actual= 275
predicted= 113 actual= 125
predicted= 103 actual= 75
predicted= 145 actual= 192
predicted= 131 actual= 152
predicted= 132 actual= 158
predicted= 122 actual= 101
predicted= 117 actual= 176
predicted= 159 actual= 185
predicted= 130 actual= 90
predicted= 123 actual= 116
predicted= 130 actual= 138
predicted= 129 actual= 100
predicted

Using sklearn to find the f1 score of our model

End of code