## Importing Modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import os
import pickle

## Importing Dataset

In [2]:
def load_tables(directory="."):
    """Read the loan table and its column dictionary from ``directory``.

    A file is picked up by its extension (case-insensitive): ``.csv`` is read
    as the loan table and ``.xlsx`` as the column dictionary. Matching on the
    real extension keeps names such as ``loans.csv.bak`` from being parsed.
    Entries are visited in sorted order, so when several files share an
    extension the last one alphabetically wins on every run.

    Parameters
    ----------
    directory : str, default "."
        Folder to scan; the default is the current working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, details)``. Either frame is empty when no matching file
        exists, so both names are always defined for the following cells.
    """
    loans = pd.DataFrame()
    column_info = pd.DataFrame()
    for file_name in sorted(os.listdir(directory)):
        path = os.path.join(directory, file_name)
        if not os.path.isfile(path):
            # A folder whose name happens to end in .csv/.xlsx is not a table.
            continue
        suffix = os.path.splitext(file_name)[1].lower()
        if suffix == ".csv":
            loans = pd.read_csv(path)
        elif suffix == ".xlsx":
            column_info = pd.read_excel(path)
    return loans, column_info


data, details = load_tables()

## Summary On Dataset

1. data - Loan_Club_Data
2. details - Info on each Column from Loan_Club_Data

## Goal

We have to predict the interest rate, which is used for predicting the loan payment.

The following factors affect the interest rate:
1. Amount to be credited for Loan
2. Reason of Loan
3. Applicants Job
4. Applicants Job Duration
5. Term to Repay the Loan

## Extracting Required data

In [3]:
# Keep only the columns used for modelling. The explicit .copy() makes df an
# independent frame, so the fills and encodings that later cells assign into
# df are guaranteed to act on df itself and not on a slice of `data`.
df=data[['loan_amnt', 'int_rate', 'emp_title', 'emp_length', 'purpose', 'term', 'home_ownership','issue_d']].copy()

In [4]:
# (rows, columns) of the selected subset.
df.shape

(2260668, 8)

In [5]:
# Number of missing values in each column.
df.isna().sum()

loan_amnt              0
int_rate               0
emp_title         166969
emp_length        146907
purpose                0
term                   0
home_ownership         0
issue_d                0
dtype: int64

## Filling Null Values

In [6]:
# The two columns that reported missing values in the null count above.
null=['emp_title','emp_length']

In [7]:
# Impute each listed column with its most frequent value, computed on the
# full `data` table.
for col in null:
    most_common = data[col].mode()[0]
    df[col] = df[col].fillna(most_common)

In [8]:
# Re-count missing values to confirm the imputation left none behind.
df.isna().sum()

loan_amnt         0
int_rate          0
emp_title         0
emp_length        0
purpose           0
term              0
home_ownership    0
issue_d           0
dtype: int64

Now that the data is cleaned, we can move forward to encoding it and separating it into x and y.

## Encoding the data

In [28]:
# Keep only the part of issue_d after the '-' (presumably month-year labels
# such as 'Dec-2018' -> '2018'; verify against the raw file).
# Taking the LAST piece of the string form, rather than piece 1 of the raw
# value, gives the same result on the first run and keeps the cell re-runnable:
# a value that was already reduced to the year (string or int) passes through
# unchanged instead of turning into NaN.
df['issue_d']=df['issue_d'].astype(str).str.split('-').str[-1]

In [32]:
# Convert the extracted year strings to integers so issue_d is treated as numeric.
df['issue_d']=df['issue_d'].astype('int')

In [33]:
# Sort the columns into three groups by dtype and cardinality:
#   one     - text columns with at most 6 distinct values (one-hot encoded later)
#   ordinal - text columns with more than 6 distinct values (ordinal encoded later)
#   num     - every non-text column
# 'loan_status' is skipped if it is present among the text columns.
text_cols = [c for c in df.select_dtypes('object').columns if c != 'loan_status']
# Count distinct values once per column; missing values are not counted.
cardinality = {c: df[c].nunique() for c in text_cols}

one = [c for c in text_cols if cardinality[c] <= 6]
ordinal = [c for c in text_cols if cardinality[c] > 6]
num = list(df.select_dtypes(exclude='object').columns)

for label, cols in (('one hot encode : ', one), ('ordinal encode : ', ordinal)):
    print(label, cols)
    print()
print('continuos : ', num)

one hot encode :  ['term', 'home_ownership']

ordinal encode :  ['emp_title', 'emp_length', 'purpose']

continuos :  ['loan_amnt', 'int_rate', 'issue_d']


Ordinal encoding is applied to the columns that contain more than 6 unique values.

In [34]:
from sklearn.preprocessing import OrdinalEncoder
# Default encoder: categories are determined from the data during fit.
# NOTE(review): with the default settings the integer codes follow the sorted
# order of the labels, so a column with a natural order (emp_length) is
# presumably not encoded in that order - confirm and pass `categories=` if so.
oe=OrdinalEncoder()
# Fit on the high-cardinality text columns and overwrite them with their codes.
df[ordinal]=oe.fit_transform(df[ordinal])

One-hot encoding is applied to the columns that contain 6 or fewer unique values.

In [35]:
# Build a one-hot encoded version of df: each column listed in `one` is
# replaced by one indicator column per category; other columns pass through.
temp=pd.get_dummies(df, columns = one)

In [36]:
# Remove the original text columns whose one-hot encoded form lives in `temp`.
# Rebinding with errors='ignore' instead of an in-place drop keeps the cell
# re-runnable: once the columns are gone, a second run is a no-op rather than
# a KeyError.
df=df.drop(columns=one,errors='ignore')

In [37]:
# Copy every column of temp into df: this adds the indicator columns and
# rewrites the columns the two frames already have in common.
df[temp.columns]=temp

In [38]:
# (rows, columns) after encoding.
df.shape

(2260668, 14)

## Separating the Data into X and Y

In [39]:
# Feature matrix: every column of df except the target, in the original order.
feature_cols=[c for c in df.columns if c!='int_rate']
x=df.loc[:,feature_cols]

In [40]:
# Target vector: the interest rate column.
y=df['int_rate']

In [41]:
# Display the feature matrix (the rendered output is truncated for a frame this large).
x

Unnamed: 0,loan_amnt,emp_title,emp_length,purpose,issue_d,term_ 36 months,term_ 60 months,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT
0,2500,72085.0,1.0,2.0,2018,1,0,0,0,0,0,0,1
1,30000,299234.0,1.0,2.0,2018,0,1,0,1,0,0,0,0
2,5000,15072.0,6.0,2.0,2018,1,0,0,1,0,0,0,0
3,4000,195310.0,1.0,2.0,2018,1,0,0,1,0,0,0,0
4,30000,249161.0,1.0,2.0,2018,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260663,12000,474556.0,1.0,2.0,2017,0,1,0,1,0,0,0,0
2260664,12000,371350.0,10.0,2.0,2017,0,1,0,1,0,0,0,0
2260665,10000,406222.0,1.0,2.0,2017,1,0,0,0,0,0,1,0
2260666,12000,406222.0,1.0,2.0,2017,0,1,0,0,0,0,0,1


In [63]:
# Mean interest rate over all rows.
y.mean()

13.09291294431558

## Splitting data into Training and Testing

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=68)

## Building the Base Model

Goal of this model is to predict the Interest Rate after providing it specific data

In [45]:
from sklearn.linear_model import LinearRegression

In [46]:
# Baseline model: linear regression with default settings.
lr=LinearRegression()

In [47]:
# Fit the baseline on the training split only.
lr.fit(xtrain,ytrain)

In [48]:
# Baseline predictions for the held-out rows.
ytest_pred=lr.predict(xtest)

In [49]:
# Baseline predictions for the training rows, to compare against the test score.
ytrain_pred=lr.predict(xtrain)

## Accuracy

In [50]:
from sklearn.metrics import r2_score

In [51]:
# R² of the baseline on the test split.
r2_score(ytest,ytest_pred)

0.16943429594778103

In [52]:
# R² of the baseline on the training split.
r2_score(ytrain,ytrain_pred)

0.16965852357557842

Accuracy (R² score) is too low. To improve it, we have to try different ML algorithms.

### XGBoost

In [56]:
from xgboost import XGBRegressor

In [57]:
# Gradient-boosted trees with default hyperparameters, trained with the
# 'gpu_hist' tree method.
# NOTE(review): 'gpu_hist' presumably requires a CUDA-capable GPU and is
# deprecated in newer XGBoost releases in favour of tree_method='hist' with
# device='cuda' - confirm against the installed version.
xg = XGBRegressor(tree_method='gpu_hist')

In [58]:
# Fit the boosted model on the training split only.
xg.fit(xtrain, ytrain)

In [59]:
# XGBoost predictions for the held-out rows (replaces the baseline's ytest_pred).
ytest_pred=xg.predict(xtest)

In [60]:
# R² of the XGBoost model on the test split.
r2_score(ytest,ytest_pred)

0.2653662538417406

In [61]:
# XGBoost predictions for the training rows (replaces the baseline's ytrain_pred).
ytrain_pred=xg.predict(xtrain)

In [62]:
# R² of the XGBoost model on the training split.
r2_score(ytrain,ytrain_pred)

0.2704561888839666

### SVM Regressor

In [74]:
from sklearn.svm import SVR

In [75]:
# Support vector regressor with default settings.
# NOTE(review): the fit cell below has no execution count, so it apparently
# never completed; SVR training presumably scales worse than quadratically
# with the number of rows, so consider a subsample or a linear SVR - confirm.
svm=SVR()

In [None]:
# Fit the SVR on the training split only.
svm.fit(xtrain, ytrain)

In [None]:
# SVR predictions for the held-out rows (replaces the previous ytest_pred).
ytest_pred=svm.predict(xtest)

In [None]:
# R² of the SVR on the test split.
r2_score(ytest,ytest_pred)

We can conclude that there isn't enough data: the selected features explain only about 17–27% of the variance in the interest rate.