### 1.Load data files(train_u6lujuX_CVtuZ9i.csv and test_Y3wMUE5_7gLdaTN.csv)

In [1]:
import pandas as pd
import numpy as np

In [2]:
train=pd.read_csv('train_4.csv')

In [3]:
test=pd.read_csv('test_4.csv')

In [4]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
train['Dependents'].replace('3+',3, inplace=True)


### 2.Types of data columns

In [6]:
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### DATA CLEANING AND PREPROCESSING

### 3.Find missing values

In [7]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

### 4.Impute missing values with mean (numerical variables)

In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
x=train[['LoanAmount','Loan_Amount_Term','Credit_History']]


In [11]:
imputer.fit(x)
X=imputer.transform(x)

### 5.Impute missing values with mode (categorical variables)

In [12]:
imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [13]:
Y=train[['Gender','Married','Self_Employed']]

In [14]:

imputer_mode.fit(Y)
y=imputer_mode.transform(Y)


### PREDICTIVE MODELLING

### 6.Remove Loan_ID variable - Irrelevant

In [15]:
train.drop('Loan_ID',inplace=True,axis=1)
train


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### 7.Create target variable

In [16]:
Features=train.drop('LoanAmount',axis=1)

Features

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,360.0,1.0,Rural,Y
610,Male,Yes,3,Graduate,No,4106,0.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,360.0,1.0,Urban,Y


In [17]:
Target=train['LoanAmount']
Target


0        NaN
1      128.0
2       66.0
3      120.0
4      141.0
       ...  
609     71.0
610     40.0
611    253.0
612    187.0
613    133.0
Name: LoanAmount, Length: 614, dtype: float64

### 8.Build dummy variables for categorical variables

In [18]:
gender= pd.get_dummies(train['Gender'])
married=pd.get_dummies(train['Married'])
education=pd.get_dummies(train['Education'])
selfemployed=pd.get_dummies(train['Self_Employed'])

In [19]:
train.drop(['Gender','Married','Education','Self_Employed'],axis=1,inplace=True)

In [20]:
train1= pd.concat([train,gender,married,education,selfemployed],axis=1)
train1['Dependents'] = train1.Dependents.replace(np.NaN, 0)


In [21]:
train1['LoanAmount']=train1.LoanAmount.replace(np.NaN,146)

In [22]:
train1['Loan_Amount_Term']=train1.Loan_Amount_Term.replace(np.NaN,train1.Loan_Amount_Term.mean())

In [23]:
train1['Credit_History']=train1.Credit_History.replace(np.NaN,1)

In [24]:
train1.isnull().sum()

Dependents           0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
Female               0
Male                 0
No                   0
Yes                  0
Graduate             0
Not Graduate         0
No                   0
Yes                  0
dtype: int64

### 9.Split train data for cross validation

In [25]:
y

array([['Male', 'No', 'No'],
       ['Male', 'Yes', 'No'],
       ['Male', 'Yes', 'Yes'],
       ...,
       ['Male', 'Yes', 'No'],
       ['Male', 'Yes', 'No'],
       ['Female', 'No', 'Yes']], dtype=object)

### (a)LOGISTIC REGRESSION ALGORITHM

### 10.Fit model

In [26]:
X=train1.drop(['CoapplicantIncome','Property_Area','Loan_Status'],axis=1)
Y=train1['Loan_Status']
Y

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object

In [27]:
X

Unnamed: 0,Dependents,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Female,Male,No,Yes,Graduate,Not Graduate,No.1,Yes.1
0,0,5849,146.0,360.0,1.0,0,1,1,0,1,0,1,0
1,1,4583,128.0,360.0,1.0,0,1,0,1,1,0,1,0
2,0,3000,66.0,360.0,1.0,0,1,0,1,1,0,0,1
3,0,2583,120.0,360.0,1.0,0,1,0,1,0,1,1,0
4,0,6000,141.0,360.0,1.0,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,2900,71.0,360.0,1.0,1,0,1,0,1,0,1,0
610,3,4106,40.0,180.0,1.0,0,1,0,1,1,0,1,0
611,1,8072,253.0,360.0,1.0,0,1,0,1,1,0,1,0
612,2,7583,187.0,360.0,1.0,0,1,0,1,1,0,1,0


In [28]:
from sklearn.model_selection import train_test_split 
X_train, X_test, Y_train, Y_test=train_test_split(X,Y, test_size=0.30, random_state=42)
X_train.isnull().sum()

Dependents          0
ApplicantIncome     0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Female              0
Male                0
No                  0
Yes                 0
Graduate            0
Not Graduate        0
No                  0
Yes                 0
dtype: int64

In [29]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### 11.Predict values for cv data

In [30]:
predictions = logmodel.predict(X_test)

### 12.Print classification report

In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(Y_test,predictions))

              precision    recall  f1-score   support

           N       0.93      0.42      0.57        65
           Y       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0.84      0.70      0.71       185
weighted avg       0.82      0.78      0.76       185



### 13.Evaluate accuracy of model

In [33]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc_auc_score(Y_test,predictions)

ValueError: could not convert string to float: 'Y'

In [None]:
y_test=pd.get_dummies(Y_test)

In [None]:
prediction=pd.get_dummies(predictions)

In [None]:
roc_auc_score(y_test,prediction)

### (b)DECISION TREE ALGORITHM

### 14.Fit model

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X_train,Y_train)

### 15.Predict values for cv data

In [None]:
Y_pred = dtc.predict(X_test)

### 16.Evaluate accuracy of model

In [None]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

### (c)SUPPORT VECTOR MACHINE (SVM) ALGORITHM

### 17.Fit model

In [None]:
from sklearn import svm
SVM = svm.SVC(kernel='linear')
SVM.fit(X_train, Y_train)

### 18.Predict values for cv data

In [None]:
y_preds = SVM.predict(X_test)

### 19.Evaluate accuracy of model

In [None]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_preds))

### (d)NAIVE BAYES ALGORITHM

### 20.Fit model

In [None]:
from sklearn.naive_bayes import GaussianNB
NBmodel = GaussianNB()
NBmodel.fit(X_train, Y_train)

### 21.Predict values for cv data

In [None]:
predicted = NBmodel.predict(X_test)

### 22.Evaluate accuracy of model

In [None]:
from sklearn.metrics import accuracy_score
print('Accuracy Score for Naive Bayes:',accuracy_score(Y_test,predicted) )