In [1]:
import warnings
import numpy as np
import pandas as pd
from utility import Univariate
from sklearn.utils import shuffle  
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
warnings.filterwarnings("ignore")

In [3]:
dataset = pd.read_csv("Loan_default.csv")

In [4]:
dataset

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56.0,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69.0,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46.0,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32.0,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60.0,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,8C6S86ESGC,19.0,37979,210682,541,109,4,14.11,12,0.85,Bachelor's,Full-time,Married,No,No,Other,No,0
255343,98R4KDHNND,32.0,51953,189899,511,14,2,11.55,24,0.21,High School,Part-time,Divorced,No,No,Home,No,1
255344,XQK1UUUNGP,56.0,84820,208294,597,70,3,5.29,60,0.50,High School,Self-employed,Married,Yes,Yes,Auto,Yes,0
255345,JAO28CPL4H,42.0,85109,60575,809,40,1,20.90,48,0.44,High School,Part-time,Single,Yes,Yes,Other,No,0


In [5]:
dataset.drop(["LoanID","MonthsEmployed","NumCreditLines","DTIRatio","HasMortgage","HasDependents","HasCoSigner"],axis = 1,inplace = True)

In [6]:
dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
0,56.0,85994,50587,520,15.23,36,Bachelor's,Full-time,Divorced,Other,0
1,69.0,50432,124440,458,4.81,60,Master's,Full-time,Married,Other,0
2,46.0,84208,129188,451,21.17,24,Master's,Unemployed,Divorced,Auto,1
3,32.0,31713,44799,743,7.07,24,High School,Full-time,Married,Business,0
4,60.0,20437,9139,633,6.51,48,Bachelor's,Unemployed,Divorced,Auto,0
...,...,...,...,...,...,...,...,...,...,...,...
255342,19.0,37979,210682,541,14.11,12,Bachelor's,Full-time,Married,Other,0
255343,32.0,51953,189899,511,11.55,24,High School,Part-time,Divorced,Home,1
255344,56.0,84820,208294,597,5.29,60,High School,Self-employed,Married,Auto,0
255345,42.0,85109,60575,809,20.90,48,High School,Part-time,Single,Other,0


In [7]:
dataset.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Default
count,255344.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0
mean,43.498473,82499.304597,127578.865512,574.264346,13.492773,36.025894,0.116128
std,14.990245,38963.013729,70840.706142,158.903867,6.636443,16.96933,0.320379
min,18.0,15000.0,5000.0,300.0,2.0,12.0,0.0
25%,31.0,48825.5,66156.0,437.0,7.77,24.0,0.0
50%,43.0,82466.0,127556.0,574.0,13.46,36.0,0.0
75%,56.0,116219.0,188985.0,712.0,19.25,48.0,0.0
max,69.0,149999.0,249999.0,849.0,25.0,60.0,1.0


In [8]:
dataset["Default"].value_counts()

Default
0    225694
1     29653
Name: count, dtype: int64

In [9]:
yes_data = dataset[dataset["Default"] == 1]
no_data  = dataset[dataset["Default"] == 0]

In [10]:
min_size = min(len(yes_data), len(no_data))
min_size

29653

In [11]:
min_yes_data = yes_data.sample(min_size,random_state = 42)
min_no_data  = no_data.sample(min_size,random_state = 42)

In [12]:
balanced_data = pd.concat([min_yes_data,min_no_data])

In [13]:
balanced_data=  shuffle(balanced_data, random_state=42)

In [14]:
final_yes = balanced_data[balanced_data['Default'] == 1].sample(n=1000, random_state=42)
final_no = balanced_data[balanced_data['Default'] == 0].sample(n=1000, random_state=42)

In [15]:
new_reduced_data = pd.concat([final_yes, final_no])

In [16]:
new_reduced_data = shuffle(new_reduced_data, random_state = 42)

In [17]:
new_reduced_data["Default"].value_counts()

Default
0    1000
1    1000
Name: count, dtype: int64

In [18]:
balanced_dataset = new_reduced_data

# Preprocessing

In [19]:
balanced_dataset.isnull().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
InterestRate      0
LoanTerm          0
Education         0
EmploymentType    0
MaritalStatus     0
LoanPurpose       0
Default           0
dtype: int64

In [20]:
balanced_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 53411 to 28042
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             2000 non-null   float64
 1   Income          2000 non-null   int64  
 2   LoanAmount      2000 non-null   int64  
 3   CreditScore     2000 non-null   int64  
 4   InterestRate    2000 non-null   float64
 5   LoanTerm        2000 non-null   int64  
 6   Education       2000 non-null   object 
 7   EmploymentType  2000 non-null   object 
 8   MaritalStatus   2000 non-null   object 
 9   LoanPurpose     2000 non-null   object 
 10  Default         2000 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 187.5+ KB


In [21]:
independent = balanced_dataset.drop("Default", axis = 1)
dependent = balanced_dataset["Default"]

In [22]:
independent

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose
53411,35.0,131051,31950,714,10.74,60,High School,Unemployed,Married,Business
706,23.0,19611,213162,809,19.94,12,High School,Self-employed,Single,Auto
23982,68.0,80224,100082,383,19.65,36,Bachelor's,Unemployed,Divorced,Auto
232117,34.0,85672,245258,821,22.20,24,Bachelor's,Part-time,Divorced,Home
179556,18.0,22901,124893,454,5.03,48,Master's,Full-time,Single,Business
...,...,...,...,...,...,...,...,...,...,...
184600,20.0,82103,174239,734,12.63,48,Bachelor's,Self-employed,Single,Education
191462,54.0,107380,22386,362,19.06,48,High School,Full-time,Single,Auto
52610,29.0,94936,45806,313,24.81,48,PhD,Full-time,Single,Home
59994,41.0,48829,75842,463,16.32,48,PhD,Part-time,Divorced,Other


In [23]:
#Splitting into Numerical and Categorical Data
numerical_data = independent.select_dtypes(include = [np.number])
categorical_data = independent.select_dtypes(exclude = [np.number])

In [24]:
categorical_data

Unnamed: 0,Education,EmploymentType,MaritalStatus,LoanPurpose
53411,High School,Unemployed,Married,Business
706,High School,Self-employed,Single,Auto
23982,Bachelor's,Unemployed,Divorced,Auto
232117,Bachelor's,Part-time,Divorced,Home
179556,Master's,Full-time,Single,Business
...,...,...,...,...
184600,Bachelor's,Self-employed,Single,Education
191462,High School,Full-time,Single,Auto
52610,PhD,Full-time,Single,Home
59994,PhD,Part-time,Divorced,Other


In [25]:
numerical_data

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm
53411,35.0,131051,31950,714,10.74,60
706,23.0,19611,213162,809,19.94,12
23982,68.0,80224,100082,383,19.65,36
232117,34.0,85672,245258,821,22.20,24
179556,18.0,22901,124893,454,5.03,48
...,...,...,...,...,...,...
184600,20.0,82103,174239,734,12.63,48
191462,54.0,107380,22386,362,19.06,48
52610,29.0,94936,45806,313,24.81,48
59994,41.0,48829,75842,463,16.32,48


In [26]:
imputer = SimpleImputer(strategy = "mean")

In [27]:
imputer.fit(numerical_data[::])

In [28]:
numerical_filled_data = pd.DataFrame(imputer.fit_transform(numerical_data), columns = numerical_data.columns)

In [29]:
numerical_filled_data.isnull().sum()

Age             0
Income          0
LoanAmount      0
CreditScore     0
InterestRate    0
LoanTerm        0
dtype: int64

In [30]:
categorical_imputer = SimpleImputer(strategy = "most_frequent")

In [31]:
categorical_filled_data = pd.DataFrame(categorical_imputer.fit_transform(categorical_data), columns = categorical_data.columns)

In [32]:
categorical_filled_data.isnull().sum()

Education         0
EmploymentType    0
MaritalStatus     0
LoanPurpose       0
dtype: int64

In [33]:
independent = pd.concat([numerical_filled_data,categorical_filled_data], axis = 1)

In [34]:
dependent


53411     0
706       1
23982     0
232117    1
179556    0
         ..
184600    0
191462    0
52610     1
59994     0
28042     0
Name: Default, Length: 2000, dtype: int64

In [35]:
independent = independent.reset_index(drop=True)
dependent = dependent.reset_index(drop=True)

In [36]:
preprocessed_dataset = pd.concat([independent, dependent],axis = 1)
preprocessed_dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
0,35.0,131051.0,31950.0,714.0,10.74,60.0,High School,Unemployed,Married,Business,0
1,23.0,19611.0,213162.0,809.0,19.94,12.0,High School,Self-employed,Single,Auto,1
2,68.0,80224.0,100082.0,383.0,19.65,36.0,Bachelor's,Unemployed,Divorced,Auto,0
3,34.0,85672.0,245258.0,821.0,22.20,24.0,Bachelor's,Part-time,Divorced,Home,1
4,18.0,22901.0,124893.0,454.0,5.03,48.0,Master's,Full-time,Single,Business,0
...,...,...,...,...,...,...,...,...,...,...,...
1995,20.0,82103.0,174239.0,734.0,12.63,48.0,Bachelor's,Self-employed,Single,Education,0
1996,54.0,107380.0,22386.0,362.0,19.06,48.0,High School,Full-time,Single,Auto,0
1997,29.0,94936.0,45806.0,313.0,24.81,48.0,PhD,Full-time,Single,Home,1
1998,41.0,48829.0,75842.0,463.0,16.32,48.0,PhD,Part-time,Divorced,Other,0


In [37]:
# Converting Categorical to Numerical Dataset
independent = pd.get_dummies(independent,dtype= int,drop_first = True)
independent

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education_High School,Education_Master's,Education_PhD,EmploymentType_Part-time,EmploymentType_Self-employed,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,35.0,131051.0,31950.0,714.0,10.74,60.0,1,0,0,0,0,1,1,0,1,0,0,0
1,23.0,19611.0,213162.0,809.0,19.94,12.0,1,0,0,0,1,0,0,1,0,0,0,0
2,68.0,80224.0,100082.0,383.0,19.65,36.0,0,0,0,0,0,1,0,0,0,0,0,0
3,34.0,85672.0,245258.0,821.0,22.20,24.0,0,0,0,1,0,0,0,0,0,0,1,0
4,18.0,22901.0,124893.0,454.0,5.03,48.0,0,1,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,20.0,82103.0,174239.0,734.0,12.63,48.0,0,0,0,0,1,0,0,1,0,1,0,0
1996,54.0,107380.0,22386.0,362.0,19.06,48.0,1,0,0,0,0,0,0,1,0,0,0,0
1997,29.0,94936.0,45806.0,313.0,24.81,48.0,0,0,1,0,0,0,0,1,0,0,1,0
1998,41.0,48829.0,75842.0,463.0,16.32,48.0,0,0,1,1,0,0,0,0,0,0,0,1


In [38]:
preprocessed_dataset_numerical = pd.concat([independent, dependent],axis = 1)
preprocessed_dataset_numerical

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education_High School,Education_Master's,Education_PhD,EmploymentType_Part-time,EmploymentType_Self-employed,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,Default
0,35.0,131051.0,31950.0,714.0,10.74,60.0,1,0,0,0,0,1,1,0,1,0,0,0,0
1,23.0,19611.0,213162.0,809.0,19.94,12.0,1,0,0,0,1,0,0,1,0,0,0,0,1
2,68.0,80224.0,100082.0,383.0,19.65,36.0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,34.0,85672.0,245258.0,821.0,22.20,24.0,0,0,0,1,0,0,0,0,0,0,1,0,1
4,18.0,22901.0,124893.0,454.0,5.03,48.0,0,1,0,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,20.0,82103.0,174239.0,734.0,12.63,48.0,0,0,0,0,1,0,0,1,0,1,0,0,0
1996,54.0,107380.0,22386.0,362.0,19.06,48.0,1,0,0,0,0,0,0,1,0,0,0,0,0
1997,29.0,94936.0,45806.0,313.0,24.81,48.0,0,0,1,0,0,0,0,1,0,0,1,0,1
1998,41.0,48829.0,75842.0,463.0,16.32,48.0,0,0,1,1,0,0,0,0,0,0,0,1,0


# Univariate Analysis

In [39]:
preprocessed_dataset_numerical.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education_High School,Education_Master's,Education_PhD,EmploymentType_Part-time,EmploymentType_Self-employed,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,Default
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,40.327,80912.5205,135065.545,564.9695,14.325215,36.048,0.2645,0.2305,0.249,0.2535,0.2505,0.263,0.3265,0.324,0.2025,0.206,0.176,0.197,0.5
std,15.23457,39531.073778,70734.401788,161.997192,6.616853,16.893916,0.441177,0.421258,0.432542,0.435123,0.433409,0.440373,0.46905,0.468117,0.401963,0.404532,0.380915,0.397832,0.500125
min,18.0,15034.0,5176.0,300.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,45607.5,75168.5,420.75,8.85,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,81405.5,139448.5,560.0,14.76,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,53.0,114765.75,198259.5,705.0,19.9325,48.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
max,69.0,149986.0,249912.0,849.0,24.99,60.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
obj = Univariate()
tble = obj.getUnivarateTbl(preprocessed_dataset_numerical, preprocessed_dataset_numerical.columns)
tble

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education_High School,Education_Master's,Education_PhD,EmploymentType_Part-time,EmploymentType_Self-employed,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,Default
Mean,40.327,80912.5205,135065.545,564.9695,14.325215,36.048,0.2645,0.2305,0.249,0.2535,0.2505,0.263,0.3265,0.324,0.2025,0.206,0.176,0.197,0.5
Median,38.0,81405.5,139448.5,560.0,14.76,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
Mode,27.0,15098.0,22099.0,309.0,15.87,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q1:25%,27.0,45607.5,75168.5,420.75,8.85,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q2:50%,38.0,81405.5,139448.5,560.0,14.76,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
Q3:75%,53.0,114765.75,198259.5,705.0,19.9325,48.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
Q4:100%,69.0,149986.0,249912.0,849.0,24.99,60.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
IQR,26.0,69158.25,123091.0,284.25,11.0825,24.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1.5Rule,39.0,103737.375,184636.5,426.375,16.62375,36.0,1.5,0.0,0.0,1.5,1.5,1.5,1.5,1.5,0.0,0.0,0.0,0.0,1.5
Lesser,-12.0,-58129.875,-109468.0,-5.625,-7.77375,-12.0,-1.5,0.0,0.0,-1.5,-1.5,-1.5,-1.5,-1.5,0.0,0.0,0.0,0.0,-1.5


In [41]:
#Finding Outliers
lesser_outlier = []
greater_outlier = []
for column in preprocessed_dataset_numerical.columns:
  
    if tble[column]["Minimum"] < tble[column]["Lesser"] :
        lesser_outlier.append(column)
    
    if tble[column]["Maximum"] > tble[column]["Greater"]:
        greater_outlier.append(column)     

print(lesser_outlier)
print(greater_outlier)

[]
["Education_Master's", 'Education_PhD', 'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Home', 'LoanPurpose_Other']


In [42]:
#Removing the Outliers by replacing the lesser and greater values
for column in lesser_outlier:
    preprocessed_dataset_numerical[column][preprocessed_dataset_numerical[column] < tble[column]["Lesser"]] = tble[column]["Lesser"]

for column in greater_outlier: 
    preprocessed_dataset_numerical[column][preprocessed_dataset_numerical[column] > tble[column]["Greater"]] = tble[column]["Greater"]

In [43]:
tble = obj.getUnivarateTbl(preprocessed_dataset_numerical, preprocessed_dataset_numerical.columns)
tble

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education_High School,Education_Master's,Education_PhD,EmploymentType_Part-time,EmploymentType_Self-employed,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,Default
Mean,40.327,80912.5205,135065.545,564.9695,14.325215,36.048,0.2645,0.0,0.0,0.2535,0.2505,0.263,0.3265,0.324,0.0,0.0,0.0,0.0,0.5
Median,38.0,81405.5,139448.5,560.0,14.76,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
Mode,27.0,15098.0,22099.0,309.0,15.87,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q1:25%,27.0,45607.5,75168.5,420.75,8.85,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q2:50%,38.0,81405.5,139448.5,560.0,14.76,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
Q3:75%,53.0,114765.75,198259.5,705.0,19.9325,48.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
Q4:100%,69.0,149986.0,249912.0,849.0,24.99,60.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
IQR,26.0,69158.25,123091.0,284.25,11.0825,24.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1.5Rule,39.0,103737.375,184636.5,426.375,16.62375,36.0,1.5,0.0,0.0,1.5,1.5,1.5,1.5,1.5,0.0,0.0,0.0,0.0,1.5
Lesser,-12.0,-58129.875,-109468.0,-5.625,-7.77375,-12.0,-1.5,0.0,0.0,-1.5,-1.5,-1.5,-1.5,-1.5,0.0,0.0,0.0,0.0,-1.5


In [44]:
#Cross checking Outliers
lesser_outlier = []
greater_outlier = []
for column in preprocessed_dataset_numerical.columns:
  
    if tble[column]["Minimum"] < tble[column]["Lesser"] :
        lesser_outlier.append(column)
    
    if tble[column]["Maximum"] > tble[column]["Greater"]:
        greater_outlier.append(column)     

print(lesser_outlier)
print(greater_outlier)

[]
[]


In [45]:
preprocessed_dataset_numerical.isnull().sum()

Age                             0
Income                          0
LoanAmount                      0
CreditScore                     0
InterestRate                    0
LoanTerm                        0
Education_High School           0
Education_Master's              0
Education_PhD                   0
EmploymentType_Part-time        0
EmploymentType_Self-employed    0
EmploymentType_Unemployed       0
MaritalStatus_Married           0
MaritalStatus_Single            0
LoanPurpose_Business            0
LoanPurpose_Education           0
LoanPurpose_Home                0
LoanPurpose_Other               0
Default                         0
dtype: int64

# Feature Selection

In [46]:
lg =[]
svm =[]
svm_nl =[]
knn =[]
dt =[]
nb =[]
rf =[]

#Splitting into training and testing datasets
def Split_To_Training_Testing(features):
    x_train, x_test, y_train, y_test = train_test_split(features,dependent,test_size = 0.20,random_state = 0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)     
    return x_train, x_test, y_train, y_test

#Select K Best Algorithm
def selectKBest(n):    
    kbest = SelectKBest(score_func  = chi2, k = n)
    kbestModel = kbest.fit(independent,dependent)
    features = kbest.transform(independent)    
    return build_model(features)    

#Classificaiton Models
def build_model(features):   
    x_train, x_test, y_train, y_test =  Split_To_Training_Testing(features)
    logistic_regression(x_train, x_test, y_train, y_test)    
    svm_linear(x_train, x_test, y_train, y_test)
    svm_non_linear(x_train, x_test, y_train, y_test)
    knn_regresssion(x_train, x_test, y_train, y_test)
    decisionTree(x_train, x_test, y_train, y_test)
    naive_baye(x_train, x_test, y_train, y_test)
    random_forest(x_train, x_test, y_train, y_test)  
    dataframe1 = generateTbl()
    return dataframe1 
    
def logistic_regression(x_train, x_test, y_train, y_test):     
    global lg
    lg.clear()
    lg_regression = LogisticRegression(random_state = 42)
    lg_regression.fit(x_train,y_train)
    y_pred = lg_regression.predict(x_test)
    lg_cm = confusion_matrix(y_test, y_pred)
    lg_accuracy = accuracy_score(y_test, y_pred )
    lg_classification = classification_report(y_test, y_pred)    
    lg.append(lg_accuracy)   

def svm_linear(x_train, x_test, y_train, y_test):
    global lg
    svm.clear()
    svm_regression = SVC(kernel = 'linear', random_state = 0)
    svm_regression.fit(x_train,y_train)
    y_pred = svm_regression.predict(x_test)
    svm_cm = confusion_matrix(y_test, y_pred)
    svm_accuracy = accuracy_score(y_test, y_pred )
    svm_classification = classification_report(y_test, y_pred)
    svm.append(svm_accuracy)
    
def svm_non_linear(x_train, x_test, y_train, y_test):
    global svm_nl
    svm_nl.clear()
    svmnl_regression = SVC(kernel = 'rbf', random_state = 0)
    svmnl_regression.fit(x_train,y_train)
    y_pred = svmnl_regression.predict(x_test)
    svmnl_cm = confusion_matrix(y_test, y_pred)
    svmnl_accuracy = accuracy_score(y_test, y_pred )
    svmnl_classification = classification_report(y_test, y_pred)
    svm_nl.append(svmnl_accuracy)
     
    
def knn_regresssion(x_train, x_test, y_train, y_test):
    global knn
    knn.clear()
    knn_regression =  KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    knn_regression.fit(x_train,y_train)
    y_pred = knn_regression.predict(x_test)
    knn_cm = confusion_matrix(y_test, y_pred)
    knn_accuracy = accuracy_score(y_test, y_pred )
    knn_classification = classification_report(y_test, y_pred)  
    knn.append(knn_accuracy)

    
def decisionTree(x_train, x_test, y_train, y_test):  
    global dt
    dt.clear()
    des_regression =  DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    des_regression.fit(x_train,y_train)
    y_pred = des_regression.predict(x_test)
    des_cm = confusion_matrix(y_test, y_pred)
    des_accuracy = accuracy_score(y_test, y_pred )
    des_classification = classification_report(y_test, y_pred)
    dt.append(des_accuracy)    
    
def naive_baye(x_train, x_test, y_train, y_test):   
    global nb
    nb.clear()
    nav_regression = GaussianNB()
    nav_regression.fit(x_train,y_train)
    y_pred = nav_regression.predict(x_test)
    nav_cm = confusion_matrix(y_test, y_pred)
    nav_accuracy = accuracy_score(y_test, y_pred )
    nav_classification = classification_report(y_test, y_pred)
    nb.append(nav_accuracy)        

    
def random_forest(x_train, x_test, y_train, y_test):
    global rf
    rf.clear()
    rf_regression = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    rf_regression.fit(x_train,y_train)
    y_pred = rf_regression.predict(x_test)
    rf_cm = confusion_matrix(y_test, y_pred)
    rf_accuracy = accuracy_score(y_test, y_pred )
    rf_classification = classification_report(y_test, y_pred)
    rf.append(rf_accuracy)     

def generateTbl():
    result=dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Logistic','SVMl','SVMnl','KNN','Navie','Decision','Random'])
    for number,index in enumerate(dataframe.index):   
        dataframe['Logistic'][index]=lg[number]       
        dataframe['SVMl'][index]=svm[number]        
        dataframe['SVMnl'][index]=svm_nl[number]
        dataframe['KNN'][index]=knn[number]
        dataframe['Navie'][index]=nb[number]
        dataframe['Decision'][index]=dt[number]
        dataframe['Random'][index]=rf[number]
    return dataframe     
        

In [47]:
# K = 7
dataframe = selectKBest(7)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.6375,0.6375,0.6525,0.6175,0.64,0.555,0.6325


In [48]:
# K = 15
dataframe = selectKBest(15)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.625,0.62,0.615,0.5775,0.61,0.56,0.5725


# Using Grid Search CV

In [49]:
lg =[]
svm =[]
knn =[]
dt =[]
nb =[]
rf =[]

#Select K Best Algorithm
def selectKBestGrid(n):    
    kbest = SelectKBest(score_func  = chi2, k = n)
    kbestModel = kbest.fit(independent,dependent)
    features = kbest.transform(independent)    
    return build_modelGrid(features)    
 
#Classificaiton Models

def build_modelGrid(features):   
    x_train, x_test, y_train, y_test =  Split_To_Training_Testing(features)
    logistic_regression(x_train, x_test, y_train, y_test)    
    svm_linear(x_train, x_test, y_train, y_test)
    svm_non_linear(x_train, x_test, y_train, y_test)
    knn_regresssion(x_train, x_test, y_train, y_test)
    decisionTree(x_train, x_test, y_train, y_test)
    naive_baye(x_train, x_test, y_train, y_test)
    random_forest(x_train, x_test, y_train, y_test)  
    dataframe1 = generateTbl()
    return dataframe1 
    
def logistic_regression(x_train, x_test, y_train, y_test):     
    global lg
    lg.clear()
    lg_params = {       
        "C"   : [1.0,5.0,10.0],   
    }
    
    lg_classifier = GridSearchCV(LogisticRegression(),lg_params, refit = True)  
    lg_classifier.fit(x_train,y_train)     
    print(f"Logistic Regression : {lg_classifier.best_params_}")
    y_pred = lg_classifier.predict(x_test)
    lg_cm = confusion_matrix(y_test, y_pred)
    lg_accuracy = accuracy_score(y_test, y_pred )
    lg_classification = classification_report(y_test, y_pred)  
    lg.append(lg_accuracy)   

def svm_linear(x_train, x_test, y_train, y_test):
    global lg
    svm.clear()
    svc_params = {   
       "kernel" : ["rbf","sigmoid","linear"],   
       "C"      : [100,150],     
       "gamma"   : ["scale", "auto"]
    }
    svc_classifier = GridSearchCV(SVC(),svc_params, refit = True)  
    svc_classifier.fit(x_train,y_train)     
    print(f"SVM : {svc_classifier.best_params_}")
    y_pred = svc_classifier.predict(x_test)
    svm_cm = confusion_matrix(y_test, y_pred)
    svm_accuracy = accuracy_score(y_test, y_pred )
    svm_classification = classification_report(y_test, y_pred)
    svm.append(svm_accuracy) 
    
def knn_regresssion(x_train, x_test, y_train, y_test):
    global knn
    knn.clear()
    knn_params = {    
        "n_neighbors" : [5,10,100,150,200],
        "algorithm"   : ["auto", "ball_tree", "kd_tree", "brute"],
        "metric" : ["minkowski"]    
    }
    
    knn_classifier = GridSearchCV(KNeighborsClassifier(),knn_params, refit = True)  
    knn_classifier.fit(x_train,y_train) 
    print(f"KNN : {knn_classifier.best_params_}")    
    y_pred = knn_classifier.predict(x_test)
    knn_cm = confusion_matrix(y_test, y_pred)
    knn_accuracy = accuracy_score(y_test, y_pred )
    knn_classification = classification_report(y_test, y_pred)        
    knn.append(knn_accuracy)

    
def decisionTree(x_train, x_test, y_train, y_test):  
    global dt
    dt.clear()
    dt_params = {
        "criterion"    : ["gini", "entropy", "log_loss"],
        "splitter"     : ["best", "random"],
        "max_features" : ["sqrt", "log2"]
    }    
    dt_classifier = GridSearchCV(DecisionTreeClassifier(),dt_params, refit = True)  
    dt_classifier.fit(x_train,y_train) 
    print(f"Decision Tree : {dt_classifier.best_params_}")        
    y_pred = dt_classifier.predict(x_test)
    des_cm = confusion_matrix(y_test, y_pred)
    des_accuracy = accuracy_score(y_test, y_pred )
    des_classification = classification_report(y_test, y_pred)   
    dt.append(des_accuracy)    
    
def naive_baye(x_train, x_test, y_train, y_test):   
    global nb
    nb.clear()
    nav_regression = GaussianNB()
    nav_regression.fit(x_train,y_train)
    y_pred = nav_regression.predict(x_test)
    nav_cm = confusion_matrix(y_test, y_pred)
    nav_accuracy = accuracy_score(y_test, y_pred )
    nav_classification = classification_report(y_test, y_pred)    
    nb.append(nav_accuracy)        

    
def random_forest(x_train, x_test, y_train, y_test):
    global rf
    rf.clear()
    rf_params = {
        "criterion"    : ["gini", "entropy", "log_loss"],
        "n_estimators" : [50,100,250,500,1000],
        "max_features" : ["sqrt", "log2",None]
    }    
    rf_classifier = GridSearchCV(RandomForestClassifier(),rf_params, refit = True)  
    rf_classifier.fit(x_train,y_train)
    print(f"Random Forest : {rf_classifier.best_params_}")    
    print(rf_classifier.best_params_) 
    y_pred = rf_classifier.predict(x_test)
    rf_cm = confusion_matrix(y_test, y_pred)
    rf_accuracy = accuracy_score(y_test, y_pred )
    rf_classification = classification_report(y_test, y_pred)
    rf.append(rf_accuracy)     

def generateTbl():
    result=dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Logistic','SVM','KNN','Navie','Decision','Random'])
    for number,index in enumerate(dataframe.index):   
        dataframe['Logistic'][index]=lg[number]       
        dataframe['SVM'][index]=svm[number]     
        dataframe['KNN'][index]=knn[number]
        dataframe['Navie'][index]=nb[number]
        dataframe['Decision'][index]=dt[number]
        dataframe['Random'][index]=rf[number]
    return dataframe     
        

In [50]:
#k = 7
dataframe = selectKBestGrid(7)
dataframe

Logistic Regression : {'C': 1.0}
SVM : {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
KNN : {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 150}
Decision Tree : {'criterion': 'log_loss', 'max_features': 'sqrt', 'splitter': 'best'}
Random Forest : {'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 1000}
{'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 1000}


Unnamed: 0,Logistic,SVM,KNN,Navie,Decision,Random
ChiSquare,0.6375,0.6375,0.635,0.64,0.5725,0.6425


In [51]:
#k = 10
dataframe = selectKBestGrid(10)
dataframe

Logistic Regression : {'C': 1.0}
SVM : {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
KNN : {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 100}
Decision Tree : {'criterion': 'log_loss', 'max_features': 'log2', 'splitter': 'best'}
Random Forest : {'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}
{'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}


Unnamed: 0,Logistic,SVM,KNN,Navie,Decision,Random
ChiSquare,0.625,0.62,0.63,0.6,0.54,0.615


In [52]:
#k = 15
dataframe = selectKBestGrid(15)
dataframe

Logistic Regression : {'C': 1.0}
SVM : {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
KNN : {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 100}
Decision Tree : {'criterion': 'log_loss', 'max_features': 'log2', 'splitter': 'best'}
Random Forest : {'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 100}


Unnamed: 0,Logistic,SVM,KNN,Navie,Decision,Random
ChiSquare,0.625,0.62,0.61,0.61,0.5625,0.6475
