In [1]:
import numpy as np
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Silence every warning for the remainder of the session.
# NOTE(review): this is a blanket, process-wide filter, so warnings raised by
# pandas or scikit-learn in later cells are hidden as well; consider narrowing
# it to specific categories once the noisy sources are known.
warnings.filterwarnings("ignore")

In [3]:
# Load the modelling dataset from a CSV file; the path is relative, so it is
# resolved against the notebook's current working directory.
# NOTE(review): the file name suggests the data is already preprocessed and
# numerically encoded — confirm against the upstream preparation step.
dataset = pd.read_csv("Preprocessed_Dataset_Numerical.csv")

In [4]:
dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education_High School,...,MaritalStatus_Married,MaritalStatus_Single,HasMortgage_Yes,HasDependents_Yes,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_Yes,Default
0,48.0,90754.0,5902.0,462.0,91.0,2.0,10.46,60.0,0.80,0,...,1,0,0,0,0,0,0,0,1,0
1,24.0,59193.0,90049.0,309.0,73.0,1.0,20.85,36.0,0.88,1,...,0,1,1,0,0,0,0,0,1,0
2,55.0,77343.0,204141.0,423.0,19.0,2.0,17.66,48.0,0.49,0,...,0,0,1,0,0,0,0,0,1,0
3,18.0,25087.0,15645.0,474.0,23.0,4.0,20.95,48.0,0.58,0,...,0,0,0,0,0,0,0,0,1,0
4,31.0,33215.0,235022.0,771.0,67.0,3.0,10.53,36.0,0.90,0,...,0,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,59.0,127901.0,46482.0,424.0,44.0,4.0,15.27,60.0,0.58,0,...,0,0,0,1,0,0,0,0,0,1
996,54.0,139677.0,138362.0,780.0,27.0,3.0,11.82,24.0,0.49,0,...,0,1,1,0,0,0,0,0,0,1
997,42.0,53038.0,62256.0,763.0,116.0,3.0,2.55,36.0,0.54,0,...,0,0,1,0,0,0,0,0,1,0
998,43.0,137420.0,116698.0,424.0,21.0,4.0,9.43,36.0,0.85,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Feature matrix: every column except the "Default" target.
independent = dataset.drop(columns="Default")
# Target vector: the "Default" column on its own.
dependent = dataset["Default"]

In [6]:
independent

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education_High School,...,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,HasMortgage_Yes,HasDependents_Yes,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_Yes
0,48.0,90754.0,5902.0,462.0,91.0,2.0,10.46,60.0,0.80,0,...,0,1,0,0,0,0,0,0,0,1
1,24.0,59193.0,90049.0,309.0,73.0,1.0,20.85,36.0,0.88,1,...,0,0,1,1,0,0,0,0,0,1
2,55.0,77343.0,204141.0,423.0,19.0,2.0,17.66,48.0,0.49,0,...,0,0,0,1,0,0,0,0,0,1
3,18.0,25087.0,15645.0,474.0,23.0,4.0,20.95,48.0,0.58,0,...,0,0,0,0,0,0,0,0,0,1
4,31.0,33215.0,235022.0,771.0,67.0,3.0,10.53,36.0,0.90,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,59.0,127901.0,46482.0,424.0,44.0,4.0,15.27,60.0,0.58,0,...,0,0,0,0,1,0,0,0,0,0
996,54.0,139677.0,138362.0,780.0,27.0,3.0,11.82,24.0,0.49,0,...,0,0,1,1,0,0,0,0,0,0
997,42.0,53038.0,62256.0,763.0,116.0,3.0,2.55,36.0,0.54,0,...,0,0,0,1,0,0,0,0,0,1
998,43.0,137420.0,116698.0,424.0,21.0,4.0,9.43,36.0,0.85,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
dependent

0      0
1      0
2      0
3      0
4      1
      ..
995    1
996    1
997    0
998    1
999    1
Name: Default, Length: 1000, dtype: int64

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Age                           1000 non-null   float64
 1   Income                        1000 non-null   float64
 2   LoanAmount                    1000 non-null   float64
 3   CreditScore                   1000 non-null   float64
 4   MonthsEmployed                1000 non-null   float64
 5   NumCreditLines                1000 non-null   float64
 6   InterestRate                  1000 non-null   float64
 7   LoanTerm                      1000 non-null   float64
 8   DTIRatio                      1000 non-null   float64
 9   Education_High School         1000 non-null   int64  
 10  Education_Master's            1000 non-null   int64  
 11  Education_PhD                 1000 non-null   int64  
 12  EmploymentType_Part-time      1000 non-null   int64  
 13  Empl

# Feature Selection

In [16]:
#Splitting into training and testing datasets
def Split_To_Training_Testing(features):
    """Split ``features`` and the module-level ``dependent`` target 80/20 and scale them.

    The split uses ``test_size = 0.20`` and ``random_state = 0``. A
    ``StandardScaler`` is fitted on the training portion only and then applied
    to the test portion, so the test rows do not contribute to the scaling
    statistics.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where the two feature parts are
        the scaled versions and the two target parts are returned unchanged.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features,
        dependent,
        test_size=0.20,
        random_state=0,
    )

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(features_train)
    scaled_test = scaler.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test


In [28]:
#Select K Best Algorithm
def selectKBest(n):
    """Select the ``n`` best features by chi-squared score and evaluate every model on them.

    Parameters
    ----------
    n
        Number of features to keep; passed to ``SelectKBest`` as ``k``.

    Returns
    -------
    The value returned by ``build_model`` for the reduced feature matrix.
    """
    # NOTE(review): the selector is fitted on the complete ``independent`` /
    # ``dependent`` data, i.e. before build_model() performs the train/test
    # split, so held-out rows influence which features are chosen. Confirm
    # this is intended; otherwise fit the selector on the training part only.
    selector = SelectKBest(score_func=chi2, k=n)
    features = selector.fit_transform(independent, dependent)
    return build_model(features)

In [46]:
# Classification models
# One module-level results list per classifier; each holds the most recent
# test accuracy for that model (seven distinct, initially empty lists).
lg, svm, svm_nl, knn, dt, nb, rf = [], [], [], [], [], [], []

def build_model(features):
    """Train and score every classifier on ``features`` and return the accuracy table.

    The features are split and scaled once via ``Split_To_Training_Testing``;
    the same four parts are then handed to each training function in turn.
    Each of those records its accuracy as a side effect, and ``generateTbl``
    collects the recorded values into the returned table.
    """
    split_parts = Split_To_Training_Testing(features)

    # Order is kept identical to the original sequence of calls.
    trainers = (
        logistic_regression,
        svm_linear,
        svm_non_linear,
        knn_regresssion,
        decisionTree,
        naive_baye,
        random_forest,
    )
    for train_and_score in trainers:
        train_and_score(*split_parts)

    return generateTbl()
    
def logistic_regression(x_train, x_test, y_train, y_test):
    """Fit a logistic-regression classifier and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``lg``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    lg.clear()
    model = LogisticRegression(random_state=42)
    model.fit(x_train, y_train)
    lg.append(accuracy_score(y_test, model.predict(x_test)))

def svm_linear(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVC and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``svm``.
    """
    # The list is mutated in place, so no ``global`` declaration is required
    # (the original declared ``global lg`` here, which named the wrong list).
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    svm.clear()
    model = SVC(kernel='linear', random_state=0)
    model.fit(x_train, y_train)
    svm.append(accuracy_score(y_test, model.predict(x_test)))
    
def svm_non_linear(x_train, x_test, y_train, y_test):
    """Fit an RBF-kernel SVC and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``svm_nl``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    svm_nl.clear()
    model = SVC(kernel='rbf', random_state=0)
    model.fit(x_train, y_train)
    svm_nl.append(accuracy_score(y_test, model.predict(x_test)))
     
    
def knn_regresssion(x_train, x_test, y_train, y_test):
    """Fit a 5-nearest-neighbours classifier and record its test accuracy.

    The classifier is configured with ``metric='minkowski'`` and ``p=2``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``knn``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    knn.clear()
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(x_train, y_train)
    knn.append(accuracy_score(y_test, model.predict(x_test)))

    
def decisionTree(x_train, x_test, y_train, y_test):
    """Fit an entropy-criterion decision tree and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``dt``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    dt.clear()
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    dt.append(accuracy_score(y_test, model.predict(x_test)))
    
def naive_baye(x_train, x_test, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``nb``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    nb.clear()
    model = GaussianNB()
    model.fit(x_train, y_train)
    nb.append(accuracy_score(y_test, model.predict(x_test)))

    
def random_forest(x_train, x_test, y_train, y_test):
    """Fit a 10-tree, entropy-criterion random forest and record its test accuracy.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for ``predict`` and ``accuracy_score``.

    Returns
    -------
    None
        The accuracy is stored as the only element of the module-level list ``rf``.
    """
    # Cleared up front: if fit/predict raises, the list is left empty rather
    # than still holding a score from an earlier run.
    rf.clear()
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    rf.append(accuracy_score(y_test, model.predict(x_test)))

def generateTbl():
    """Collect the recorded accuracies into a one-row summary table.

    Reads the module-level result lists ``lg``, ``svm``, ``svm_nl``, ``knn``,
    ``nb``, ``dt`` and ``rf`` and places element ``i`` of each list in row ``i``
    of the table (there is a single row, labelled ``'ChiSquare'``).

    Returns
    -------
    pandas.DataFrame
        Index ``['ChiSquare']``; columns ``Logistic``, ``SVMl``, ``SVMnl``,
        ``KNN``, ``Navie``, ``Decision``, ``Random``.

    Raises
    ------
    IndexError
        If any of the result lists is empty.
    """
    # Column label -> list holding that model's accuracies. Labels (including
    # the 'Navie' spelling) are kept exactly as before so existing consumers
    # of the table keep working.
    scores_by_column = {
        'Logistic': lg,
        'SVMl': svm,
        'SVMnl': svm_nl,
        'KNN': knn,
        'Navie': nb,
        'Decision': dt,
        'Random': rf,
    }
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_column))
    for number, index in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # Single-step .loc assignment: the previous chained form
            # ``dataframe[column][index] = ...`` may write to a temporary copy
            # and does not update the frame under pandas Copy-on-Write.
            dataframe.loc[index, column] = scores[number]
    return dataframe
        

In [47]:
# K = 2
dataframe = selectKBest(2)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.54,0.53,0.49,0.535,0.52,0.605,0.535


In [48]:
# K = 5
dataframe = selectKBest(5)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.61,0.61,0.61,0.565,0.615,0.6,0.645


In [49]:
# K = 7
dataframe = selectKBest(7)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.635,0.645,0.62,0.56,0.66,0.61,0.615


In [50]:
# K = 10
dataframe = selectKBest(10)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.625,0.64,0.62,0.59,0.64,0.645,0.62
