In [1]:
import warnings
import numpy as np
import pandas as pd
import pickle
from utility import Univariate
from sklearn.utils import shuffle  
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any pandas / scikit-learn warnings that later
# cells would otherwise surface (e.g. about chained assignment) - confirm that
# a blanket filter is really wanted.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw loan records; the CSV path is relative to the working directory.
dataset = pd.read_csv("Loan_default.csv")

In [4]:
dataset

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56.0,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69.0,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46.0,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32.0,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60.0,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,8C6S86ESGC,19.0,37979,210682,541,109,4,14.11,12,0.85,Bachelor's,Full-time,Married,No,No,Other,No,0
255343,98R4KDHNND,32.0,51953,189899,511,14,2,11.55,24,0.21,High School,Part-time,Divorced,No,No,Home,No,1
255344,XQK1UUUNGP,56.0,84820,208294,597,70,3,5.29,60,0.50,High School,Self-employed,Married,Yes,Yes,Auto,Yes,0
255345,JAO28CPL4H,42.0,85109,60575,809,40,1,20.90,48,0.44,High School,Part-time,Single,Yes,Yes,Other,No,0


In [5]:

# Drop the identifier column and the features that are not used for modelling.
# Re-assigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises KeyError.
dataset = dataset.drop(
    columns=["LoanID", "MonthsEmployed", "NumCreditLines", "DTIRatio",
             "HasMortgage", "HasDependents", "HasCoSigner"],
    errors="ignore",
)

In [None]:
# dataset.drop(["LoanID"],axis = 1,inplace = True)

In [6]:
dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
0,56.0,85994,50587,520,15.23,36,Bachelor's,Full-time,Divorced,Other,0
1,69.0,50432,124440,458,4.81,60,Master's,Full-time,Married,Other,0
2,46.0,84208,129188,451,21.17,24,Master's,Unemployed,Divorced,Auto,1
3,32.0,31713,44799,743,7.07,24,High School,Full-time,Married,Business,0
4,60.0,20437,9139,633,6.51,48,Bachelor's,Unemployed,Divorced,Auto,0
...,...,...,...,...,...,...,...,...,...,...,...
255342,19.0,37979,210682,541,14.11,12,Bachelor's,Full-time,Married,Other,0
255343,32.0,51953,189899,511,11.55,24,High School,Part-time,Divorced,Home,1
255344,56.0,84820,208294,597,5.29,60,High School,Self-employed,Married,Auto,0
255345,42.0,85109,60575,809,20.90,48,High School,Part-time,Single,Other,0


In [7]:
dataset.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Default
count,255344.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0
mean,43.498473,82499.304597,127578.865512,574.264346,13.492773,36.025894,0.116128
std,14.990245,38963.013729,70840.706142,158.903867,6.636443,16.96933,0.320379
min,18.0,15000.0,5000.0,300.0,2.0,12.0,0.0
25%,31.0,48825.5,66156.0,437.0,7.77,24.0,0.0
50%,43.0,82466.0,127556.0,574.0,13.46,36.0,0.0
75%,56.0,116219.0,188985.0,712.0,19.25,48.0,0.0
max,69.0,149999.0,249999.0,849.0,25.0,60.0,1.0


In [8]:
dataset["Default"].value_counts()

Default
0    225694
1     29653
Name: count, dtype: int64

In [9]:
# Partition the records by the value of the Default flag (1 vs. 0).
yes_data = dataset.loc[dataset["Default"].eq(1)]
no_data = dataset.loc[dataset["Default"].eq(0)]

In [10]:
# Row count of the smaller class; used as the per-class sample size below.
min_size = min(map(len, (yes_data, no_data)))
min_size

29653

In [11]:
# Draw min_size rows from each class with a fixed seed so that both classes
# end up with the same number of rows.
min_yes_data = yes_data.sample(min_size,random_state = 42)
min_no_data  = no_data.sample(min_size,random_state = 42)

In [12]:
# Stack the two equally sized class samples into a single frame.
balanced_data = pd.concat([min_yes_data,min_no_data])

In [13]:
# Shuffle the rows (fixed seed) so the two classes are interleaved.
balanced_data=  shuffle(balanced_data, random_state=42)

In [14]:
# Cap each class at 25,000 rows (fixed seed): label 1 -> final_yes, label 0 -> final_no.
final_yes, final_no = (
    balanced_data[balanced_data['Default'] == label].sample(n=25000, random_state=42)
    for label in (1, 0)
)

In [15]:
# Combine the two 25,000-row class samples.
new_reduced_data = pd.concat([final_yes, final_no])

In [16]:
# Shuffle again (fixed seed) so rows of both classes are mixed.
new_reduced_data = shuffle(new_reduced_data, random_state = 42)

In [17]:
new_reduced_data["Default"].value_counts()

Default
0    25000
1    25000
Name: count, dtype: int64

In [18]:
# Second name for the same object (no copy is made); the preprocessing cells
# below work on balanced_dataset.
balanced_dataset = new_reduced_data

# Preprocessing

In [19]:
balanced_dataset.isnull().sum()

Age               1
Income            0
LoanAmount        0
CreditScore       0
InterestRate      0
LoanTerm          0
Education         0
EmploymentType    0
MaritalStatus     0
LoanPurpose       1
Default           0
dtype: int64

In [20]:
balanced_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 185261 to 154607
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             49999 non-null  float64
 1   Income          50000 non-null  int64  
 2   LoanAmount      50000 non-null  int64  
 3   CreditScore     50000 non-null  int64  
 4   InterestRate    50000 non-null  float64
 5   LoanTerm        50000 non-null  int64  
 6   Education       50000 non-null  object 
 7   EmploymentType  50000 non-null  object 
 8   MaritalStatus   50000 non-null  object 
 9   LoanPurpose     49999 non-null  object 
 10  Default         50000 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 4.6+ MB


In [21]:
# Separate the target column from the predictor columns.
dependent = balanced_dataset["Default"]
independent = balanced_dataset.drop(columns=["Default"])

In [22]:
independent

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose
185261,57.0,109874,30973,567,13.95,60,Master's,Part-time,Divorced,Auto
200305,69.0,34006,112148,597,10.60,24,Master's,Part-time,Married,Home
231905,20.0,36779,135741,352,23.82,36,Bachelor's,Full-time,Divorced,Auto
148636,41.0,50628,82083,828,11.32,48,High School,Self-employed,Divorced,Auto
114870,28.0,114840,246968,415,22.65,24,High School,Self-employed,Divorced,Home
...,...,...,...,...,...,...,...,...,...,...
178659,20.0,143934,105532,521,14.66,24,Bachelor's,Self-employed,Married,Business
102536,47.0,139079,102610,669,17.28,24,PhD,Unemployed,Divorced,Auto
18591,32.0,106367,85207,466,2.44,36,Bachelor's,Self-employed,Divorced,Business
52610,29.0,94936,45806,313,24.81,48,PhD,Full-time,Single,Home


In [23]:
# Split the predictors by dtype: numeric columns vs. all remaining columns.
numerical_data = independent.select_dtypes(include="number")
categorical_data = independent.select_dtypes(exclude="number")

In [24]:
categorical_data

Unnamed: 0,Education,EmploymentType,MaritalStatus,LoanPurpose
185261,Master's,Part-time,Divorced,Auto
200305,Master's,Part-time,Married,Home
231905,Bachelor's,Full-time,Divorced,Auto
148636,High School,Self-employed,Divorced,Auto
114870,High School,Self-employed,Divorced,Home
...,...,...,...,...
178659,Bachelor's,Self-employed,Married,Business
102536,PhD,Unemployed,Divorced,Auto
18591,Bachelor's,Self-employed,Divorced,Business
52610,PhD,Full-time,Single,Home


In [25]:
numerical_data

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm
185261,57.0,109874,30973,567,13.95,60
200305,69.0,34006,112148,597,10.60,24
231905,20.0,36779,135741,352,23.82,36
148636,41.0,50628,82083,828,11.32,48
114870,28.0,114840,246968,415,22.65,24
...,...,...,...,...,...,...
178659,20.0,143934,105532,521,14.66,24
102536,47.0,139079,102610,669,17.28,24
18591,32.0,106367,85207,466,2.44,36
52610,29.0,94936,45806,313,24.81,48


In [26]:
# Imputer that fills missing numeric values with the column mean.
imputer = SimpleImputer(strategy = "mean")

In [27]:
# Fit the mean imputer on the numeric columns.
# NOTE(review): the next cell calls fit_transform on the same data, which fits
# the imputer again, so this separate fit looks redundant.
imputer.fit(numerical_data[::])

In [28]:
# Fill the missing numeric values and wrap the result in a DataFrame that
# reuses the original column names. No index is passed, so the new frame is
# indexed 0..n-1 (row order is unchanged).
numerical_filled_data = pd.DataFrame(imputer.fit_transform(numerical_data),
                                      columns = numerical_data.columns)

In [29]:
numerical_filled_data.isnull().sum()

Age             0
Income          0
LoanAmount      0
CreditScore     0
InterestRate    0
LoanTerm        0
dtype: int64

In [30]:
# Imputer that fills missing categorical values with the most frequent value of the column.
categorical_imputer = SimpleImputer(strategy = "most_frequent")

In [31]:
# Fill the missing categorical values and rebuild a DataFrame with the
# original column names. As with the numeric frame, the index becomes 0..n-1.
categorical_filled_data = pd.DataFrame(categorical_imputer.fit_transform(categorical_data),
                                        columns = categorical_data.columns)

In [32]:
categorical_filled_data.isnull().sum()

Education         0
EmploymentType    0
MaritalStatus     0
LoanPurpose       0
dtype: int64

In [33]:
# Put the imputed numeric and categorical columns back side by side.
independent = pd.concat([numerical_filled_data,categorical_filled_data], axis = 1)

In [34]:
dependent


185261    0
200305    1
231905    1
148636    1
114870    0
         ..
178659    1
102536    0
18591     0
52610     1
154607    1
Name: Default, Length: 50000, dtype: int64

In [35]:
# Give predictors and target the same positional 0..n-1 index. The target
# still carries the original row labels at this point, so without the reset
# the column-wise concat in the next cell would not line the rows up.
independent = independent.reset_index(drop=True)
dependent = dependent.reset_index(drop=True)

In [36]:
# Re-attach the target as the last column of the imputed predictors.
preprocessed_dataset = pd.concat([independent, dependent],axis = 1)
preprocessed_dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
0,57.0,109874.0,30973.0,567.0,13.95,60.0,Master's,Part-time,Divorced,Auto,0
1,69.0,34006.0,112148.0,597.0,10.60,24.0,Master's,Part-time,Married,Home,1
2,20.0,36779.0,135741.0,352.0,23.82,36.0,Bachelor's,Full-time,Divorced,Auto,1
3,41.0,50628.0,82083.0,828.0,11.32,48.0,High School,Self-employed,Divorced,Auto,1
4,28.0,114840.0,246968.0,415.0,22.65,24.0,High School,Self-employed,Divorced,Home,0
...,...,...,...,...,...,...,...,...,...,...,...
49995,20.0,143934.0,105532.0,521.0,14.66,24.0,Bachelor's,Self-employed,Married,Business,1
49996,47.0,139079.0,102610.0,669.0,17.28,24.0,PhD,Unemployed,Divorced,Auto,0
49997,32.0,106367.0,85207.0,466.0,2.44,36.0,Bachelor's,Self-employed,Divorced,Business,0
49998,29.0,94936.0,45806.0,313.0,24.81,48.0,PhD,Full-time,Single,Home,1


In [37]:
#Lable Encoding
encoder = LabelEncoder()

for column in categorical_filled_data:
    preprocessed_dataset[column] = encoder.fit_transform(preprocessed_dataset[column])


In [38]:
preprocessed_dataset

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
0,57.0,109874.0,30973.0,567.0,13.95,60.0,2,1,0,0,0
1,69.0,34006.0,112148.0,597.0,10.60,24.0,2,1,1,3,1
2,20.0,36779.0,135741.0,352.0,23.82,36.0,0,0,0,0,1
3,41.0,50628.0,82083.0,828.0,11.32,48.0,1,2,0,0,1
4,28.0,114840.0,246968.0,415.0,22.65,24.0,1,2,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
49995,20.0,143934.0,105532.0,521.0,14.66,24.0,0,2,1,1,1
49996,47.0,139079.0,102610.0,669.0,17.28,24.0,3,3,0,0,0
49997,32.0,106367.0,85207.0,466.0,2.44,36.0,0,2,0,1,0
49998,29.0,94936.0,45806.0,313.0,24.81,48.0,3,0,2,3,1


In [39]:
# # Converting Categorical to Numerical Dataset
# independent = pd.get_dummies(independent,dtype= int,drop_first = True)
# independent

In [40]:
# preprocessed_dataset_numerical = pd.concat([independent, dependent],axis = 1)
# preprocessed_dataset_numerical

# Univariate Analysis

In [41]:
preprocessed_dataset.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,40.527231,77895.91892,135393.1022,567.69586,14.546836,36.06624,1.46762,1.5539,0.99076,1.98012,0.5
std,14.950361,40100.315275,70732.184109,158.921488,6.623192,16.963983,1.114691,1.113732,0.823413,1.415586,0.500005
min,18.0,15004.0,5000.0,300.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,41732.0,75429.25,430.0,8.97,24.0,0.0,1.0,0.0,1.0,0.0
50%,39.0,76332.5,139267.0,565.0,15.055,36.0,1.0,2.0,1.0,2.0,0.5
75%,53.0,112750.25,197546.0,704.0,20.4,48.0,2.0,3.0,2.0,3.0,1.0
max,69.0,149999.0,249993.0,849.0,25.0,60.0,3.0,3.0,2.0,4.0,1.0


In [42]:
# Build the univariate summary table with the project-local Univariate helper
# (imported from utility). Later cells read the rows "Minimum", "Maximum",
# "Lesser" and "Greater" from it for every column.
obj = Univariate()
tble = obj.getUnivarateTbl(preprocessed_dataset, preprocessed_dataset.columns)
tble

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
Mean,40.527231,77895.91892,135393.1022,567.69586,14.546836,36.06624,1.46762,1.5539,0.99076,1.98012,0.5
Median,39.0,76332.5,139267.0,565.0,15.055,36.0,1.0,2.0,1.0,2.0,0.5
Mode,22.0,15293.0,206276.0,513.0,21.93,60.0,1.0,3.0,0.0,1.0,0.0
Q1:25%,27.0,41732.0,75429.25,430.0,8.97,24.0,0.0,1.0,0.0,1.0,0.0
Q2:50%,39.0,76332.5,139267.0,565.0,15.055,36.0,1.0,2.0,1.0,2.0,0.5
Q3:75%,53.0,112750.25,197546.0,704.0,20.4,48.0,2.0,3.0,2.0,3.0,1.0
Q4:100%,69.0,149999.0,249993.0,849.0,25.0,60.0,3.0,3.0,2.0,4.0,1.0
IQR,26.0,71018.25,122116.75,274.0,11.43,24.0,2.0,2.0,2.0,2.0,1.0
1.5Rule,39.0,106527.375,183175.125,411.0,17.145,36.0,3.0,3.0,3.0,3.0,1.5
Lesser,-12.0,-64795.375,-107745.875,19.0,-8.175,-12.0,-3.0,-2.0,-3.0,-2.0,-1.5


In [43]:
#Finding Outliers
# A column is listed when its minimum is below the "Lesser" bound, or its
# maximum is above the "Greater" bound, of the univariate table.
lesser_outlier = [
    name for name in preprocessed_dataset.columns
    if tble[name]["Minimum"] < tble[name]["Lesser"]
]
greater_outlier = [
    name for name in preprocessed_dataset.columns
    if tble[name]["Maximum"] > tble[name]["Greater"]
]

print(lesser_outlier)
print(greater_outlier)

[]
[]


In [44]:
#Removing the Outliers by replacing the lesser and greater values
# Values outside a bound are capped at that bound. The assignment is done in
# one .loc[row_mask, column] step on preprocessed_dataset - the frame the
# bounds in `tble` were computed from - so it writes into the frame itself
# rather than into a temporary selection.
for column in lesser_outlier:
    lower_bound = tble[column]["Lesser"]
    preprocessed_dataset.loc[preprocessed_dataset[column] < lower_bound, column] = lower_bound

for column in greater_outlier:
    upper_bound = tble[column]["Greater"]
    preprocessed_dataset.loc[preprocessed_dataset[column] > upper_bound, column] = upper_bound

In [45]:
# Recompute the univariate table after the capping step so the re-check in the
# next cell sees the current values.
tble = obj.getUnivarateTbl(preprocessed_dataset, preprocessed_dataset.columns)
tble

Unnamed: 0,Age,Income,LoanAmount,CreditScore,InterestRate,LoanTerm,Education,EmploymentType,MaritalStatus,LoanPurpose,Default
Mean,40.527231,77895.91892,135393.1022,567.69586,14.546836,36.06624,1.46762,1.5539,0.99076,1.98012,0.5
Median,39.0,76332.5,139267.0,565.0,15.055,36.0,1.0,2.0,1.0,2.0,0.5
Mode,22.0,15293.0,206276.0,513.0,21.93,60.0,1.0,3.0,0.0,1.0,0.0
Q1:25%,27.0,41732.0,75429.25,430.0,8.97,24.0,0.0,1.0,0.0,1.0,0.0
Q2:50%,39.0,76332.5,139267.0,565.0,15.055,36.0,1.0,2.0,1.0,2.0,0.5
Q3:75%,53.0,112750.25,197546.0,704.0,20.4,48.0,2.0,3.0,2.0,3.0,1.0
Q4:100%,69.0,149999.0,249993.0,849.0,25.0,60.0,3.0,3.0,2.0,4.0,1.0
IQR,26.0,71018.25,122116.75,274.0,11.43,24.0,2.0,2.0,2.0,2.0,1.0
1.5Rule,39.0,106527.375,183175.125,411.0,17.145,36.0,3.0,3.0,3.0,3.0,1.5
Lesser,-12.0,-64795.375,-107745.875,19.0,-8.175,-12.0,-3.0,-2.0,-3.0,-2.0,-1.5


In [46]:
#Cross checking Outliers
# Same test as before the capping step: both lists should now come out empty.
lesser_outlier = [
    name for name in preprocessed_dataset.columns
    if tble[name]["Minimum"] < tble[name]["Lesser"]
]
greater_outlier = [
    name for name in preprocessed_dataset.columns
    if tble[name]["Maximum"] > tble[name]["Greater"]
]

print(lesser_outlier)
print(greater_outlier)

[]
[]


In [47]:
preprocessed_dataset.isnull().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
InterestRate      0
LoanTerm          0
Education         0
EmploymentType    0
MaritalStatus     0
LoanPurpose       0
Default           0
dtype: int64

In [48]:
# Re-derive predictors and target from the fully preprocessed frame.
dependent = preprocessed_dataset["Default"]
independent = preprocessed_dataset.drop(columns=["Default"])

# Feature Selection

In [65]:
# One accuracy list per classifier. Each training function below empties its
# list and appends the accuracy of its latest run; generateTbl reads them.
lg =[]       # logistic regression
svm =[]      # SVM, linear kernel
svm_nl =[]   # SVM, RBF kernel
knn =[]      # k-nearest neighbours
dt =[]       # decision tree
nb =[]       # Gaussian naive Bayes
rf =[]       # random forest

# Objects written to model_data.pkl by build_model. The functions below store
# the 'selector', 'scaler' and 'model' entries in it.
data_to_save = {}
 
#Splitting into training and testing datasets
def Split_To_Training_Testing(features):
    """Split `features` and the global `dependent` 80/20 and standardise the features.

    The StandardScaler is fitted on the training part only and then applied to
    both parts. The fitted scaler is stored in data_to_save['scaler'].

    Returns:
        (x_train, x_test, y_train, y_test), with both x parts scaled.
    """
    raw_train, raw_test, y_train, y_test = train_test_split(
        features, dependent, test_size=0.20, random_state=0
    )
    scaler = StandardScaler().fit(raw_train)
    scaled_train = scaler.transform(raw_train)
    scaled_test = scaler.transform(raw_test)
    data_to_save['scaler'] = scaler
    return scaled_train, scaled_test, y_train, y_test

#Select K Best Algorithm
def selectKBest(n):
    """Keep the `n` best chi-squared features and evaluate every classifier on them.

    Reads the globals `independent` and `dependent`, prints the names of the
    selected columns, stores the fitted selector in data_to_save['selector']
    and returns the accuracy table produced by build_model.

    Args:
        n: number of features to keep. Values larger than the number of
            available columns are reduced to that number; a string such as
            "all" is passed through to SelectKBest unchanged.

    NOTE(review): the selector is fitted on all rows, before build_model
    splits them into train and test, so the test rows influence which
    features are chosen - confirm this is acceptable. chi2 is also applied to
    the raw, unscaled columns, including the label-encoded ones.
    """
    # Never request more features than exist: the notebook calls this with
    # 15 and 18 although `independent` has fewer columns than that.
    k = n if isinstance(n, str) else min(n, independent.shape[1])
    kbest = SelectKBest(score_func=chi2, k=k)
    features = kbest.fit_transform(independent, dependent)
    # Store the selector once it is fitted, so the pickled object is usable.
    data_to_save['selector'] = kbest
    print(independent.columns[kbest.get_support()])
    return build_model(features)

#Classification Models
def build_model(features):
    """Train and score every classifier on `features`, then persist the artefacts.

    Steps: split and scale the data, run the seven training functions in a
    fixed order, build the accuracy table, print data_to_save and pickle it to
    'model_data.pkl' in the working directory.

    Returns the accuracy table from generateTbl.
    """
    x_train, x_test, y_train, y_test = Split_To_Training_Testing(features)
    trainers = (
        logistic_regression,
        svm_linear,
        svm_non_linear,
        knn_regresssion,
        decisionTree,
        naive_baye,
        random_forest,
    )
    for train_and_score in trainers:
        train_and_score(x_train, x_test, y_train, y_test)
    summary = generateTbl()
    print("loading..")
    print(data_to_save)
    with open('model_data.pkl', 'wb') as handle:
        pickle.dump(data_to_save, handle)
    return summary
    
def logistic_regression(x_train, x_test, y_train, y_test):
    """Fit a logistic-regression classifier and record its test accuracy.

    The global list `lg` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    lg.clear()
    classifier = LogisticRegression(random_state = 42)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl), so no
    # confusion matrix or classification report is built here.
    lg.append(accuracy_score(y_test, predictions))

def svm_linear(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVM and record its test accuracy.

    The global list `svm` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    # This function works on `svm`; the list is only mutated, never rebound,
    # so no `global` declaration is needed.
    svm.clear()
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    svm.append(accuracy_score(y_test, predictions))
    
def svm_non_linear(x_train, x_test, y_train, y_test):
    """Fit an RBF-kernel SVM, keep it for pickling and record its test accuracy.

    The fitted classifier is stored in data_to_save['model']. The global list
    `svm_nl` is emptied and then receives the accuracy on (x_test, y_test) as
    its only element. Returns None.
    """
    svm_nl.clear()
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(x_train, y_train)
    # This is the model that build_model writes to model_data.pkl.
    data_to_save['model'] = classifier
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    svm_nl.append(accuracy_score(y_test, predictions))
    
def knn_regresssion(x_train, x_test, y_train, y_test):
    """Fit a 5-nearest-neighbours classifier and record its test accuracy.

    The global list `knn` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    knn.clear()
    # Minkowski metric with p = 2.
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    knn.append(accuracy_score(y_test, predictions))
    
def decisionTree(x_train, x_test, y_train, y_test):
    """Fit an entropy-criterion decision tree and record its test accuracy.

    The global list `dt` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    dt.clear()
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    dt.append(accuracy_score(y_test, predictions))
    
def naive_baye(x_train, x_test, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and record its test accuracy.

    The global list `nb` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    nb.clear()
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    nb.append(accuracy_score(y_test, predictions))
    
def random_forest(x_train, x_test, y_train, y_test):
    """Fit a 10-tree, entropy-criterion random forest and record its test accuracy.

    The global list `rf` is emptied and then receives the accuracy on
    (x_test, y_test) as its only element. Returns None.
    """
    rf.clear()
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # Accuracy is the only metric read afterwards (by generateTbl).
    rf.append(accuracy_score(y_test, predictions))

def generateTbl():
    """Collect the latest accuracy of every classifier into a one-row table.

    Reads the global lists lg, svm, svm_nl, knn, nb, dt and rf, which the
    training functions fill, and rounds each accuracy to two decimals.

    Returns:
        DataFrame with the single row 'ChiSquare' and the columns
        'Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'.
    """
    score_lists = {
        'Logistic': lg,
        'SVMl': svm,
        'SVMnl': svm_nl,
        'KNN': knn,
        'Navie': nb,
        'Decision': dt,
        'Random': rf,
    }
    table = pd.DataFrame(index=['ChiSquare'], columns=list(score_lists))
    for position, row_label in enumerate(table.index):
        for column_name, accuracies in score_lists.items():
            # A single .loc[row, column] assignment writes into `table`
            # itself; the chained form table[column][row] = ... may write to
            # a temporary and leave the table unchanged.
            table.loc[row_label, column_name] = round(accuracies[position], 2)
    return table
        

In [66]:
# K = 5: keep the 5 highest-scoring features, then train and score every model.
# Each call also overwrites model_data.pkl (see build_model).
dataframe = selectKBest(5)
dataframe

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'InterestRate'], dtype='object')
loading..
{'selector': SelectKBest(k=5, score_func=<function chi2 at 0x0000021309486980>), 'scaler': StandardScaler(), 'model': SVC(random_state=0)}


Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.65,0.65,0.66,0.62,0.65,0.57,0.62


In [64]:
# K = 7: keep the 7 highest-scoring features, then train and score every model.
dataframe = selectKBest(7)
dataframe

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'InterestRate',
       'Education', 'EmploymentType'],
      dtype='object')


Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.66,0.66,0.66,0.61,0.66,0.57,0.62


In [52]:
# K = 15
# NOTE(review): `independent` has only 10 feature columns at this point, so a
# request for 15 cannot select more than all of them.
dataframe = selectKBest(15)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.66,0.66,0.66,0.61,0.66,0.58,0.63


In [53]:
# K = 18
# NOTE(review): as with K = 15, this exceeds the 10 available feature columns;
# the accuracies shown for K = 15 and K = 18 are identical.
dataframe = selectKBest(18)
dataframe

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.66,0.66,0.66,0.61,0.66,0.58,0.63


# Using Grid Search CV

In [54]:
# lg =[]
# svm =[]
# knn =[]
# dt =[]
# nb =[]
# rf =[]

# #Select K Best Algorithm
# def selectKBestGrid(n):    
#     kbest = SelectKBest(score_func  = chi2, k = n)
#     kbestModel = kbest.fit(independent,dependent)
#     features = kbest.transform(independent)    
#     return build_modelGrid(features)    
 
# #Classificaiton Models

# def build_modelGrid(features):   
#     x_train, x_test, y_train, y_test =  Split_To_Training_Testing(features)
#     logistic_regression(x_train, x_test, y_train, y_test)    
#     svm_linear(x_train, x_test, y_train, y_test)
#     svm_non_linear(x_train, x_test, y_train, y_test)
#     knn_regresssion(x_train, x_test, y_train, y_test)
#     decisionTree(x_train, x_test, y_train, y_test)
#     naive_baye(x_train, x_test, y_train, y_test)
#     random_forest(x_train, x_test, y_train, y_test)  
#     dataframe1 = generateTbl()
#     return dataframe1 
    
# def logistic_regression(x_train, x_test, y_train, y_test):     
#     global lg
#     lg.clear()
#     lg_params = {       
#         "C"   : [1.0,5.0,10.0],   
#     }
    
#     lg_classifier = GridSearchCV(LogisticRegression(),lg_params, refit = True)  
#     lg_classifier.fit(x_train,y_train)     
#     print(f"Logistic Regression : {lg_classifier.best_params_}")
#     y_pred = lg_classifier.predict(x_test)
#     lg_cm = confusion_matrix(y_test, y_pred)
#     lg_accuracy = accuracy_score(y_test, y_pred )
#     lg_classification = classification_report(y_test, y_pred)  
#     lg.append(lg_accuracy)   

# def svm_linear(x_train, x_test, y_train, y_test):
#     global lg
#     svm.clear()
#     svc_params = {   
#        "kernel" : ["rbf","sigmoid","linear"],   
#        "C"      : [100,150],     
#        "gamma"   : ["scale", "auto"]
#     }
#     svc_classifier = GridSearchCV(SVC(),svc_params, refit = True)  
#     svc_classifier.fit(x_train,y_train)     
#     print(f"SVM : {svc_classifier.best_params_}")
#     y_pred = svc_classifier.predict(x_test)
#     svm_cm = confusion_matrix(y_test, y_pred)
#     svm_accuracy = accuracy_score(y_test, y_pred )
#     svm_classification = classification_report(y_test, y_pred)
#     svm.append(svm_accuracy) 
    
# def knn_regresssion(x_train, x_test, y_train, y_test):
#     global knn
#     knn.clear()
#     knn_params = {    
#         "n_neighbors" : [5,10,100,150,200],
#         "algorithm"   : ["auto", "ball_tree", "kd_tree", "brute"],
#         "metric" : ["minkowski"]    
#     }
    
#     knn_classifier = GridSearchCV(KNeighborsClassifier(),knn_params, refit = True)  
#     knn_classifier.fit(x_train,y_train) 
#     print(f"KNN : {knn_classifier.best_params_}")    
#     y_pred = knn_classifier.predict(x_test)
#     knn_cm = confusion_matrix(y_test, y_pred)
#     knn_accuracy = accuracy_score(y_test, y_pred )
#     knn_classification = classification_report(y_test, y_pred)        
#     knn.append(knn_accuracy)

    
# def decisionTree(x_train, x_test, y_train, y_test):  
#     global dt
#     dt.clear()
#     dt_params = {
#         "criterion"    : ["gini", "entropy", "log_loss"],
#         "splitter"     : ["best", "random"],
#         "max_features" : ["sqrt", "log2"]
#     }    
#     dt_classifier = GridSearchCV(DecisionTreeClassifier(),dt_params, refit = True)  
#     dt_classifier.fit(x_train,y_train) 
#     print(f"Decision Tree : {dt_classifier.best_params_}")        
#     y_pred = dt_classifier.predict(x_test)
#     des_cm = confusion_matrix(y_test, y_pred)
#     des_accuracy = accuracy_score(y_test, y_pred )
#     des_classification = classification_report(y_test, y_pred)   
#     dt.append(des_accuracy)    
    
# def naive_baye(x_train, x_test, y_train, y_test):   
#     global nb
#     nb.clear()
#     nav_regression = GaussianNB()
#     nav_regression.fit(x_train,y_train)
#     y_pred = nav_regression.predict(x_test)
#     nav_cm = confusion_matrix(y_test, y_pred)
#     nav_accuracy = accuracy_score(y_test, y_pred )
#     nav_classification = classification_report(y_test, y_pred)    
#     nb.append(nav_accuracy)        

    
# def random_forest(x_train, x_test, y_train, y_test):
#     global rf
#     rf.clear()
#     rf_params = {
#         "criterion"    : ["gini", "entropy", "log_loss"],
#         "n_estimators" : [50,100,250,500,1000],
#         "max_features" : ["sqrt", "log2",None]
#     }    
#     rf_classifier = GridSearchCV(RandomForestClassifier(),rf_params, refit = True)  
#     rf_classifier.fit(x_train,y_train)
#     print(f"Random Forest : {rf_classifier.best_params_}")    
#     print(rf_classifier.best_params_) 
#     y_pred = rf_classifier.predict(x_test)
#     rf_cm = confusion_matrix(y_test, y_pred)
#     rf_accuracy = accuracy_score(y_test, y_pred )
#     rf_classification = classification_report(y_test, y_pred)
#     rf.append(rf_accuracy)     

# def generateTbl():
#     result=dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Logistic','SVM','KNN','Navie','Decision','Random'])
#     for number,index in enumerate(dataframe.index):   
#         dataframe['Logistic'][index]=lg[number]       
#         dataframe['SVM'][index]=svm[number]     
#         dataframe['KNN'][index]=knn[number]
#         dataframe['Navie'][index]=nb[number]
#         dataframe['Decision'][index]=dt[number]
#         dataframe['Random'][index]=rf[number]
#     return dataframe     
        

In [55]:
# #k = 7
# dataframe = selectKBestGrid(7)
# dataframe

In [56]:
# #k = 10
# dataframe = selectKBestGrid(10)
# dataframe

In [57]:
# #k = 15
# dataframe = selectKBestGrid(15)
# dataframe