#Run 5 Naïve Bayes techniques to find the best parameters and performance


In [54]:
import pandas as pd
from sklearn import preprocessing
from IPython.display import display, HTML

df_loan20K=pd.read_csv('Data_Loans_20K.csv', header=0)
df_loan200K=pd.read_csv('Data_Loans_200K.csv', header=0)

print(df_loan20K.shape)
print(df_loan200K.shape)

cols_loan20K = df_loan20K.columns
cols_loan200K = df_loan200K.columns

print('Column Name, DataTypes, MissingValues in Loan 20K CSV\n')
for i in cols_loan20K:
    print(i,',', df_loan20K[i].dtype , ',', df_loan20K[i].isnull().any())

##If Yes, fill in missing values by mean values or most frequent nominal values.

df_loan20K["Credit Score"].fillna(df_loan20K["Credit Score"].mean(), inplace=True)
df_loan20K["Annual Income"].fillna(df_loan20K["Annual Income"].mean(), inplace=True)
df_loan20K["Years in current job"].fillna(df_loan20K["Years in current job"].mode().iloc[0], inplace=True)
df_loan20K["Months since last delinquent"].fillna(df_loan20K["Months since last delinquent"].mean(), inplace=True)
df_loan20K["Bankruptcies"].fillna(df_loan20K["Bankruptcies"].mean(), inplace=True)
df_loan20K["Tax Liens"].fillna(df_loan20K["Tax Liens"].mean(), inplace=True)

print('\nColumn Name, DataTypes, MissingValues after filling with Mean and Frequent repeated nominal value in Loan 20K CSV\n')
for i in cols_loan20K:
    print(i,',', df_loan20K[i].dtype , ',', df_loan20K[i].isnull().any())    
    
print('Column Name, DataTypes, MissingValues in Loan 200K CSV\n')
for i in cols_loan200K:
    print(i,',', df_loan200K[i].dtype , ',', df_loan200K[i].isnull().any())

##If Yes, fill in missing values by mean values or most frequent nominal values.

df_loan200K["Credit Score"].fillna(df_loan200K["Credit Score"].mean(), inplace=True)
df_loan200K["Annual Income"].fillna(df_loan200K["Annual Income"].mean(), inplace=True)
df_loan200K["Years in current job"].fillna(df_loan200K["Years in current job"].mode().iloc[0], inplace=True)
df_loan200K["Months since last delinquent"].fillna(df_loan200K["Months since last delinquent"].mean(), inplace=True)
df_loan200K["Maximum Open Credit"].fillna(df_loan200K["Maximum Open Credit"].mean(), inplace=True)
df_loan200K["Bankruptcies"].fillna(df_loan200K["Bankruptcies"].mean(), inplace=True)
df_loan200K["Tax Liens"].fillna(df_loan200K["Tax Liens"].mean(), inplace=True)

print('\nColumn Name, DataTypes, MissingValues after filling with Mean and Frequent repeated nominal value in Loan 200K CSV\n')
for i in cols_loan200K:
    print(i,',', df_loan200K[i].dtype , ',', df_loan200K[i].isnull().any()) 
   
# encode labels
y = df_loan20K['Term'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers #####################################
df_loan20K['Term'] = y_encoded

(20000, 17)
(200000, 18)
Column Name, DataTypes, MissingValues in Loan 20K CSV

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , True
Annual Income , float64 , True
Years in current job , object , True
Home Ownership , object , False
Purpose , object , False
Monthly Debt , float64 , False
Years of Credit History , float64 , False
Months since last delinquent , float64 , True
Number of Open Accounts , int64 , False
Number of Credit Problems , int64 , False
Current Credit Balance , int64 , False
Maximum Open Credit , int64 , False
Bankruptcies , float64 , True
Tax Liens , float64 , True

Column Name, DataTypes, MissingValues after filling with Mean and Frequent repeated nominal value in Loan 20K CSV

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , False
Annual Income , float64 , False
Years in current job , object , False
Home Ownership , object , False
Purpose ,

In [55]:
# Data preprocessing ################################################################################
print('Column Datatypes:\n',df_loan20K.dtypes)

# convert all nominal variables to binary variables
df_num=df_loan20K.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_num[['Loan Status','Years in current job','Home Ownership','Purpose']])
# add them to dataframe
df_num=df_num.join(df_dummies)
# drop original columns
df_num=df_num.drop('Loan Status',axis=1)
df_num=df_num.drop('Years in current job',axis=1)
df_num=df_num.drop('Home Ownership', axis=1)
df_num=df_num.drop('Purpose', axis=1)

# drop extra binary columns, since we only need N-1 binary columns
df_num=df_num.drop('Loan Status_Charged Off', axis=1)
df_num=df_num.drop('Years in current job_10+ years', axis=1)
df_num=df_num.drop('Home Ownership_Own Home', axis=1)
df_num=df_num.drop('Purpose_wedding', axis=1)

# print out and display dataframe as table in HTML
display(HTML(df_num.head(10).to_html()))

# standarized data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_num_std = df_num.copy(deep=True) 
x_features = df_num_std.loc[:, df_num_std.columns != 'Term']
cols = x_features.columns
df_num_std = pd.DataFrame(scaler.fit_transform(x_features), columns = cols)
df_num_std['Term'] = y_encoded
display('df_num_std:',HTML(df_num_std.head(10).to_html()))

# binary features
df_binary = df_num.copy(deep=True)
numCols = [0,2,3,4,5,6,7,8,9,10,11,12]
df_numerical = df_binary.iloc[:,numCols]
df_dummy = df_binary.drop(df_binary.columns[numCols], axis=1)
display('df_numerical:',HTML(df_numerical.head(10).to_html()))
display('df_dummy:',HTML(df_dummy.head(10).to_html()))

group_names = ['L','M','H']
for col in df_numerical.columns:
    df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names)
display('df_numerical:',HTML(df_numerical.head(10).to_html()))

df_dummies=pd.get_dummies(df_numerical)
display('df_dummies:',HTML(df_dummies.head(10).to_html()))

cols_removed = ['Current Loan Amount_L','Credit Score_L', 'Annual Income_L', 'Monthly Debt_L', 'Years of Credit History_L', 'Months since last delinquent_L', 'Number of Open Accounts_L', 'Number of Credit Problems_L', 'Current Credit Balance_L', 'Maximum Open Credit_L', 'Bankruptcies_L', 'Tax Liens_L' ]
df_dummies = df_dummies.drop(cols_removed, axis=1)

# merge two dataframes
df_binary = pd.concat([df_dummies, df_dummy], axis=1)
display('df_binary:',HTML(df_binary.head(10).to_html()))

Column Datatypes:
 Loan Status                      object
Current Loan Amount               int64
Term                              int32
Credit Score                    float64
Annual Income                   float64
Years in current job             object
Home Ownership                   object
Purpose                          object
Monthly Debt                    float64
Years of Credit History         float64
Months since last delinquent    float64
Number of Open Accounts           int64
Number of Credit Problems         int64
Current Credit Balance            int64
Maximum Open Credit               int64
Bankruptcies                    float64
Tax Liens                       float64
dtype: object


Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,445412,1,709.0,1167493.0,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,262328,1,1094.310471,1376165.0,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,99999999,1,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,347666,0,721.0,806949.0,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,176220,1,1094.310471,1376165.0,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,206602,1,7290.0,896857.0,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,217646,1,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,648714,0,1094.310471,1376165.0,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,548746,1,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,215952,1,739.0,1454735.0,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


'df_num_std:'

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Term
0,-0.351545,-0.284234,-0.256844,-1.082788,-0.141489,0.0,-1.023607,1.748786,-0.161907,-0.05447,2.485469,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,4.633779,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,-1.942491,-0.036767,4.06175,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
1,-0.35736,0.0,0.0,1.208568,0.422207,-1.8114,4.743922,-0.357677,-0.157709,0.010313,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
2,2.810785,-0.260628,1.05327,0.87439,-0.473926,-0.412985,1.362957,1.748786,0.002173,-0.004716,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,4.633779,-0.205582,-0.301347,-0.050063,-0.97102,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
3,-0.35465,-0.275382,-0.700619,-0.794981,-0.893085,0.0,-0.426966,-0.357677,-0.095766,-0.058916,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,3.45252,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,0
4,-0.360096,0.0,0.0,0.175848,-1.745856,0.0,0.766316,-0.357677,-0.102509,-0.052913,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,3.681853,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
5,-0.359131,4.570401,-0.589956,-0.172732,-0.127036,0.0,-1.023607,-0.357677,-0.192186,-0.076007,-0.340743,-0.119026,-1.844291,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
6,-0.35878,-0.268742,-0.236288,-0.622551,0.205401,-1.678218,0.368555,1.748786,-0.411107,-0.076066,2.485469,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
7,-0.345087,0.0,0.0,-0.300156,-1.442327,-1.8114,0.766316,-0.357677,-0.243902,0.012316,-0.340743,-0.119026,-1.844291,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,1.029845,-0.861399,-0.122566,12.130464,-0.113643,-1.942491,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,0
8,-0.348262,-0.307102,1.456025,0.014333,0.639014,-0.14662,-1.421368,-0.357677,0.329304,-0.033829,-0.340743,-0.119026,0.542214,-0.261488,3.178827,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
9,-0.358834,-0.262103,0.096707,1.696664,-0.618463,0.0,1.760717,-0.357677,0.875535,0.035787,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1


'df_numerical:'

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,445412,709.0,1167493.0,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0
1,262328,1094.310471,1376165.0,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0
2,99999999,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0
3,347666,721.0,806949.0,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0
4,176220,1094.310471,1376165.0,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0
5,206602,7290.0,896857.0,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0
6,217646,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0
7,648714,1094.310471,1376165.0,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0
8,548746,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0
9,215952,739.0,1454735.0,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0


'df_dummy:'

Unnamed: 0,Term,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names)


'df_numerical:'

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,L,L,L,L,L,L,L,L,L,L,L,L
1,L,L,L,L,L,L,M,L,L,L,L,L
2,H,L,L,L,L,L,L,L,L,L,L,L
3,L,L,L,L,L,L,L,L,L,L,L,L
4,L,L,L,L,L,L,L,L,L,L,L,L
5,L,H,L,L,L,L,L,L,L,L,L,L
6,L,L,L,L,L,L,L,L,L,L,L,L
7,L,L,L,L,L,L,L,L,L,L,L,L
8,L,L,L,L,L,L,L,L,L,L,L,L
9,L,L,L,L,L,L,M,L,L,L,L,L


'df_dummies:'

Unnamed: 0,Current Loan Amount_L,Current Loan Amount_M,Current Loan Amount_H,Credit Score_L,Credit Score_M,Credit Score_H,Annual Income_L,Annual Income_M,Annual Income_H,Monthly Debt_L,Monthly Debt_M,Monthly Debt_H,Years of Credit History_L,Years of Credit History_M,Years of Credit History_H,Months since last delinquent_L,Months since last delinquent_M,Months since last delinquent_H,Number of Open Accounts_L,Number of Open Accounts_M,Number of Open Accounts_H,Number of Credit Problems_L,Number of Credit Problems_M,Number of Credit Problems_H,Current Credit Balance_L,Current Credit Balance_M,Current Credit Balance_H,Maximum Open Credit_L,Maximum Open Credit_M,Maximum Open Credit_H,Bankruptcies_L,Bankruptcies_M,Bankruptcies_H,Tax Liens_L,Tax Liens_M,Tax Liens_H
0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
2,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
3,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
4,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
5,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
6,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
7,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
8,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
9,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


'df_binary:'

Unnamed: 0,Current Loan Amount_M,Current Loan Amount_H,Credit Score_M,Credit Score_H,Annual Income_M,Annual Income_H,Monthly Debt_M,Monthly Debt_H,Years of Credit History_M,Years of Credit History_H,Months since last delinquent_M,Months since last delinquent_H,Number of Open Accounts_M,Number of Open Accounts_H,Number of Credit Problems_M,Number of Credit Problems_H,Current Credit Balance_M,Current Credit Balance_H,Maximum Open Credit_M,Maximum Open Credit_H,Bankruptcies_M,Bankruptcies_H,Tax Liens_M,Tax Liens_H,Term,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [58]:
# 03. CategoricalNB

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import make_scorer, precision_score

y = df_binary['Term']
x = df_binary.drop('Term', axis=1)
clf = CategoricalNB(alpha=1)

precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
prec=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)



Traceback (most recent call last):
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 83, in predict
    jll = self._joint_log_likelihood(X)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 1461, in _joint_log_likelihood
    jll += self.feature_log_prob_[i][:, indices].T
IndexError: index 1 is out

N-fold Cross Validation: accuracy =  nan , precision =  nan


Traceback (most recent call last):
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 68, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 83, in predict
    jll = self._joint_log_likelihood(X)
  File "C:\Users\vikas\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 1461, in _joint_log_likelihood
    jll += self.feature_log_prob_[i][:, indices].T
IndexError: index 1 is out

In [59]:
# 04. BernoulliNB

from sklearn.naive_bayes import BernoulliNB

# note that there is an argument, 'binarize', to help you convert numerical ones to binary features
# to set a threshold to this argument, your original features should be in same scale
clf = BernoulliNB(alpha=1)

precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
prec=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

N-fold Cross Validation: accuracy =  0.71105 , precision =  0.5926864252714714


In [60]:
# 05. GaussianNB

from sklearn.naive_bayes import GaussianNB

display('df_num:',HTML(df_num.head(10).to_html()))

y = df_num['Term']
x = df_num.drop('Term', axis=1)
clf = GaussianNB()

precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
prec=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

display('df_num_std:',HTML(df_num_std.head(10).to_html()))

y = df_num_std['Term']
x = df_num_std.drop('Term', axis=1)
clf = GaussianNB()

precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
prec=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

'df_num:'

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,445412,1,709.0,1167493.0,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,262328,1,1094.310471,1376165.0,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,99999999,1,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,347666,0,721.0,806949.0,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,176220,1,1094.310471,1376165.0,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,206602,1,7290.0,896857.0,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,217646,1,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,648714,0,1094.310471,1376165.0,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,548746,1,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,215952,1,739.0,1454735.0,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


N-fold Cross Validation: accuracy =  0.57165 , precision =  0.5739006554842075


'df_num_std:'

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Term
0,-0.351545,-0.284234,-0.256844,-1.082788,-0.141489,0.0,-1.023607,1.748786,-0.161907,-0.05447,2.485469,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,4.633779,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,-1.942491,-0.036767,4.06175,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
1,-0.35736,0.0,0.0,1.208568,0.422207,-1.8114,4.743922,-0.357677,-0.157709,0.010313,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
2,2.810785,-0.260628,1.05327,0.87439,-0.473926,-0.412985,1.362957,1.748786,0.002173,-0.004716,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,4.633779,-0.205582,-0.301347,-0.050063,-0.97102,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
3,-0.35465,-0.275382,-0.700619,-0.794981,-0.893085,0.0,-0.426966,-0.357677,-0.095766,-0.058916,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,3.45252,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,0
4,-0.360096,0.0,0.0,0.175848,-1.745856,0.0,0.766316,-0.357677,-0.102509,-0.052913,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,3.681853,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
5,-0.359131,4.570401,-0.589956,-0.172732,-0.127036,0.0,-1.023607,-0.357677,-0.192186,-0.076007,-0.340743,-0.119026,-1.844291,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
6,-0.35878,-0.268742,-0.236288,-0.622551,0.205401,-1.678218,0.368555,1.748786,-0.411107,-0.076066,2.485469,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,1.029845,-0.861399,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
7,-0.345087,0.0,0.0,-0.300156,-1.442327,-1.8114,0.766316,-0.357677,-0.243902,0.012316,-0.340743,-0.119026,-1.844291,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,1.029845,-0.861399,-0.122566,12.130464,-0.113643,-1.942491,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,0
8,-0.348262,-0.307102,1.456025,0.014333,0.639014,-0.14662,-1.421368,-0.357677,0.329304,-0.033829,-0.340743,-0.119026,0.542214,-0.261488,3.178827,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,-0.301347,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1
9,-0.358834,-0.262103,0.096707,1.696664,-0.618463,0.0,1.760717,-0.357677,0.875535,0.035787,-0.340743,-0.119026,0.542214,-0.261488,-0.314581,-0.289643,-0.259409,-0.271602,-0.250736,-0.243676,-0.215807,-0.205582,3.318435,-0.050063,-0.97102,1.160902,-0.122566,-0.082437,-0.113643,0.514803,-0.036767,-0.246199,-0.103259,-0.181965,-0.078342,-0.057541,-0.032421,-0.25006,-0.010001,-0.052513,-0.036767,1


N-fold Cross Validation: accuracy =  0.2989 , precision =  0.570899253692431


In [61]:
# 06. MultinomialNB 

from sklearn.naive_bayes import MultinomialNB 

display('df_num:',HTML(df_num.head(10).to_html()))

y = df_num['Term']
x = df_num.drop('Term', axis=1)
clf = MultinomialNB()

precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
prec=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

'df_num:'

Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,445412,1,709.0,1167493.0,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,262328,1,1094.310471,1376165.0,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,99999999,1,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,347666,0,721.0,806949.0,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,176220,1,1094.310471,1376165.0,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,206602,1,7290.0,896857.0,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,217646,1,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,648714,0,1094.310471,1376165.0,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,548746,1,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,215952,1,739.0,1454735.0,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


N-fold Cross Validation: accuracy =  0.34754999999999997 , precision =  0.5405248946405947
