In [1]:
# Table of Contents

# 01. Introductions
# 02. Naive Bayes Classifiers
# 03. CategoricalNB
# 04. BernoulliNB
# 05. GaussianNB
# 06. MultinomialNB 
# 07. ComplementNB


In [2]:
# 01. Introductions

# Classification is a kind of supervised learning. It predicts the value of a nominal variable,
# which is called the 'label'. The variables used to make the prediction are called features.

import pandas as pd
from sklearn import preprocessing
from IPython.display import display, HTML

# Load the student records (the CSV is expected in the notebook's working directory).
df = pd.read_csv('data_students_10k.csv')
print(df.shape)
# strip stray whitespace from the column names
df = df.rename(columns=lambda x: x.strip())
cols = df.columns
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables by using mean value #################################
# The filled column is assigned back instead of calling fillna(inplace=True) on df[col]:
# the in-place form mutates a column selection and does not reliably update df itself
# on recent pandas versions.
mean_filled_cols = ["Age", "Hours on Assignments", "Hours on Games", "Exam", "Grade"]
for col in mean_filled_cols:
    df[col] = df[col].fillna(df[col].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())

# remove column ID and grade which are not appropriate to be included in this classification task
# (columns= is used because passing the axis positionally, as in drop('ID', 1), is deprecated)
# NOTE(review): Grade presumably determines GradeLetter directly, which is why it is dropped - confirm.
df = df.drop(columns=['ID', 'Grade'])

# encode labels
y = df['GradeLetter'] # define label as nominal values
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y) # encode nominal labels to integers #################################
df['GradeLetter'] = y_encoded

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

(10000, 12)


Unnamed: 0,ID,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,GradeLetter
0,1,India,0,25,BS,14,2,14,6,43.67,51.73,F
1,2,India,0,24,BS,14,2,14,6,62.01,72.23,C
2,3,India,0,26,BS,14,2,14,6,45.03,54.37,F
3,4,India,0,21,BS,14,2,14,6,48.86,57.68,F
4,5,France,1,23,BS,14,2,2,7,80.37,88.41,A
5,6,Spain,1,18,PHD,12,1,7,4,89.29,89.7,A
6,7,India,1,22,MS,13,0,13,3,76.64,80.27,B
7,8,India,1,19,MS,13,0,13,3,89.34,86.9,B
8,9,India,1,25,MS,13,0,13,3,81.73,78.61,C
9,10,India,1,18,MS,13,0,13,3,75.28,80.79,B


ColumnName, DataType, MissingValues
ID , int64 , False
Nationality , object , False
Gender , int64 , False
Age , int64 , False
Degree , object , False
Hours on Readings , int64 , False
Hours on Assignments , int64 , False
Hours on Games , int64 , False
Hours on Internet , int64 , False
Exam , float64 , False
Grade , float64 , False
GradeLetter , object , False


  df=df.drop('ID',1)
  df=df.drop('Grade',1)


Unnamed: 0,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter
0,India,0,25,BS,14,2,14,6,43.67,3
1,India,0,24,BS,14,2,14,6,62.01,2
2,India,0,26,BS,14,2,14,6,45.03,3
3,India,0,21,BS,14,2,14,6,48.86,3
4,France,1,23,BS,14,2,2,7,80.37,0
5,Spain,1,18,PHD,12,1,7,4,89.29,0
6,India,1,22,MS,13,0,13,3,76.64,1
7,India,1,19,MS,13,0,13,3,89.34,1
8,India,1,25,MS,13,0,13,3,81.73,2
9,India,1,18,MS,13,0,13,3,75.28,1


In [37]:
# 02. Naive Bayes Classifier ################################################################################
# There are five Naive Bayes algorithms provided by scikit-learn, https://scikit-learn.org/stable/modules/naive_bayes.html

# 03. CategoricalNB
# It is used to deal with categorical features only. This is the one we introduced in the class.
# You need to convert your feature space to binary features in order to utilize this algorithm.
# For nominal variables, just create N-1 binary variables.
# For numerical variables, convert them to nominal ones, and then to N-1 binary variables.

# 04. BernoulliNB
# It is used to deal with binary features.
# You can perform the same preprocessing as above.

# 05. GaussianNB
# It is used to deal with numerical features.
# It uses a different likelihood estimation function.
# For numerical variables, you do not need any preprocessing.
# For nominal variables, you should convert them to N-1 binary variables.
# You may try standardized features, which may bring extra improvements.

# 06. MultinomialNB 
# It can deal with mixed types of features.
# However, scikit-learn cannot deal with nominal features directly
# Therefore, you should perform similar preprocessing as above

# 07. ComplementNB
# It is an improved version of MultinomialNB.
# It can address the issues caused by imbalanced data.
# It needs similar preprocessing as above.


# Therefore, let's test different preprocessing
# df_binary = all binary features, for CategoricalNB and BernoulliNB
# df_num = all numerical features, for GaussianNB, MultinomialNB, ComplementNB
# df_num_std = standardized numerical features

# Data preprocessing ################################################################################
# Builds three frames from df, each keeping the encoded 'GradeLetter' label column:
#   df_num     - numeric columns plus N-1 indicator columns for Degree and Nationality
#   df_num_std - df_num with every feature column standardized
#   df_binary  - numeric columns binned into L/M/H indicators plus the columns above
print('Column Datatypes:\n',df.dtypes)

# convert all nominal variables to binary variables
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas default dtype
df_dummies = pd.get_dummies(df[['Degree', 'Nationality']], dtype=int)
# Swap the nominal columns for their indicators, then drop one indicator per variable,
# since only N-1 binary columns are needed. The space inside the names ('Degree_ BS')
# is part of the generated column names, as the displayed headers show.
df_num = (
    df.drop(columns=['Degree', 'Nationality'])
      .join(df_dummies)
      .drop(columns=['Degree_ BS', 'Nationality_ China'])
)

display('df_num:',HTML(df_num.head(10).to_html()))

# standardized data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_features = df_num.drop(columns='GradeLetter')
cols = x_features.columns
# NOTE(review): the scaler is fit on every row, so the cross-validation folds used in later
# cells are scaled with statistics that include their own test rows. Put the scaler in a
# sklearn Pipeline if fold-wise scaling is required.
df_num_std = pd.DataFrame(scaler.fit_transform(x_features), columns=cols, index=x_features.index)
# take the label from df_num so this cell does not rely on y_encoded from another cell
df_num_std['GradeLetter'] = df_num['GradeLetter']
display('df_num_std:',HTML(df_num_std.head(10).to_html()))

# binary features
# Select the numeric columns by name; numCols keeps their positions in df_num for any
# code that still expects positional indices.
numeric_cols = ['Age', 'Hours on Readings', 'Hours on Assignments',
                'Hours on Games', 'Hours on Internet', 'Exam']
numCols = [df_num.columns.get_loc(name) for name in numeric_cols]
# .copy() gives an independent frame, so the binning below does not assign into a slice
# of df_num (which triggered SettingWithCopyWarning).
df_numerical = df_num[numeric_cols].copy()
df_dummy = df_num.drop(columns=numeric_cols)
display('df_numerical:',HTML(df_numerical.head(10).to_html()))
display('df_dummy:',HTML(df_dummy.head(10).to_html()))

# cut every numeric column into three equal-width bins labelled L / M / H
group_names = ['L','M','H']
for col in df_numerical.columns:
    df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names)
display('df_numerical:',HTML(df_numerical.head(10).to_html()))

df_dummies = pd.get_dummies(df_numerical, dtype=int)
display('df_dummies:',HTML(df_dummies.head(10).to_html()))
# drop the 'L' indicator of every binned column: only N-1 binary columns are needed
cols_removed = [f'{name}_L' for name in numeric_cols]
df_dummies = df_dummies.drop(columns=cols_removed)

# merge two dataframes
df_binary = pd.concat([df_dummies, df_dummy], axis=1)
display('df_binary:',HTML(df_binary.head(10).to_html()))

Column Datatypes:
 Nationality              object
Gender                    int64
Age                       int64
Degree                   object
Hours on Readings         int64
Hours on Assignments      int64
Hours on Games            int64
Hours on Internet         int64
Exam                    float64
GradeLetter               int32
dtype: object


'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


'df_num_std:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain,GradeLetter
0,-0.953481,1.179901,1.634258,-1.142823,1.591185,-0.21788,-2.241144,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
1,-0.953481,0.790057,1.634258,-1.142823,1.591185,-0.21788,-1.086122,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,2
2,-0.953481,1.569745,1.634258,-1.142823,1.591185,-0.21788,-2.155493,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
3,-0.953481,-0.379474,1.634258,-1.142823,1.591185,-0.21788,-1.914286,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
4,1.048789,0.400214,1.634258,-1.142823,-1.174971,0.013047,0.070159,-0.689938,-0.765699,1.711061,-0.582893,-0.56919,0
5,1.048789,-1.549005,1.146012,-1.376205,-0.022406,-0.679734,0.631926,-0.689938,1.305997,-0.584433,-0.582893,1.756883,0
6,1.048789,0.01037,1.390135,-1.609586,1.360672,-0.910661,-0.16475,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1
7,1.048789,-1.159161,1.390135,-1.609586,1.360672,-0.910661,0.635075,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1
8,1.048789,1.179901,1.390135,-1.609586,1.360672,-0.910661,0.15581,1.449405,-0.765699,-0.584433,1.715581,-0.56919,2
9,1.048789,-1.549005,1.390135,-1.609586,1.360672,-0.910661,-0.2504,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1


'df_numerical:'

Unnamed: 0,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam
0,25,14,2,14,6,43.67
1,24,14,2,14,6,62.01
2,26,14,2,14,6,45.03
3,21,14,2,14,6,48.86
4,23,14,2,2,7,80.37
5,18,12,1,7,4,89.29
6,22,13,0,13,3,76.64
7,19,13,0,13,3,89.34
8,25,13,0,13,3,81.73
9,18,13,0,13,3,75.28


'df_dummy:'

Unnamed: 0,Gender,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,3,0,0,0,1,0
1,0,2,0,0,0,1,0
2,0,3,0,0,0,1,0
3,0,3,0,0,0,1,0
4,1,0,0,0,1,0,0
5,1,0,0,1,0,0,1
6,1,1,1,0,0,1,0
7,1,1,1,0,0,1,0
8,1,2,1,0,0,1,0
9,1,1,1,0,0,1,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical[col] = pd.cut(df_numerical[col], 3, labels=group_names

'df_numerical:'

Unnamed: 0,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam
0,H,H,L,H,M,L
1,H,H,L,H,M,M
2,H,H,L,H,M,L
3,M,H,L,H,M,L
4,M,H,L,L,M,H
5,L,H,L,M,L,H
6,M,H,L,H,L,M
7,L,H,L,H,L,H
8,H,H,L,H,L,H
9,L,H,L,H,L,M


'df_dummies:'

Unnamed: 0,Age_L,Age_M,Age_H,Hours on Readings_L,Hours on Readings_M,Hours on Readings_H,Hours on Assignments_L,Hours on Assignments_M,Hours on Assignments_H,Hours on Games_L,Hours on Games_M,Hours on Games_H,Hours on Internet_L,Hours on Internet_M,Hours on Internet_H,Exam_L,Exam_M,Exam_H
0,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0
1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0
2,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0
3,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0
4,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1
5,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1
6,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0
7,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1
8,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1
9,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0


'df_binary:'

Unnamed: 0,Age_M,Age_H,Hours on Readings_M,Hours on Readings_H,Hours on Assignments_M,Hours on Assignments_H,Hours on Games_M,Hours on Games_H,Hours on Internet_M,Hours on Internet_H,Exam_M,Exam_H,Gender,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,1,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
1,0,1,0,1,0,0,0,1,1,0,1,0,0,2,0,0,0,1,0
2,0,1,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
3,1,0,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
4,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0
5,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1
6,1,0,0,1,0,0,0,1,0,0,1,0,1,1,1,0,0,1,0
7,0,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0
8,0,1,0,1,0,0,0,1,0,0,0,1,1,2,1,0,0,1,0
9,0,0,0,1,0,0,0,1,0,0,1,0,1,1,1,0,0,1,0


In [51]:
# 03. CategoricalNB
# It is used to deal with categorical features only. This is the one we introduced in the class.
# You need to convert your feature space to binary features in order to utilize this algorithm.
# For nominal variables, just create N-1 binary variables.
# For numerical variables, convert them to nominal ones, and then to N-1 binary variables.
# use df_binary
# API, https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html#sklearn.naive_bayes.CategoricalNB

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import make_scorer, precision_score

display('df_binary:',HTML(df_binary.head(10).to_html()))

y = df_binary['GradeLetter']
x = df_binary.drop('GradeLetter', axis=1)
clf = CategoricalNB(alpha=1)

# zero_division=0 states explicitly how an undefined per-class precision is scored: a class
# that is never predicted in a fold contributes 0 to the macro average. Without it the same
# value is used but scikit-learn emits a warning for each affected fold.
precision = make_scorer(precision_score, average='macro', zero_division=0)
# cross_validate scores both metrics from one 5-fold run, so each fold's model is fit
# once rather than once per metric.
scores = cross_validate(clf, x, y, cv=5, scoring={'accuracy': 'accuracy', 'precision': precision})
acc = scores['test_accuracy'].mean()
prec = scores['test_precision'].mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

'df_binary:'

Unnamed: 0,Age_M,Age_H,Hours on Readings_M,Hours on Readings_H,Hours on Assignments_M,Hours on Assignments_H,Hours on Games_M,Hours on Games_H,Hours on Internet_M,Hours on Internet_H,Exam_M,Exam_H,Gender,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,1,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
1,0,1,0,1,0,0,0,1,1,0,1,0,0,2,0,0,0,1,0
2,0,1,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
3,1,0,0,1,0,0,0,1,1,0,0,0,0,3,0,0,0,1,0
4,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0
5,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1
6,1,0,0,1,0,0,0,1,0,0,1,0,1,1,1,0,0,1,0
7,0,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0
8,0,1,0,1,0,0,0,1,0,0,0,1,1,2,1,0,0,1,0
9,0,0,0,1,0,0,0,1,0,0,1,0,1,1,1,0,0,1,0


N-fold Cross Validation: accuracy =  0.6073000000000001 , precision =  0.4734475433438125


  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
# 04. BernoulliNB
# It is used to deal with binary features.
# You can perform the same preprocessing as above.
# use df_binary
# API, https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB

from sklearn.metrics import make_scorer, precision_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB

# Build the features and label here instead of reusing x / y left over from a previously
# executed cell, so the result does not depend on which cell ran last.
y = df_binary['GradeLetter']
x = df_binary.drop('GradeLetter', axis=1)

# note that there is an argument, 'binarize', to help you convert numerical ones to binary features
# to set a threshold to this argument, your original features should be in same scale
clf = BernoulliNB(alpha=1)

# zero_division=0 states explicitly how an undefined per-class precision is scored: a class
# that is never predicted in a fold contributes 0 to the macro average.
precision = make_scorer(precision_score, average='macro', zero_division=0)
# one 5-fold run scores both metrics, so each fold's model is fit once
scores = cross_validate(clf, x, y, cv=5, scoring={'accuracy': 'accuracy', 'precision': precision})
acc = scores['test_accuracy'].mean()
prec = scores['test_precision'].mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

N-fold Cross Validation: accuracy =  0.6073000000000001 , precision =  0.4734475433438125


  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# 05. GaussianNB
# It is used to deal with numerical features.
# It uses a different likelihood estimation function.
# For numerical variables, you do not need any preprocessing.
# For nominal variables, you should convert them to N-1 binary variables.
# You may try standardized features, which may bring extra improvements
# try df_num and df_num_std
# API, https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

from sklearn.metrics import make_scorer, precision_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB

# zero_division=0 states explicitly how an undefined per-class precision is scored: a class
# that is never predicted in a fold contributes 0 to the macro average.
precision = make_scorer(precision_score, average='macro', zero_division=0)

# Run the same evaluation on the raw and on the standardized features.
# (In the recorded run both frames produced identical scores.)
for frame_label, frame in (('df_num:', df_num), ('df_num_std:', df_num_std)):
    display(frame_label, HTML(frame.head(10).to_html()))

    y = frame['GradeLetter']
    x = frame.drop('GradeLetter', axis=1)
    clf = GaussianNB()

    # one 5-fold run scores both metrics, so each fold's model is fit once
    scores = cross_validate(clf, x, y, cv=5, scoring={'accuracy': 'accuracy', 'precision': precision})
    acc = scores['test_accuracy'].mean()
    prec = scores['test_precision'].mean()
    print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


N-fold Cross Validation: accuracy =  0.6169 , precision =  0.5674085901515645


'df_num_std:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain,GradeLetter
0,-0.953481,1.179901,1.634258,-1.142823,1.591185,-0.21788,-2.241144,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
1,-0.953481,0.790057,1.634258,-1.142823,1.591185,-0.21788,-1.086122,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,2
2,-0.953481,1.569745,1.634258,-1.142823,1.591185,-0.21788,-2.155493,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
3,-0.953481,-0.379474,1.634258,-1.142823,1.591185,-0.21788,-1.914286,-0.689938,-0.765699,-0.584433,1.715581,-0.56919,3
4,1.048789,0.400214,1.634258,-1.142823,-1.174971,0.013047,0.070159,-0.689938,-0.765699,1.711061,-0.582893,-0.56919,0
5,1.048789,-1.549005,1.146012,-1.376205,-0.022406,-0.679734,0.631926,-0.689938,1.305997,-0.584433,-0.582893,1.756883,0
6,1.048789,0.01037,1.390135,-1.609586,1.360672,-0.910661,-0.16475,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1
7,1.048789,-1.159161,1.390135,-1.609586,1.360672,-0.910661,0.635075,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1
8,1.048789,1.179901,1.390135,-1.609586,1.360672,-0.910661,0.15581,1.449405,-0.765699,-0.584433,1.715581,-0.56919,2
9,1.048789,-1.549005,1.390135,-1.609586,1.360672,-0.910661,-0.2504,1.449405,-0.765699,-0.584433,1.715581,-0.56919,1


N-fold Cross Validation: accuracy =  0.6169 , precision =  0.5674085901515645


In [57]:
# 06. MultinomialNB
# It can deal with mixed types of features.
# However, scikit-learn cannot deal with nominal features directly
# Therefore, you should perform similar preprocessing as above
# use df_num only (see the note on df_num_std at the end of this cell)
# API, https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

from sklearn.metrics import make_scorer, precision_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

display('df_num:',HTML(df_num.head(10).to_html()))

y = df_num['GradeLetter']
x = df_num.drop('GradeLetter', axis=1)
clf = MultinomialNB()

# zero_division=0 states explicitly how an undefined per-class precision is scored: a class
# that is never predicted in a fold contributes 0 to the macro average.
precision = make_scorer(precision_score, average='macro', zero_division=0)
# one 5-fold run scores both metrics, so each fold's model is fit once
scores = cross_validate(clf, x, y, cv=5, scoring={'accuracy': 'accuracy', 'precision': precision})
acc = scores['test_accuracy'].mean()
prec = scores['test_precision'].mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

# note: df_num_std does not work here, since MultinomialNB cannot work with negative values in features
# (standardized columns contain negative values), so the evaluation is not repeated on df_num_std.

'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


N-fold Cross Validation: accuracy =  0.5894999999999999 , precision =  0.5403330266866953


In [58]:
# 07. ComplementNB
# It is an improved version of MultinomialNB.
# It can address the issues caused by imbalanced data.
# It needs similar preprocessing as above.
# use df_num
# NOTE(review): ComplementNB is expected to reject negative feature values just like
# MultinomialNB, so df_num_std would not be usable here - confirm before trying it.

# API, https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB

from sklearn.metrics import make_scorer, precision_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import ComplementNB

display('df_num:',HTML(df_num.head(10).to_html()))

y = df_num['GradeLetter']
x = df_num.drop('GradeLetter', axis=1)
clf = ComplementNB()

# zero_division=0 states explicitly how an undefined per-class precision is scored: a class
# that is never predicted in a fold contributes 0 to the macro average. The recorded run
# warned about this in every fold, which is why macro precision is far below accuracy here.
precision = make_scorer(precision_score, average='macro', zero_division=0)
# one 5-fold run scores both metrics, so each fold's model is fit once
scores = cross_validate(clf, x, y, cv=5, scoring={'accuracy': 'accuracy', 'precision': precision})
acc = scores['test_accuracy'].mean()
prec = scores['test_precision'].mean()
print("N-fold Cross Validation: accuracy = ",acc,', precision = ', prec)

'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


N-fold Cross Validation: accuracy =  0.6131 , precision =  0.3062231901334561


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# In-Class Practice: using the Loans data for practice and assignments