In [1]:
# Table of Contents

# 01. Introductions
# 02. KNN Classifier

In [3]:
# 01. Introductions

# Classification is a kind of supervised learning. It is used to predict the value in a nominal variable 
# which is also called 'label'. The factors that are used for predictions are called features.

import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

# Read the student records and report the frame's (rows, columns) shape.
df = pd.read_csv('data_students_10k.csv')
print(df.shape)

# Trim surrounding whitespace from every column header.
df = df.rename(columns=str.strip)
# Keep the full header list; it is captured here, before any column is removed.
cols = df.columns

# Render the first ten rows as an HTML table.
preview_html = df.head(10).to_html()
display(HTML(preview_html))

# replace missing values in numerical variables by using mean value #################################
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# a column selection acts on an intermediate object: pandas 2.x warns about it
# and, with copy-on-write enabled, the parent frame is no longer updated.
numeric_cols = ["Age", "Hours on Assignments", "Hours on Games", "Exam", "Grade"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# check again whether there are missing values
# One line per column: name, dtype, and whether any value is still null.
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove columns ID and Grade
# NOTE(review): Grade is presumably dropped because GradeLetter (the label) is
# derived from it -- confirm with the data description.
# The columns= keyword is used because passing the axis positionally
# (df.drop('ID', 1)) is deprecated and rejected by pandas 2.0+.
df = df.drop(columns=['ID', 'Grade'])

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

(10000, 12)


Unnamed: 0,ID,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,GradeLetter
0,1,India,0,25,BS,14,2,14,6,43.67,51.73,F
1,2,India,0,24,BS,14,2,14,6,62.01,72.23,C
2,3,India,0,26,BS,14,2,14,6,45.03,54.37,F
3,4,India,0,21,BS,14,2,14,6,48.86,57.68,F
4,5,France,1,23,BS,14,2,2,7,80.37,88.41,A
5,6,Spain,1,18,PHD,12,1,7,4,89.29,89.7,A
6,7,India,1,22,MS,13,0,13,3,76.64,80.27,B
7,8,India,1,19,MS,13,0,13,3,89.34,86.9,B
8,9,India,1,25,MS,13,0,13,3,81.73,78.61,C
9,10,India,1,18,MS,13,0,13,3,75.28,80.79,B


ColumnName, DataType, MissingValues
ID , int64 , False
Nationality , object , False
Gender , int64 , False
Age , int64 , False
Degree , object , False
Hours on Readings , int64 , False
Hours on Assignments , int64 , False
Hours on Games , int64 , False
Hours on Internet , int64 , False
Exam , float64 , False
Grade , float64 , False
GradeLetter , object , False


  df=df.drop('ID',1)
  df=df.drop('Grade',1)


Unnamed: 0,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter
0,India,0,25,BS,14,2,14,6,43.67,F
1,India,0,24,BS,14,2,14,6,62.01,C
2,India,0,26,BS,14,2,14,6,45.03,F
3,India,0,21,BS,14,2,14,6,48.86,F
4,France,1,23,BS,14,2,2,7,80.37,A
5,Spain,1,18,PHD,12,1,7,4,89.29,A
6,India,1,22,MS,13,0,13,3,76.64,B
7,India,1,19,MS,13,0,13,3,89.34,B
8,India,1,25,MS,13,0,13,3,81.73,C
9,India,1,18,MS,13,0,13,3,75.28,B


In [4]:
# 02. KNN Classifier ################################################################################
# Requirements: 1). numerical features; 2). normalized features
# Parameters: distance measure and value of K


# Data preprocessing ################################################################################
print('Column Datatypes:\n',df.dtypes)

# convert all nominal variables to binary variables
# df_raw keeps an untouched copy of the input; df_knn is the working copy that
# gets transformed for the KNN model.
df_raw=df.copy(deep=True)
df_knn=df.copy(deep=True)
# create new binary columns
# dtype=int is requested explicitly: pandas 2.0+ returns bool dummies by
# default, and bool columns cannot be subtracted when the features are
# min-max scaled afterwards.
df_dummies=pd.get_dummies(df_knn[['Degree','Nationality']], dtype=int)
# add them to dataframe and drop the original nominal columns
df_knn=df_knn.join(df_dummies).drop(columns=['Degree','Nationality'])
display('Data Example:',HTML(df_knn.head(10).to_html()))

# drop extra binary columns, since we only need N-1 binary columns
# The dummy names printed below contain a space after the underscore
# (e.g. 'Degree_ BS'), so the names passed to drop must include it.
print(df_knn.columns)
df_knn=df_knn.drop(columns=['Degree_ BS','Nationality_ China'])

display('Data Example:',HTML(df_knn.head(10).to_html()))

Column Datatypes:
 Nationality              object
Gender                    int64
Age                       int64
Degree                   object
Hours on Readings         int64
Hours on Assignments      int64
Hours on Games            int64
Hours on Internet         int64
Exam                    float64
GradeLetter              object
dtype: object


'Data Example:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ BS,Degree_ MS,Degree_ PHD,Nationality_ China,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,F,1,0,0,0,0,1,0
1,0,24,14,2,14,6,62.01,C,1,0,0,0,0,1,0
2,0,26,14,2,14,6,45.03,F,1,0,0,0,0,1,0
3,0,21,14,2,14,6,48.86,F,1,0,0,0,0,1,0
4,1,23,14,2,2,7,80.37,A,1,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,A,0,0,1,0,0,0,1
6,1,22,13,0,13,3,76.64,B,0,1,0,0,0,1,0
7,1,19,13,0,13,3,89.34,B,0,1,0,0,0,1,0
8,1,25,13,0,13,3,81.73,C,0,1,0,0,0,1,0
9,1,18,13,0,13,3,75.28,B,0,1,0,0,0,1,0


Index(['Gender', 'Age', 'Hours on Readings', 'Hours on Assignments',
       'Hours on Games', 'Hours on Internet', 'Exam', 'GradeLetter',
       'Degree_ BS', 'Degree_ MS', 'Degree_ PHD', 'Nationality_ China',
       'Nationality_ France', 'Nationality_ India', 'Nationality_ Spain'],
      dtype='object')


'Data Example:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,F,0,0,0,1,0
1,0,24,14,2,14,6,62.01,C,0,0,0,1,0
2,0,26,14,2,14,6,45.03,F,0,0,0,1,0
3,0,21,14,2,14,6,48.86,F,0,0,0,1,0
4,1,23,14,2,2,7,80.37,A,0,0,1,0,0
5,1,18,12,1,7,4,89.29,A,0,1,0,0,1
6,1,22,13,0,13,3,76.64,B,1,0,0,1,0
7,1,19,13,0,13,3,89.34,B,1,0,0,1,0
8,1,25,13,0,13,3,81.73,C,1,0,0,1,0
9,1,18,13,0,13,3,75.28,B,1,0,0,1,0


In [5]:
# Normalize all numerical features
# min-max normalization to scale [0, 1]
for col in df_knn.columns:
    if col == 'GradeLetter':
        # exclude GradeLetter, since it is label in our data
        continue
    # Work in float so integer and boolean indicator columns scale the same way.
    values = df_knn[col].astype(float)
    col_min = values.min()
    col_range = values.max() - col_min
    if col_range == 0:
        # A constant column has zero range; plain min-max would divide by
        # zero and fill the feature with NaN, so map it to 0 instead.
        df_knn[col] = 0.0
    else:
        df_knn[col] = (values - col_min) / col_range

display(HTML(df_knn.head(10).to_html()))


Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0.0,0.875,1.0,0.142857,1.0,0.428571,0.060854,F,0.0,0.0,0.0,1.0,0.0
1,0.0,0.75,1.0,0.142857,1.0,0.428571,0.366622,C,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.142857,1.0,0.428571,0.083528,F,0.0,0.0,0.0,1.0,0.0
3,0.0,0.375,1.0,0.142857,1.0,0.428571,0.147382,F,0.0,0.0,0.0,1.0,0.0
4,1.0,0.625,1.0,0.142857,0.142857,0.5,0.672724,A,0.0,0.0,1.0,0.0,0.0
5,1.0,0.0,0.857143,0.071429,0.5,0.285714,0.82144,A,0.0,1.0,0.0,0.0,1.0
6,1.0,0.5,0.928571,0.0,0.928571,0.214286,0.610537,B,1.0,0.0,0.0,1.0,0.0
7,1.0,0.125,0.928571,0.0,0.928571,0.214286,0.822274,B,1.0,0.0,0.0,1.0,0.0
8,1.0,0.875,0.928571,0.0,0.928571,0.214286,0.695398,C,1.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.928571,0.0,0.928571,0.214286,0.587863,B,1.0,0.0,0.0,1.0,0.0


In [6]:
# encode label, since KNN requires label encoding
from sklearn import preprocessing

y = df_knn['GradeLetter'] # define label as nominal values
le = preprocessing.LabelEncoder()
# encode nominal labels to integers; `le` is kept so the integer codes can be
# mapped back to the original letters with le.inverse_transform
y_encoded = le.fit_transform(y)

print(y_encoded)

# Store the integer codes in the frame, then build the feature matrix from
# every column except the label.
df_knn['GradeLetter'] = y_encoded
# columns= is used because a positional axis argument (drop('GradeLetter', 1))
# is deprecated and rejected by pandas 2.0+.
x_features = df_knn.drop(columns='GradeLetter')

display(HTML(df_knn.head(10).to_html()))

[3 2 3 ... 0 0 0]


  x_features = df_knn.drop('GradeLetter',1)


Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0.0,0.875,1.0,0.142857,1.0,0.428571,0.060854,3,0.0,0.0,0.0,1.0,0.0
1,0.0,0.75,1.0,0.142857,1.0,0.428571,0.366622,2,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.142857,1.0,0.428571,0.083528,3,0.0,0.0,0.0,1.0,0.0
3,0.0,0.375,1.0,0.142857,1.0,0.428571,0.147382,3,0.0,0.0,0.0,1.0,0.0
4,1.0,0.625,1.0,0.142857,0.142857,0.5,0.672724,0,0.0,0.0,1.0,0.0,0.0
5,1.0,0.0,0.857143,0.071429,0.5,0.285714,0.82144,0,0.0,1.0,0.0,0.0,1.0
6,1.0,0.5,0.928571,0.0,0.928571,0.214286,0.610537,1,1.0,0.0,0.0,1.0,0.0
7,1.0,0.125,0.928571,0.0,0.928571,0.214286,0.822274,1,1.0,0.0,0.0,1.0,0.0
8,1.0,0.875,0.928571,0.0,0.928571,0.214286,0.695398,2,1.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.928571,0.0,0.928571,0.214286,0.587863,1,1.0,0.0,0.0,1.0,0.0


In [12]:

# Build KNN models and evaluate the models ############################################################
# Note: for demo and teaching purpose, we present evaluations based on both hold-out and N-fold cross validations

# By hold-out evaluations ###############################################
from sklearn.model_selection import train_test_split

# example: 80% as training, 20% as testing
# random_state fixes the shuffle, so the split and the metrics computed from it
# are the same on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(x_features, y_encoded,
                                                    test_size=0.2,
                                                    random_state=42)

# build and eval models
from sklearn import neighbors
from sklearn.metrics import accuracy_score
# API， https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# API for KNeighborsClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# Candidate neighbourhood sizes: a coarse grid (10, 30, ..., 250) and a fine
# grid (5, 10, ..., 50). The hold-out loop below walks the fine grid.
kvalues = list(range(10, 251, 20))
kvalues2 = list(range(5, 51, 5))

for k in kvalues2:
    # Fit a Euclidean-distance KNN on the training part and predict the test part.
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    y_pred = clf.fit(x_train, y_train).predict(x_test)

    # Macro averaging weights every class equally; zero_division=0 scores a
    # class with no predicted samples as 0 instead of emitting a warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    print('K =', k, ', Accuracy: ', accuracy, ', Precision: ', precision,
          ', Recall: ', recall)

    # difference between macro and micro calculations
    # https://vitalflux.com/micro-average-macro-average-scoring-metrics-multi-class-classification-python/



K = 5 , Accuracy:  0.619 , Precision:  0.5492765212734355 , Recall:  0.5326023528862343
K = 10 , Accuracy:  0.65 , Precision:  0.5666423484747876 , Recall:  0.5122430102772392
K = 15 , Accuracy:  0.643 , Precision:  0.5438999524730438 , Recall:  0.4981451734136854
K = 20 , Accuracy:  0.6535 , Precision:  0.482921751569375 , Recall:  0.49560471608603357
K = 25 , Accuracy:  0.6505 , Precision:  0.490740941586644 , Recall:  0.49274896616430225
K = 30 , Accuracy:  0.6505 , Precision:  0.4447370786649877 , Recall:  0.4887586549923053
K = 35 , Accuracy:  0.6545 , Precision:  0.4510377663458254 , Recall:  0.4938249283603715
K = 40 , Accuracy:  0.6515 , Precision:  0.44794396104510537 , Recall:  0.49073753021640715
K = 45 , Accuracy:  0.65 , Precision:  0.44809027582073624 , Recall:  0.49074542076355576
K = 50 , Accuracy:  0.649 , Precision:  0.4470002209177605 , Recall:  0.48964447862017957


In [15]:

# By N-fold cross evaluations ###############################################
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes: a coarse grid (10, 30, ..., 250) and a fine
# grid (5, 10, ..., 50). The cross-validation loop below walks the coarse grid.
kvalues = list(range(10, 251, 20))
kvalues2 = list(range(5, 51, 5))

for k in kvalues:
    # Score a Euclidean-distance KNN with 5-fold cross validation and report
    # the mean accuracy over the folds.
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    fold_scores = cross_val_score(clf, x_features, y_encoded, cv=5, scoring='accuracy')
    acc = fold_scores.mean()
    print('K =', k, ', Accuracy: ',acc)

K = 10 , Accuracy:  0.49160000000000004
K = 30 , Accuracy:  0.5166999999999999
K = 50 , Accuracy:  0.5261000000000001
K = 70 , Accuracy:  0.5305000000000001
K = 90 , Accuracy:  0.5246999999999999
K = 110 , Accuracy:  0.5302
K = 130 , Accuracy:  0.5330000000000001
K = 150 , Accuracy:  0.5359999999999999
K = 170 , Accuracy:  0.5386
K = 190 , Accuracy:  0.5423
K = 210 , Accuracy:  0.5429999999999999
K = 230 , Accuracy:  0.5462999999999999
K = 250 , Accuracy:  0.5483


In [16]:
# In-Class Practice: using the Loans data for practice and assignments