In [1]:
# Table of Contents

# 01. Introductions
# 02. Bagging - RandomForest
# 03. Bagging
# 04. AdaBoosting
# 05. Gradient Boosting
# 06. XGBoost


In [2]:
# 01. Introductions

import pandas as pd
from sklearn import preprocessing
from IPython.display import display, HTML

df=pd.read_csv('data_students_10k.csv')
print(df.shape)
# strip column names
df=df.rename(columns=lambda x: x.strip())
cols=df.columns
# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables by using mean value #################################
df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean(), inplace=True)
df["Hours on Games"].fillna(df["Hours on Games"].mean(), inplace=True)
df["Exam"].fillna(df["Exam"].mean(), inplace=True)
df["Grade"].fillna(df["Grade"].mean(), inplace=True)

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove column ID and grade which are not appropriate to be included in this classification task
df=df.drop('ID',1)
df=df.drop('Grade',1)

# encode labels
y = df['GradeLetter'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers #####################################
df['GradeLetter'] = y_encoded

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

print('Column Datatypes:\n',df.dtypes)

# convert all nominal variables to binary variables
df_num=df.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_num[['Degree','Nationality']])
# add them to dataframe
df_num=df_num.join(df_dummies)
# drop original columns
df_num=df_num.drop('Degree',axis=1)
df_num=df_num.drop('Nationality', axis=1)

# drop extra binary columns, since we only need N-1 binary columns
df_num=df_num.drop('Degree_ BS', axis=1)
df_num=df_num.drop('Nationality_ China', axis=1)

display('df_num:',HTML(df_num.head(10).to_html()))


(10000, 12)


Unnamed: 0,ID,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,GradeLetter
0,1,India,0,25,BS,14,2,14,6,43.67,51.73,F
1,2,India,0,24,BS,14,2,14,6,62.01,72.23,C
2,3,India,0,26,BS,14,2,14,6,45.03,54.37,F
3,4,India,0,21,BS,14,2,14,6,48.86,57.68,F
4,5,France,1,23,BS,14,2,2,7,80.37,88.41,A
5,6,Spain,1,18,PHD,12,1,7,4,89.29,89.7,A
6,7,India,1,22,MS,13,0,13,3,76.64,80.27,B
7,8,India,1,19,MS,13,0,13,3,89.34,86.9,B
8,9,India,1,25,MS,13,0,13,3,81.73,78.61,C
9,10,India,1,18,MS,13,0,13,3,75.28,80.79,B


ColumnName, DataType, MissingValues
ID , int64 , False
Nationality , object , False
Gender , int64 , False
Age , int64 , False
Degree , object , False
Hours on Readings , int64 , False
Hours on Assignments , int64 , False
Hours on Games , int64 , False
Hours on Internet , int64 , False
Exam , float64 , False
Grade , float64 , False
GradeLetter , object , False


  df=df.drop('ID',1)
  df=df.drop('Grade',1)


Unnamed: 0,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter
0,India,0,25,BS,14,2,14,6,43.67,3
1,India,0,24,BS,14,2,14,6,62.01,2
2,India,0,26,BS,14,2,14,6,45.03,3
3,India,0,21,BS,14,2,14,6,48.86,3
4,France,1,23,BS,14,2,2,7,80.37,0
5,Spain,1,18,PHD,12,1,7,4,89.29,0
6,India,1,22,MS,13,0,13,3,76.64,1
7,India,1,19,MS,13,0,13,3,89.34,1
8,India,1,25,MS,13,0,13,3,81.73,2
9,India,1,18,MS,13,0,13,3,75.28,1


Column Datatypes:
 Nationality              object
Gender                    int64
Age                       int64
Degree                   object
Hours on Readings         int64
Hours on Assignments      int64
Hours on Games            int64
Hours on Internet         int64
Exam                    float64
GradeLetter               int32
dtype: object


'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


In [3]:
# 02. Bagging - RandomForest

# Preprocessing: same requirements as KNN, not necessary for normalization

# API
# https://scikit-learn.org/0.16/modules/generated/sklearn.linear_model.LogisticRegression.html

from sklearn import metrics 
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, precision_score, accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=decisiontreeclassifier#sklearn.tree.DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

x=df_num.drop('GradeLetter',axis=1)
y=df_num['GradeLetter']

# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
clf=bag.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_test, y_pred)
pre=precision_score(y_test, y_pred, average='macro')
print('By hold-out evaluation: acc = ',acc, ', precision = ', pre)

# by N-fold cross validation
# Example of randomForest = bagging method of decision trees
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(bag, x, y, cv=5, scoring='accuracy').mean()
pre=cross_val_score(bag, x, y, cv=5, scoring=precision).mean()
print("RandomForest Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)

By hold-out evaluation: acc =  0.6165 , precision =  0.5548728048405906
RandomForest Accuracy by N-fold Cross Validation: acc =  0.5541 precision =  0.5190865760410324


In [4]:
# 03. Bagging
# Bagging can work together with any other classification techniques

from sklearn.naive_bayes import GaussianNB

# by N-fold cross validation
# Example of randomForest = bagging method of decision trees
clf = GaussianNB()
bag = BaggingClassifier(clf, n_estimators=100, max_samples=0.8, random_state=1)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(bag, x, y, cv=5, scoring='accuracy').mean()
pre=cross_val_score(bag, x, y, cv=5, scoring=precision).mean()
print("Bagging using GaussianNB Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)

Bagging using GaussianNB Accuracy by N-fold Cross Validation: acc =  0.6176 precision =  0.5681206436150675


In [9]:
# 04. AdaBoosting

from sklearn.ensemble import AdaBoostClassifier
# API, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

tree = DecisionTreeClassifier() # you can also set other classification algorithms
clf = AdaBoostClassifier(tree, n_estimators=100, random_state=0)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=5, scoring=precision).mean()
print("AdaBoosting Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)

AdaBoosting Accuracy by N-fold Cross Validation: acc =  0.5114 precision =  0.49098637977527704


In [7]:
# 05. Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier
# API, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

# GB used tree regressors directly
clf = GradientBoostingClassifier(n_estimators=100, random_state=0, learning_rate=0.2, criterion='squared_error') # you need to tune up learning rate carefully
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=5, scoring=precision).mean()
print("AdaBoosting Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)


AdaBoosting Accuracy by N-fold Cross Validation: acc =  0.5723 precision =  0.5478895349223463


In [6]:
# 06. XGBoost

# install library first by running "conda install -c conda-forge xgboost"
from xgboost import XGBClassifier
# API, https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier

clf = XGBClassifier()
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=5, scoring=precision).mean()
print("AdaBoosting Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)


  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


AdaBoosting Accuracy by N-fold Cross Validation: acc =  0.5589999999999999 precision =  0.5362094892996014


In [16]:
# In-Class Practice: using the Loans data for practice and assignments