In [1]:
# Table of Contents

# 01. Data Preprocessing
# 02. Neural Networks for Classifications
# 03. Neural Networks for Regressions


In [4]:
# 01. Data Preprocessing

import pandas as pd
from sklearn import preprocessing
from IPython.display import display, HTML

df=pd.read_csv('data_students_10k.csv')
print(df.shape)
# strip column names
df=df.rename(columns=lambda x: x.strip())
cols=df.columns

# replace missing values in numerical variables by using mean value #################################
df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean(), inplace=True)
df["Hours on Games"].fillna(df["Hours on Games"].mean(), inplace=True)
df["Exam"].fillna(df["Exam"].mean(), inplace=True)
df["Grade"].fillna(df["Grade"].mean(), inplace=True)

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype,',',df[i].isnull().any())
 
# remove column ID and grade which are not appropriate to be included in this classification task
df=df.drop('ID',1)

# encode labels
y = df['GradeLetter'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers #####################################
df['GradeLetter'] = y_encoded

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

(10000, 12)
ColumnName, DataType, MissingValues
ID , int64 , False
Nationality , object , False
Gender , int64 , False
Age , int64 , False
Degree , object , False
Hours on Readings , int64 , False
Hours on Assignments , int64 , False
Hours on Games , int64 , False
Hours on Internet , int64 , False
Exam , float64 , False
Grade , float64 , False
GradeLetter , object , False


  df=df.drop('ID',1)


Unnamed: 0,Nationality,Gender,Age,Degree,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,GradeLetter
0,India,0,25,BS,14,2,14,6,43.67,51.73,3
1,India,0,24,BS,14,2,14,6,62.01,72.23,2
2,India,0,26,BS,14,2,14,6,45.03,54.37,3
3,India,0,21,BS,14,2,14,6,48.86,57.68,3
4,France,1,23,BS,14,2,2,7,80.37,88.41,0
5,Spain,1,18,PHD,12,1,7,4,89.29,89.7,0
6,India,1,22,MS,13,0,13,3,76.64,80.27,1
7,India,1,19,MS,13,0,13,3,89.34,86.9,1
8,India,1,25,MS,13,0,13,3,81.73,78.61,2
9,India,1,18,MS,13,0,13,3,75.28,80.79,1


In [9]:
# convert dataframe to numerical features in order to use Neural Networks

print('Column Datatypes:\n',df.dtypes)

# convert all nominal variables to binary variables
df_num=df.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_num[['Degree','Nationality']])
# add them to dataframe
df_num=df_num.join(df_dummies)
# drop original columns
df_num=df_num.drop('Degree',axis=1)
df_num=df_num.drop('Nationality', axis=1)

# drop extra binary columns, since we only need N-1 binary columns
df_num=df_num.drop('Degree_ BS', axis=1)
df_num=df_num.drop('Nationality_ China', axis=1)

display('df_num:',HTML(df_num.head(10).to_html()))

# prepare data for classification
df_classification = df_num.copy(deep=True) 
df_classification = df_classification.drop('Grade', axis=1)
display('df_classification:',HTML(df_classification.head(10).to_html()))

# prepare data for regression
df_regression = df_num.copy(deep=True) 
df_regression = df_regression.drop('GradeLetter', axis=1)
display('df_regression:',HTML(df_regression.head(10).to_html()))

Column Datatypes:
 Nationality              object
Gender                    int64
Age                       int64
Degree                   object
Hours on Readings         int64
Hours on Assignments      int64
Hours on Games            int64
Hours on Internet         int64
Exam                    float64
Grade                   float64
GradeLetter               int32
dtype: object


'df_num:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,51.73,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,72.23,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,54.37,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,57.68,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,88.41,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,89.7,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,80.27,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,86.9,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,78.61,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,80.79,1,1,0,0,1,0


'df_classification:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,GradeLetter,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,3,0,0,0,1,0
1,0,24,14,2,14,6,62.01,2,0,0,0,1,0
2,0,26,14,2,14,6,45.03,3,0,0,0,1,0
3,0,21,14,2,14,6,48.86,3,0,0,0,1,0
4,1,23,14,2,2,7,80.37,0,0,0,1,0,0
5,1,18,12,1,7,4,89.29,0,0,1,0,0,1
6,1,22,13,0,13,3,76.64,1,1,0,0,1,0
7,1,19,13,0,13,3,89.34,1,1,0,0,1,0
8,1,25,13,0,13,3,81.73,2,1,0,0,1,0
9,1,18,13,0,13,3,75.28,1,1,0,0,1,0


'df_regression:'

Unnamed: 0,Gender,Age,Hours on Readings,Hours on Assignments,Hours on Games,Hours on Internet,Exam,Grade,Degree_ MS,Degree_ PHD,Nationality_ France,Nationality_ India,Nationality_ Spain
0,0,25,14,2,14,6,43.67,51.73,0,0,0,1,0
1,0,24,14,2,14,6,62.01,72.23,0,0,0,1,0
2,0,26,14,2,14,6,45.03,54.37,0,0,0,1,0
3,0,21,14,2,14,6,48.86,57.68,0,0,0,1,0
4,1,23,14,2,2,7,80.37,88.41,0,0,1,0,0
5,1,18,12,1,7,4,89.29,89.7,0,1,0,0,1
6,1,22,13,0,13,3,76.64,80.27,1,0,0,1,0
7,1,19,13,0,13,3,89.34,86.9,1,0,0,1,0
8,1,25,13,0,13,3,81.73,78.61,1,0,0,1,0
9,1,18,13,0,13,3,75.28,80.79,1,0,0,1,0


In [13]:
# 02. Neural Networks for Classifications

# API
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

X = df_classification.drop('GradeLetter', axis=1)
y = df_classification['GradeLetter']

# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(200,), random_state=1)
clf=clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
acc=accuracy_score(y_pred, y_test)
print('Accuracy by hold-out evaluation: ',acc)

# by N-fold cross validation
clf=MLPClassifier(solver='lbfgs', alpha=1e-4,hidden_layer_sizes=(500,), random_state=1)
acc=cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Accuracy by hold-out evaluation:  0.638


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Accuracy by N-fold Cross Validation: 0.626


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [15]:
# 02. Neural Networks for Regressions

from sklearn.neural_network import MLPRegressor
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error

# API, https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor

X = df_regression.drop('Grade', axis=1)
y = df_regression['Grade']

# by hold-out evaluation
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
m1=MLPRegressor(random_state=1, max_iter=200)
m1.fit(x_train, y_train)
print("\nBuild mdoels by Neural Networks")
# evaluate models by MAE or RMSE
y_predict=m1.predict(x_test)
MAE=metrics.mean_absolute_error(y_test, y_predict)
RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_predict))
print("\nEvaluations on the test set: \nRMSE: ", RMSE)

# N-fold cross validation
m=MLPRegressor(random_state=1, max_iter=200)
rst=cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error')
mse = np.abs(rst.mean())
print("\nN-fold Evaluations: \n RMSE: ", np.sqrt(mse))





Build mdoels by Neural Networks

Evaluations on the test set: 
RMSE:  6.1748426554108296





Evaluations: 
 RMSE:  6.353783441558802


