In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the maths-course and Portuguese-course student tables from CSV files
# located in the current working directory.
students_math = pd.read_csv(filepath_or_buffer="./studentmat.csv")
students_port = pd.read_csv(filepath_or_buffer="./studentpor.csv")

In [2]:
# list(<DataFrame>) returns all column titles; uncomment the next line to see them
#list(students_math)

# .shape gives us (number of rows, number of columns) for the maths frame
students_math.shape


(395, 33)

In [3]:
# Collect both course data frames in a single list ...
all_students_rows = [students_math, students_port]

# ... and stack them row-wise into one frame. ignore_index=True gives the
# combined frame a fresh 0..n-1 row index instead of repeating each frame's own.
all_students = pd.concat(objs=all_students_rows, axis=0, ignore_index=True)

In [4]:
# (rows, columns) of the combined frame; rows should equal the sum of both inputs
all_students.shape

(1044, 33)

In [5]:
# Using an educated guess, pick a subset of predictor columns from the full list.
# The order of this list fixes the column positions of the feature array x.
feature_columns = [
    "age", "address", "traveltime", "failures", "higher",
    "internet", "romantic", "famrel", "freetime", "goout", "absences",
]
x = all_students[feature_columns].values

# A label encoder converts the string entries of the feature array to integer codes
from sklearn.preprocessing import LabelEncoder
discreteCoder_x = LabelEncoder()

In [6]:

# Positions (in the feature array x) of the columns that hold text categories,
# paired with the feature they correspond to. Positions are zero-based, so
# position 1 is the SECOND column of x (address), not the first (age).
categorical_positions = [(1, "address"), (4, "higher"), (5, "internet"), (6, "romantic")]

# Each fit_transform call re-fits the shared encoder and therefore forgets the
# previous column's categories. Record every column's label -> integer code
# mapping here so hand-built rows (e.g. hypothetical students) can be checked
# against the codes the models were actually trained on.
label_mappings = {}
for position, feature_name in categorical_positions:
    # x[:, position] means "all rows of this one column": fit the encoder on it,
    # transform it, and write the integer codes back into the same column.
    x[:, position] = discreteCoder_x.fit_transform(x[:, position])
    # classes_ lists the original labels; a label's index in it is its code.
    label_mappings[feature_name] = {
        label: code for code, label in enumerate(discreteCoder_x.classes_)
    }

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split out the target variable (the value the models will predict).
# Selecting with a list of column names (double brackets) keeps the result
# two-dimensional, so y is a column vector with one row per student.
y = all_students[["Walc"]].values 

In [8]:
# Hold out a quarter of the rows for testing; the fixed random_state makes the
# same rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1693,
)

In [9]:
# Learn the scaling statistics from the training rows only, so nothing about
# the test rows leaks into the preprocessing.
scale_x = StandardScaler()
scale_x.fit(x_train)

# Apply the same fitted scaling to both splits.
x_train = scale_x.transform(x_train)
x_test = scale_x.transform(x_test)



In [10]:
from sklearn.svm import SVR

In [11]:
# For an SVR we have to pick two options: the shape of the fit (kernel) and
# the size of error to ignore (epsilon).
svr_regression = SVR(kernel = "linear", epsilon = 1.0)

# y_train is a column vector; flatten it to one dimension so the estimator
# receives the (n_samples,) target it expects and does not emit a
# column-vector conversion warning.
svr_regression.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=1.0, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
#We are going to create a hypothetical student and see what our model predicts for them.
#The values below are listed in the same order as the feature columns used to build x.

#Student A
#Age 18
#Address: Urban (label encoded as 1)
#Travel Time: 3 (30 minutes to 1 hour)
#Failures: 3
#Desire for Higher Ed: No (0)
#Internet Access: No (0)
#Romantic Relationship: Yes (1)
#Relationship with Family: OK (2 out of 5)
#Freetime: A lot (5 out of 5)
#Going Out: A bit (2 out of 5)
#Absences: 5

# A list containing one row, because the scaler and models expect 2-D input
new_studentA = [[18, 1, 3, 3, 0, 0, 1, 2, 5, 2, 5]]

In [13]:
# The new student must be scaled with the scaler that was fitted on the training
# data (transform only, no re-fitting) before the SVR model can score it.
new_student_scaledA = scale_x.transform(new_studentA)
studentA_prediction = svr_regression.predict(new_student_scaledA)

In [14]:
# Display the SVR prediction for student A (a one-element array)
studentA_prediction

array([2.76152033])

In [15]:
# Print the SVR prediction for student A with a readable label
print("First new student (A):" + str(studentA_prediction))

First new student (A):[2.76152033]


In [16]:
#Student B
#Age 18
#Address: Rural (label encoded as 0)
#Travel Time: 3 (30 minutes to 1 hour)
#Failures: 3
#Desire for Higher Ed: No (0)
#Internet Access: No (0)
#Romantic Relationship: Yes (1)
#Relationship with Family: OK (2 out of 5)
#Freetime: very little (1 out of 5)
#Going Out: very little (1 out of 5)
#Absences: 5
new_studentB = [[18, 0, 3, 3, 0, 0, 1, 2, 1, 1, 5]]

# Scale with the scaler fitted on the training data, then score with the SVR model
new_student_scaledB = scale_x.transform(new_studentB)
studentB_prediction = svr_regression.predict(new_student_scaledB)

# Student B is the second hypothetical student, so label the output accordingly
print("Second new student (B):" + str(studentB_prediction))

First new student (B):[2.31851134]


In [17]:
from sklearn import tree

In [18]:
# A shallow regression tree: at most three levels of splits, with a fixed seed
# so the fitted tree is the same on every run.
DT_regression = tree.DecisionTreeRegressor(
    max_depth=3,
    random_state=1693,
)
DT_regression.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1693, splitter='best')

In [19]:
# Visualize the tree: write it to tree.dot in Graphviz format, labelling each
# split with the feature it tests. The tree was fitted on the standardized
# training data, so the split thresholds in the file are in scaled units.
tree_feature_names = [
    "age", "address", "traveltime", "failures", "higher", "internet",
    "romantic", "famrel", "freetime", "goout", "absences",
]
tree.export_graphviz(
    DT_regression,
    out_file="tree.dot",
    feature_names=tree_feature_names,
)

# The contents of tree.dot can be rendered online at:
#http://dreampuf.github.io/GraphvizOnline/

In [20]:
# Score student A with the regression tree, using the already-scaled feature row
studentA_prediction_RT = DT_regression.predict(new_student_scaledA)
print("First new student:"+ str(studentA_prediction_RT))

First new student:[1.68644068]


In [21]:
# Score student B with the regression tree, using the already-scaled feature row
studentB_prediction_RT = DT_regression.predict(new_student_scaledB)
print("Second new student:"+ str(studentB_prediction_RT))

Second new student:[1.68644068]


In [22]:
from sklearn.ensemble import RandomForestRegressor

# With forests, you have to specify how many trees to grow (n_estimators);
# the fixed random_state keeps the forest reproducible between runs.
RF_regression = RandomForestRegressor(n_estimators = 100, random_state=1693)

# y_train is a column vector; flatten it to one dimension so the forest
# receives the (n_samples,) target it expects and does not emit a
# column-vector conversion warning.
RF_regression.fit(x_train, y_train.ravel())

  """


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=1693, verbose=0, warm_start=False)

In [23]:
# Score student A with the random forest, using the already-scaled feature row
studentA_prediction_RF = RF_regression.predict(new_student_scaledA)
print("First new student:" + str(studentA_prediction_RF))

First new student:[1.89]


In [24]:
# Score student B with the random forest, using the already-scaled feature row
studentB_prediction_RF = RF_regression.predict(new_student_scaledB)
print("Second new student:" + str(studentB_prediction_RF))

Second new student:[1.45]


In [25]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error: the average absolute difference between the true test
# targets (y_test) and the forest's predictions for the same rows.
rf_test_predictions = RF_regression.predict(x_test)
rf_MAD = mean_absolute_error(y_true=y_test, y_pred=rf_test_predictions)

# Display how far off the forest is on average (about 0.857 in the recorded run)
rf_MAD

0.8569979778629203

In [26]:
# Compute the same mean absolute error on the test rows for the other two models
tree_test_predictions = DT_regression.predict(x_test)
RT_MAD = mean_absolute_error(y_true=y_test, y_pred=tree_test_predictions)

svr_test_predictions = svr_regression.predict(x_test)
SVR_MAD = mean_absolute_error(y_true=y_test, y_pred=svr_test_predictions)

In [27]:
# Print one labelled line per model so the three errors can be compared directly
mad_report = (
    ("Random Forest MAD: ", rf_MAD),
    ("Regression Tree MAD: ", RT_MAD),
    ("Support Vector Regression MAD: ", SVR_MAD),
)
for model_label, model_mad in mad_report:
    print(model_label + str(model_mad))

Random Forest MAD: 0.8569979778629203
Regression Tree MAD: 0.9637573254432173
Support Vector Regression MAD: 0.9614216792246265


In [28]:
#We see that the Random Forest has the lowest mean absolute error of the three models (about 0.857, versus about 0.964 for the regression tree and 0.961 for the SVR)

In [31]:
#Lab questions
#Student C (scored with the random forest model)
#Age 20
#Address: Urban (label encoded as 1)
#Travel Time: 3 (30 minutes to 1 hour)
#Failures: 1
#Desire for Higher Ed: No (0)
#Internet Access: Yes (1)
#Romantic Relationship: Yes (1)
#Relationship with Family: OK (2 out of 5)
#Freetime: a little (3 out of 5)
#Going Out: a bit (2 out of 5)
#Absences: 5

# One row of features, scaled with the fitted scaler before prediction
new_studentC = [[20, 1, 3, 1, 0, 1, 1, 2, 3, 2, 5]]
new_student_scaledC = scale_x.transform(new_studentC)
studentC_prediction_RF = RF_regression.predict(new_student_scaledC)
print("Third (lab RF) new student:"+ str(studentC_prediction_RF))



Third (lab) new student:[1.77]


In [32]:
#Student D (scored with the SVR model)
#NOTE: the feature values below are identical to Student C's, so this cell
#shows how the SVR model scores the same student the random forest scored.
#Age 20
#Address: Urban (label encoded as 1)
#Travel Time: 3 (30 minutes to 1 hour)
#Failures: 1
#Desire for Higher Ed: No (0)
#Internet Access: Yes (1)
#Romantic Relationship: Yes (1)
#Relationship with Family: OK (2 out of 5)
#Freetime: a little (3 out of 5)
#Going Out: a bit (2 out of 5)
#Absences: 5

# One row of features, scaled with the fitted scaler before prediction
new_studentD = [[20, 1, 3, 1, 0, 1, 1, 2, 3, 2, 5]]
new_student_scaledD = scale_x.transform(new_studentD)
studentD_prediction_SVR = svr_regression.predict(new_student_scaledD)
print("Fourth (lab SVR) new student:"+ str(studentD_prediction_SVR))

Fourth (lab SVR) new student:[2.50416303]
