In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Import dataset
students = pd.read_csv('student-mat.csv', sep=";")

In [4]:
# Drop G1 and G2 because they will not be considered
students = students.drop(columns = ["G1", "G2"])

In [5]:
# Start looking for correlation: factorize categorical data
students_factorized = students
students_factorized[["school", "sex", "address", "famsize", "Pstatus", 
                     "Mjob", "Fjob", "reason", "guardian", "schoolsup", 
                     "famsup", "paid", "activities", "nursery",
                     "higher", "internet", "romantic"]] = students_factorized[["school", "sex", "address", "famsize",
                                                                               "Pstatus", "Mjob", "Fjob", "reason",
                                                                               "guardian", "schoolsup", "famsup", "paid",
                                                                               "activities", "nursery", "higher", "internet",
                                                                               "romantic"]].apply(lambda x: pd.factorize(x)[0])

In [6]:
# Run correlation on dataset
students_correlation = students_factorized.corr()

In [7]:
# Get only G3 and order by highest to lowest
g3corr = students_correlation.sort_values(by = "G3", axis = 1, ascending = False)
g3corr.iloc[[30]]

Unnamed: 0,G3,Medu,Fedu,sex,Mjob,paid,reason,internet,studytime,schoolsup,...,Dalc,Pstatus,health,address,traveltime,romantic,goout,age,higher,failures
G3,1.0,0.217147,0.152457,0.103456,0.102082,0.101996,0.099773,0.098483,0.09782,0.082788,...,-0.05466,-0.058009,-0.061335,-0.105756,-0.117142,-0.12997,-0.132791,-0.161579,-0.182465,-0.360415


In [8]:
# A quick check shows that failures (-0.36), mother's education (0.217), and plans for higher education (-0.18) are strongest
g3corr.iloc[[30]]

Unnamed: 0,G3,Medu,Fedu,sex,Mjob,paid,reason,internet,studytime,schoolsup,...,Dalc,Pstatus,health,address,traveltime,romantic,goout,age,higher,failures
G3,1.0,0.217147,0.152457,0.103456,0.102082,0.101996,0.099773,0.098483,0.09782,0.082788,...,-0.05466,-0.058009,-0.061335,-0.105756,-0.117142,-0.12997,-0.132791,-0.161579,-0.182465,-0.360415


In [9]:
### Decision Tree Regressor model ###

In [10]:
X, y = students.drop(columns = ["G3"]), students[["G3"]]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
# pipelines for numeric attributes and categorical attributes
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scalar", StandardScaler())])
cat_pipe = OneHotEncoder(handle_unknown="ignore")

In [13]:
### Bagging Regressor Ensemble #### Declare numeric attributes and categorical attributes
num_attributes = list(students.drop(columns = ["school", "sex", "address", "famsize", "Pstatus",
                              "Mjob", "Fjob", "reason", "guardian", "schoolsup",
                              "famsup", "paid", "activities", "nursery", "higher",
                              "internet", "romantic", "G3"]))
cat_attributes = ["school", "sex", "address", "famsize", "Pstatus",
                  "Mjob", "Fjob", "reason", "guardian", "schoolsup",
                  "famsup", "paid", "activities", "nursery", "higher",
                  "internet", "romantic"]

In [14]:
# Combined pipeline
full_pipe = ColumnTransformer(transformers=[
                                ("num", num_pipe, num_attributes),
                                ("cat", cat_pipe, cat_attributes)])

In [15]:
# Set model using the pipeline, runs both num and cat attributes through model
tree_reg = Pipeline(steps=[("preprocessor", full_pipe), ("regressor", DecisionTreeRegressor(max_depth=5))])

In [16]:
# Split attributes and target
X, y = students.drop(columns = ["G3"]), students[["G3"]]

In [17]:
# split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [18]:
# Train model
tree_reg.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['age', 'Medu', 'Fedu',
                                                   'traveltime', 'studytime',
                                                   'failures', 'famrel',
                                                   'freetime', 'goout', 'Dalc',
                                                   'Walc', 'health',
                                                   'absences']),
                                                 ('cat',
                                                  OneHotEncoder(handl

In [19]:
# Grab some random student to run a prediction on
some_student_1 = X.iloc[[30]]
tree_reg.predict(some_student_1)

array([12.74074074])

In [20]:
# Student is indeed close to 12 for G3
students.iloc[[30]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
30,0,1,15,0,0,1,4,4,1,2,...,1,0,5,4,2,3,4,5,0,12


In [21]:
# Try it again with another student

In [22]:
some_student_1 = X.iloc[[178]]
tree_reg.predict(some_student_1)

array([11.53284672])

In [23]:
# Not quite as accurate, but still close
students.iloc[[178]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
178,0,1,16,1,0,1,4,2,4,2,...,1,1,4,3,3,3,4,3,10,9


In [24]:
# Give it one more try
some_student_1 = X.iloc[[247]]
tree_reg.predict(some_student_1)

array([5.66666667])

In [25]:
# Again, not quite as accurate
students.iloc[[247]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
247,0,1,22,0,0,1,3,1,3,2,...,1,1,5,4,5,5,5,1,16,8


In [26]:
# Get a cross val prediciton on training set for comparision
y_train_pred = cross_val_predict(tree_reg, X_train, y_train, cv = 3)

In [27]:
# MSE on training data
# pretty high error
mean_squared_error(y_train, y_train_pred)

22.64110463010853

In [28]:
# Make predictions on test set
y_pred = tree_reg.predict(X_test)

In [29]:
# MSE on training set
# The errors are really high, I am guessing that this is due to low correlation between attributes and G3
mean_squared_error(y_test, y_pred)

27.83787809491604

In [30]:
### Bagging Regressor Ensemble ###

In [36]:
# Split attributes and target
X, y = students.drop(columns = ["G3"]), students[["G3"]]

In [37]:
# split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [38]:
# pipelines for numeric attributes and categorical attributes
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scalar", StandardScaler())])
cat_pipe = OneHotEncoder(handle_unknown="ignore")

In [39]:
### Bagging Regressor Ensemble #### Declare numeric attributes and categorical attributes
num_attributes = list(students.drop(columns = ["school", "sex", "address", "famsize", "Pstatus",
                              "Mjob", "Fjob", "reason", "guardian", "schoolsup",
                              "famsup", "paid", "activities", "nursery", "higher",
                              "internet", "romantic", "G3"]))
cat_attributes = ["school", "sex", "address", "famsize", "Pstatus",
                  "Mjob", "Fjob", "reason", "guardian", "schoolsup",
                  "famsup", "paid", "activities", "nursery", "higher",
                  "internet", "romantic"]

In [40]:
# Combined pipeline
full_pipe = ColumnTransformer(transformers=[
                                ("num", num_pipe, num_attributes),
                                ("cat", cat_pipe, cat_attributes)])

In [41]:
# Set model using the pipeline, runs both num and cat attributes through model
bagging_reg = Pipeline(steps=[("preprocessor", full_pipe), ("regressor", BaggingRegressor())])

In [43]:
# Train baggingregressor model
bagging_reg.fit(X_train, y_train)

  return f(*args, **kwargs)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['age', 'Medu', 'Fedu',
                                                   'traveltime', 'studytime',
                                                   'failures', 'famrel',
                                                   'freetime', 'goout', 'Dalc',
                                                   'Walc', 'health',
                                                   'absences']),
                                                 ('cat',
                                                  OneHotEncoder(handl

In [45]:
# Grab some random student to run a prediction on
some_student_1 = X.iloc[[30]]
bagging_reg.predict(some_student_1)

array([10.8])

In [20]:
# Not quite as close as the DTR model
students.iloc[[30]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
30,0,1,15,0,0,1,4,4,1,2,...,1,0,5,4,2,3,4,5,0,12


In [48]:
# Try it again with another student
some_student_1 = X.iloc[[178]]
bagging_reg.predict(some_student_1)

array([9.])

In [49]:
# Much more accurate
students.iloc[[178]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
178,0,1,16,1,0,1,4,2,4,2,...,1,1,4,3,3,3,4,3,10,9


In [50]:
# Give it one more try
some_student_1 = X.iloc[[247]]
bagging_reg.predict(some_student_1)

array([8.2])

In [51]:
# Again, much more accurate than the DTR model
students.iloc[[247]]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
247,0,1,22,0,0,1,3,1,3,2,...,1,1,5,4,5,5,5,1,16,8


In [52]:
# Get a cross val prediciton on training set for comparision
y_train_pred = cross_val_predict(bagging_reg, X_train, y_train, cv = 3)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [53]:
# MSE on training data
# pretty high error, but lower than the DTR
mean_squared_error(y_train, y_train_pred)

16.649549295774648

In [54]:
# Make predictions on test set
y_pred = bagging_reg.predict(X_test)

In [55]:
# MSE on training set
# The errors are really high, but still lower than the DTR
# I am guessing the lowering is due to the bagging model's ability to deal with higher levels of variance
mean_squared_error(y_test, y_pred)

20.095999999999997