In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from sklearn import linear_model
from pandas.tools.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [None]:
# UCI ML database - energy efficiency (ENB2012).
#
# If the direct download below does not work for you (e.g. no working
# urllib2), a CSV copy can be downloaded from:
#     http://prg.washington.edu/DIRECTfiles/ENB2012_data.csv
#
# Column legend:
#   X1  Relative Compactness
#   X2  Surface Area
#   X3  Wall Area
#   X4  Roof Area
#   X5  Overall Height
#   X6  Orientation
#   X7  Glazing Area
#   X8  Glazing Area Distribution
#   Y1  Heating Load   (accessed as `.Y1` by the modelling cells)
#   Y2  Cooling Load
UCI_energy = pd.read_excel(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'
)

In [None]:
# Display pandas' descriptive-statistics table for the loaded data as a quick sanity check.
UCI_energy.describe()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
train, test = train_test_split(
    UCI_energy,
    test_size=0.2,
    random_state=1010,
)

## Basic decision tree

In [None]:
from sklearn import tree

In [None]:
# Shallow (depth-3) regression tree predicting Y1 from the eight X features.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, even with the default "best" splitter; when several
# splits give the same improvement, an unseeded tree can differ between runs,
# which would change the visualization and the reported errors.
DT=tree.DecisionTreeRegressor(max_depth=3, random_state=0)
DT.fit(train[['X1','X2','X3','X4','X5','X6','X7','X8']],train.Y1)

### Tree visualization and looking at the parity plot for individual instances 

In [None]:
import os
#conda install -c conda-forge pydotplus=2.0.2
# http://www.webgraphviz.com
import pydotplus 

In [None]:
from IPython.display import SVG, display

# Export the fitted tree to Graphviz DOT text once, with readable feature
# names, and reuse that text for both the file on disk and the inline render
# so the two always show the same tree with the same labels.
feature_names = ['X1','X2','X3','X4','X5','X6','X7','X8']
dot_data = tree.export_graphviz(DT, out_file=None, feature_names=feature_names)

# Save the DOT source; it can be pasted into http://www.webgraphviz.com
# The handle `f` is only written to (not rebound) inside the with-block.
with open("basic_tree.dot", 'w') as f:
    f.write(dot_data)
# os.unlink('basic_tree.dot')

graph = pydotplus.graph_from_dot_data(dot_data)
#graph.write_pdf("basic_tree.pdf")

# Render the tree inline as SVG.
display(SVG(graph.create_svg()))


In [None]:
# make predictions on test and train set
feature_cols = ['X1','X2','X3','X4','X5','X6','X7','X8']
trainpred=DT.predict(train[feature_cols])
testpred=DT.predict(test[feature_cols])

# parity plot: predicted vs. actual Y1. Points on the black diagonal are
# perfect predictions. Axis limits are fixed to [0, 50] on both axes so the
# diagonal is a true 45-degree reference.
plt.figure(figsize=(4,4))
plt.xlim([0,50])
plt.ylim([0,50])
plt.scatter(train.Y1,trainpred,label='train')
plt.scatter(test.Y1,testpred,color='r',label='test')
plt.plot([0,50],[0,50],lw=4,color='black')
# Label the axes and the two point sets so the figure can be read on its own.
plt.xlabel('Actual $Y1$')
plt.ylabel('Predicted $Y1$')
plt.legend()

#calculate the test and train error (mean squared error)
print("Train error",mean_squared_error(train.Y1,trainpred))
print("Test error",mean_squared_error(test.Y1,testpred))

## Test and training set error as a function of maximum tree depth 

In [None]:
# MSE on the training and test sets, one entry per candidate depth.
trainerror = []
testerror = []

# Candidate maximum depths 1..19. The name `trees` is kept because the
# plotting cell reads it as the x-axis values.
trees = np.arange(1,20,1)

# Select the feature columns once instead of re-slicing on every iteration.
feature_cols = ['X1','X2','X3','X4','X5','X6','X7','X8']
X_train = train[feature_cols]
X_test = test[feature_cols]

# loop over maximum tree depths (larger depth = more flexible model)
for t in trees:
    # random_state is pinned so tied splits are resolved the same way on every
    # run and the error curves are reproducible.
    model=tree.DecisionTreeRegressor(max_depth=t, random_state=0)
    model.fit(X_train,train.Y1)
    trainerror.append(mean_squared_error(train.Y1,model.predict(X_train)))
    testerror.append(mean_squared_error(test.Y1,model.predict(X_test)))




In [None]:
# Train/test MSE as a function of maximum tree depth.
# Left panel: full range. Right panel: same curves zoomed to depths 5-15 and
# MSE between 0 and 1.
fig, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(8,4))

for ax in (ax_full, ax_zoom):
    # Each legend label names the series that is actually plotted.
    ax.plot(trees,trainerror,marker='o',label='trainerror')
    ax.plot(trees,testerror,marker="s",label='testerror')
    ax.legend()
    ax.set_xlabel('Max tree depth')
    ax.set_ylabel('MSE for $Y1$')

ax_zoom.set_ylim((0,1))
ax_zoom.set_xlim((5,15))

# Keep the right panel's y-label from overlapping the left panel.
fig.tight_layout()