In [46]:
# Demo from https://stackabuse.com/decision-trees-in-python-with-scikit-learn/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [50]:
# Load az_voter.csv into a DataFrame; the bare last expression displays (rows, columns).
dataset = pd.read_csv(filepath_or_buffer="az_voter.csv")
dataset.shape


(320, 8)

In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): in the recorded output several columns top out at 9 or 99, far
# above their 75th percentile -- these look like sentinel codes rather than
# real answers; confirm against the data dictionary before modelling.
dataset.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Level of political bias,Closely follow news,Registered to vote,Special interests,Enough information,Surrounded by voters,Impact on personal life,Voter
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,7.7625,2.5,1.26875,2.55625,2.046875,2.134375,10.58125,1.23125
std,2.327396,1.119785,0.444004,2.452679,2.205932,2.483312,16.853219,0.422292
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
25%,6.0,2.0,1.0,1.0,1.0,1.0,7.0,1.0
50%,8.0,2.0,1.0,2.0,1.0,1.0,8.0,1.0
75%,10.0,3.0,2.0,2.0,2.0,2.0,10.0,1.0
max,11.0,9.0,2.0,9.0,9.0,9.0,99.0,2.0


In [52]:
# Average of the 'Voter' label column; used further down to put the regression error in context.
mean = dataset.loc[:, 'Voter'].mean()

In [62]:
# X holds every column of the dataset except 'Voter' (the attributes).
# y holds the values of the 'Voter' column (the labels to predict).
X = dataset.drop('Voter', axis=1)
y = dataset['Voter']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the tree. random_state pins the tie-breaking between equally good
# splits so that re-running the notebook produces the same fitted tree.
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = regressor.predict(X_test)

# Side-by-side table of true labels and predictions (indexed like y_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df


Unnamed: 0,Actual,Predicted
181,1,1.0
258,2,1.0
15,1,1.0
65,1,1.0
216,1,1.0
...,...,...
17,1,1.0
1,1,1.0
8,2,2.0
150,2,2.0


In [54]:
from sklearn import metrics

# Compute each error metric once, then report them.
mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
mean_sq_error = metrics.mean_squared_error(y_test, y_pred)
root_mean_sq_error = np.sqrt(mean_sq_error)

print('Mean Absolute Error:', mean_abs_error)
print('Mean Squared Error:', mean_sq_error)
print('Root Mean Squared Error:', root_mean_sq_error)

# Is the mean absolute error less than 10 percent of the mean of all the values in the 'Voter' column?
relative_error = mean_abs_error / mean
print("Divided by Mean values for 'Voter'", relative_error)

Mean Absolute Error: 0.140625
Mean Squared Error: 0.140625
Root Mean Squared Error: 0.375
Divided by Mean values for 'Voter' 0.11421319796954316


In [75]:
# Try classification

# Train the tree. random_state pins the tie-breaking between equally good
# splits so that the fitted tree is the same on every run.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)

# "fit" trains algorithm on training data (and returns the fitted estimator)
clf = clf.fit(X_train, y_train)

# Make predictions with the estimator fitted above (`clf`).
# Note: this replaces the regression predictions previously stored in y_pred.
y_pred = clf.predict(X_test)


In [76]:
# Evaluate the algorithm
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: the off-diagonal cells (0, 1) and (1, 0) count the misclassified samples
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Per-class precision / recall / f1-score, plus overall accuracy
report = classification_report(y_test, y_pred)
print(report)



[[44  7]
 [ 2 11]]
              precision    recall  f1-score   support

           1       0.96      0.86      0.91        51
           2       0.61      0.85      0.71        13

    accuracy                           0.86        64
   macro avg       0.78      0.85      0.81        64
weighted avg       0.89      0.86      0.87        64



In [81]:
# sklearn.externals.six is no longer shipped with scikit-learn; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Write the fitted classifier's tree as Graphviz DOT text into an in-memory buffer
dot_data = StringIO()

export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)

# Rendering to PNG invokes the Graphviz `dot` executable, so the Graphviz
# system package must be installed and on PATH (the pydotplus Python package
# alone is not enough); otherwise create_png() raises
# "InvocationException: GraphViz's executables not found".
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

InvocationException: GraphViz's executables not found