In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree,metrics


from IPython.display import Image, display
# from utils import plot_decision

# from sklearn import tree
# import pydotplus
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image, display
# from utils import plot_decision

## What are decision trees

<ul>
    <li>Extensive usage of Information Theory</li>
    <li>Decision trees can be regarded as a set of <code>if-then</code> rules.</li>
    <li>Or formally speaking, Decision trees represent a <span class="note">disjunction of conjunctive clauses</span></li>
    <li>A hierarchical data structure that represents data by implementing a <a href="https://en.wiktionary.org/wiki/divide_and_conquer">divide and conquer</a> strategy</li>
    <li>Can be used with continuous or categorical input and target features</li>
    <li>Trees can be applied to both regression and classification problems</li>
    <li>Regression Trees and Decision trees</li>
</ul>

<a href="images/DecisionTree_PlayTennis.png"><img src="images/DecisionTree_PlayTennis.png" alt=""></a>

## Data insights

In [10]:
# used for dataframe styling
def color_df_values(val): 
    if val=='yes':
        color = 'green'
    else:
        color = 'red'
    
    return 'color: {}'.format(color)

In [12]:
df = pd.read_csv('../../../../datasets/PlayTennis/play_tennis_demo.csv')
df.head()

Unnamed: 0,outlook,humidity,windy,play
0,sunny,high,False,no
1,sunny,high,True,no
2,overcast,high,False,yes
3,rainy,high,False,yes
4,rainy,normal,False,yes


In [16]:
# color the values in 'play' column:
styled_df = df.style.applymap(color_df_values,subset=['play'])

# the same, with lambda function:
# styled = df.style.applymap(lambda el: 'color:green' if el=="yes" else 'color:red',subset=['play'])
type(styled_df)

pandas.io.formats.style.Styler

In [232]:
outlook_humidity = df.set_index(['outlook', 'humidity']).sort_values('play')
# outlook_humidity.sort_index()

In [233]:
outlook_windy = df.set_index(['outlook', 'windy']).sort_values('play')
# outlook_windy.sort_index()

## Prepare data

### Categorical attributes to values

In [234]:
attribute_names = df.columns[:-1].values
class_name = df.columns[-1:].values

print(attribute_names)
print(class_name)

# categorize attributes
for column in attribute_names:
    df[column] = pd.factorize(df[column])[0]

# categorize class values
df[class_name] = df[class_name].replace('yes', 1)
df[class_name] = df[class_name].replace('no', 0)
# df

['outlook' 'humidity' 'windy']
['play']


## Split to train/test sets

In [244]:
train, test = train_test_split(df, test_size=0.3, train_size = 0.7)
X_train, y_train = train[attribute_names], train[class_name]
X_test, y_test = test[attribute_names], train[class_name]
X_train

Unnamed: 0,outlook,humidity,windy
6,1,1,1
8,0,1,0
10,0,1,1
9,2,1,0
2,1,0,0
0,0,0,0
1,0,0,1
12,1,1,0
11,1,0,1


## Train the model

In [245]:
# ID3 Algorithm (Entropy)
model = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# gini
# dt = DecisionTreeClassifier(max_depth=3, criterion='gini')


fitted = model.fit(X_train,y_train)

## Make predictions

In [246]:
y_pred = fitted.predict(X_test)

### Viualize decision bounaries

In [250]:
# visualize the model's decision regions
from utils import plot_decision

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision(X=X_combined, y=y_combined, classifier=fitted)
plt.legend(loc='upper left')
plt.show()

ImportError: cannot import name 'plot_decision' from 'utils' (/home/nemsys/projects/courses/ProgressBG/ProgressBG-MLwithPython/ProgressBG-MLwithPython-Code/.venv/lib/python3.7/site-packages/utils/__init__.py)

## Evaluate the model performance

In [251]:
# accuracy
# count_misclassified = (y_test != y_pred).sum()
# print('Misclassified samples: {}'.format(count_misclassified))
# accuracy = metrics.accuracy_score(y_test, y_pred)
y_test.shape, y_pred.shape
# print('Accuracy: {:.2f}'.format(accuracy))

((9, 1), (5,))

## Visualize Tree 

In [None]:
def plot_boundary(clf, X, y):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    f, ax = plt.subplots(figsize=(10, 8))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')
    
    plt.show()

# plot_boundary(fitted,X, y)

In [None]:
def vis_tree_exp(fitted):
    dot_data = tree.export_graphviz(fitted, out_file=None, filled=True, rounded=True,
                                feature_names=np.array(attribute_names),  
                                class_names=np.array(['Yes','No']))

    graph = pydotplus.graph_from_dot_data(dot_data)  
    display(Image(graph.create_png()))

In [None]:
def vis_tree(clf):
    dot_data = tree.export_graphviz(clf,
                                feature_names=attribute_names,
                                out_file=None,
                                filled=True,
                                rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)

    colors = ('green', 'red')
    edges = collections.defaultdict(list)

    for edge in graph.get_edge_list():
        edges[edge.get_source()].append(int(edge.get_destination()))

    for edge in edges:
        edges[edge].sort()    
        for i in range(2):
            dest = graph.get_node(str(edges[edge][i]))[0]
            dest.set_fillcolor(colors[i])

    graph.write_png('tree.png') 
    
    return graph
#     return graph.Source(dot_data)


In [None]:
def vis_tree_graphviz(clf):
    from IPython.display import SVG
    import pydotplus
    from sklearn.externals.six import StringIO
    
    dot_data = StringIO()
    export_graphviz(
        clf,
        out_file=dot_data,  
        feature_names=np.array(attribute_names),
        class_names=['No Play','Play'],

        filled=True,
        rounded=True,  
        special_characters=True
    )
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    SVG(graph.create(format='svg'))

In [None]:
graph = vis_tree_exp(fitted)
# display(Image(graph.create_png()))


## Confusion Matrix

In [252]:
from sklearn.metrics import confusion_matrix, classification_report

# Compute confusion matrix
test_output = decision_tree_binary_classifier.predict(test[categorical_cols])
cm = confusion_matrix(test.Play, test_output)

print('Confusion matrix')
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(16, 16))
plt.matshow(cm, cmap=plt.get_cmap('Blues'))
# plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

AttributeError: 'DataFrame' object has no attribute 'Play'

In [256]:
# gen confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

ValueError: Found input variables with inconsistent numbers of samples: [9, 5]