In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ggplot

In [40]:
# Load the whole CSV file and keep every column except the first one.
# NOTE(review): the dropped first column is presumably a saved row index -- confirm.
data_df = (
    pd.read_csv('data.csv', nrows=None)
    .iloc[:, 1:]
)
print('data_df shape:', data_df.shape)

data_df shape: (75, 84)


In [41]:
# Separate the label column ('target') from the remaining feature columns.
y = data_df['target']
X = data_df.drop('target', axis=1)
print('X shape: {}, y shape: {}'.format(X.shape, y.shape))


X shape: (75, 83), y shape: (75,)


In [42]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split with a fixed seed:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 20% of the remaining rows as the validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=42,
)
print('X_train shape: {}, X_val shape: {}, X_test shape: {}'.format(X_train.shape, X_val.shape, X_test.shape))


X_train shape: (48, 83), X_val shape: (12, 83), X_test shape: (15, 83)


In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees. The seed is fixed so that the forest's internal
# randomness (bootstrap samples, per-split feature subsets) is the same on
# every run and the scores printed below can be reproduced after a kernel
# restart.
random_forest = RandomForestClassifier(n_estimators=10, random_state=42)


In [44]:
from sklearn.metrics import classification_report

# Fit on the training split, then print accuracy and the per-class report
# for the training split followed by the validation split.
random_forest.fit(X_train, y_train)
for score_fmt, split_X, split_y in (
    ('Train score: {:.4f}', X_train, y_train),
    ('Validation score: {:.4f}', X_val, y_val),
):
    print(score_fmt.format(random_forest.score(split_X, split_y)))
    print(classification_report(split_y, random_forest.predict(split_X), digits=4))


Train score: 1.0000
                precision    recall  f1-score   support

palais_justice     1.0000    1.0000    1.0000        15
    place_ange     1.0000    1.0000    1.0000        13
  saint_aubain     1.0000    1.0000    1.0000        14
       theatre     1.0000    1.0000    1.0000         6

     micro avg     1.0000    1.0000    1.0000        48
     macro avg     1.0000    1.0000    1.0000        48
  weighted avg     1.0000    1.0000    1.0000        48

Validation score: 1.0000
                precision    recall  f1-score   support

palais_justice     1.0000    1.0000    1.0000         4
    place_ange     1.0000    1.0000    1.0000         3
  saint_aubain     1.0000    1.0000    1.0000         4
       theatre     1.0000    1.0000    1.0000         1

     micro avg     1.0000    1.0000    1.0000        12
     macro avg     1.0000    1.0000    1.0000        12
  weighted avg     1.0000    1.0000    1.0000        12



In [45]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes over the 5 closest training samples.
knn = KNeighborsClassifier(n_neighbors=5)


In [46]:
from sklearn.metrics import classification_report

# Fit on the training split, then print accuracy and the per-class report
# for the training split followed by the validation split.
knn.fit(X_train, y_train)
for score_fmt, split_X, split_y in (
    ('Train score: {:.4f}', X_train, y_train),
    ('Validation score: {:.4f}', X_val, y_val),
):
    print(score_fmt.format(knn.score(split_X, split_y)))
    print(classification_report(split_y, knn.predict(split_X), digits=4))


Train score: 0.9375
                precision    recall  f1-score   support

palais_justice     0.8750    0.9333    0.9032        15
    place_ange     1.0000    0.9231    0.9600        13
  saint_aubain     1.0000    1.0000    1.0000        14
       theatre     0.8333    0.8333    0.8333         6

     micro avg     0.9375    0.9375    0.9375        48
     macro avg     0.9271    0.9224    0.9241        48
  weighted avg     0.9401    0.9375    0.9381        48

Validation score: 1.0000
                precision    recall  f1-score   support

palais_justice     1.0000    1.0000    1.0000         4
    place_ange     1.0000    1.0000    1.0000         3
  saint_aubain     1.0000    1.0000    1.0000         4
       theatre     1.0000    1.0000    1.0000         1

     micro avg     1.0000    1.0000    1.0000        12
     macro avg     1.0000    1.0000    1.0000        12
  weighted avg     1.0000    1.0000    1.0000        12



In [78]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 20. The seed is fixed because scikit-learn
# randomly permutes the candidate features at every split: when several splits
# are equally good, an unseeded tree can differ from one run to the next.
decision_tree = DecisionTreeClassifier(max_depth=20, random_state=42)


In [79]:
from sklearn.metrics import classification_report

# Fit on the training split, then print accuracy and the per-class report
# for the training split followed by the validation split.
decision_tree.fit(X_train, y_train)
for score_fmt, split_X, split_y in (
    ('Train score: {:.4f}', X_train, y_train),
    ('Validation score: {:.4f}', X_val, y_val),
):
    print(score_fmt.format(decision_tree.score(split_X, split_y)))
    print(classification_report(split_y, decision_tree.predict(split_X), digits=4))


Train score: 1.0000
                precision    recall  f1-score   support

palais_justice     1.0000    1.0000    1.0000        15
    place_ange     1.0000    1.0000    1.0000        13
  saint_aubain     1.0000    1.0000    1.0000        14
       theatre     1.0000    1.0000    1.0000         6

     micro avg     1.0000    1.0000    1.0000        48
     macro avg     1.0000    1.0000    1.0000        48
  weighted avg     1.0000    1.0000    1.0000        48

Validation score: 1.0000
                precision    recall  f1-score   support

palais_justice     1.0000    1.0000    1.0000         4
    place_ange     1.0000    1.0000    1.0000         3
  saint_aubain     1.0000    1.0000    1.0000         4
       theatre     1.0000    1.0000    1.0000         1

     micro avg     1.0000    1.0000    1.0000        12
     macro avg     1.0000    1.0000    1.0000        12
  weighted avg     1.0000    1.0000    1.0000        12



In [111]:
def DT_to_PNG(model, feature_names, file_name):
    """Export a fitted decision tree to a PNG image file for inspection.

    Parameters
    ----------
    model : sklearn.tree.DecisionTreeClassifier
        A fitted decision tree; its ``classes_`` attribute supplies the
        class labels shown in the rendered nodes.
    feature_names : list of str
        Feature names, passed through to ``export_graphviz`` to label the
        split conditions.
    file_name : str
        Name of the file to be produced (without the '.png' extension).

    Returns
    -------
    None
        The image is written to ``'<file_name>.png'``.

    Notes
    -----
    This function requires the pydot Python package and the Graphviz library.

    For more information about tree export, see
    https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    import pydot
    from sklearn import tree

    # Nodes must be labelled with the target classes, not the feature names.
    # classes_ is already in the ascending order that export_graphviz expects.
    class_labels = [str(label) for label in model.classes_]

    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no StringIO buffer (formerly taken from sklearn.externals.six, which
    # no longer exists in current scikit-learn) is needed.
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        feature_names=feature_names,
        class_names=class_labels,
        filled=True,
    )

    # graph_from_dot_data returns a list of graphs; the tree is the first one.
    graph = pydot.graph_from_dot_data(dot_source)[0]
    graph.write_png('%s.png' % file_name)
    

In [114]:
import datetime

# Feature names are every column of the dataset except the label column.
features = list(data_df.columns)
features.remove('target')

# str(datetime.now()) contains spaces and ':' characters, which are not valid
# in file names on Windows. Format the timestamp explicitly instead, keeping
# microseconds so that exports made in quick succession get distinct names.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
DT_to_PNG(decision_tree, features, "image_" + timestamp)

In [115]:
# Refit a depth-20 tree on the complete dataset (train + validation + test)
# and export it. The seed is fixed so the exported tree is the same on every run.
decision_tree2 = DecisionTreeClassifier(max_depth=20, random_state=42)
decision_tree2.fit(X, y)

# Use an explicitly formatted timestamp: str(datetime.now()) contains spaces
# and ':' characters, which are not valid in file names on Windows.
# Microseconds are kept so this export never overwrites an earlier one.
DT_to_PNG(decision_tree2, features, "image_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f'))