In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
observations = pd.read_csv("observations.csv")

In [3]:
# Let's put the first 800 observations in our training data.
training_df=observations[0:800]
testing_df=observations[800:]

# And now lets impute the missing data for each set independently
training_df=training_df.fillna(training_df.mean())
testing_df=testing_df.fillna(testing_df.mean())
testing_df.head()

features=training_df.drop('outcome_categorical', axis='columns')
target=training_df['outcome_categorical']

In [4]:
observations.shape

(1188, 62)

In [5]:
features.shape

(800, 61)

In [6]:
target.shape

(800,)

In [7]:
# Converting the Target DataFrame into numerical categories. I.e. "away" to 0 and "home" to 1. This will comvert it to a NumPy Array.
target = training_df['outcome_categorical'].factorize()[0]

# Must convert it from a NumPy Array to a DataFrame
target = pd.DataFrame(target)

In [8]:
# Creating the training variables X_train and y_train.
X_train = features
y_train = target

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
#%run m5p.py

parameters={'max_depth':(3,4,5,6,7,8,9,10), 
            'min_samples_leaf':(1,5,10,15,20,25)}

reg=GridSearchCV(estimator=DecisionTreeClassifier(random_state = 1337), param_grid=parameters, cv=10, scoring='r2')
reg.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=1337),
             param_grid={'max_depth': (3, 4, 5, 6, 7, 8, 9, 10),
                         'min_samples_leaf': (1, 5, 10, 15, 20, 25)},
             scoring='r2')

In [10]:
reg.best_params_

{'max_depth': 7, 'min_samples_leaf': 20}

In [11]:
reg.score

<bound method BaseSearchCV.score of GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=1337),
             param_grid={'max_depth': (3, 4, 5, 6, 7, 8, 9, 10),
                         'min_samples_leaf': (1, 5, 10, 15, 20, 25)},
             scoring='r2')>

In [12]:
from sklearn.metrics import accuracy_score

# Now form a variable which has the correct labels and one which has the predictions
labels=testing_df['outcome_categorical'].factorize()[0]
predictions=reg.predict(testing_df.drop('outcome_categorical', axis='columns'))

accurate = accuracy_score(labels,predictions)

# And let's take a look at our results
print(f"Model Accuracy Score: {accuracy_score(labels,predictions):.4f}")

Model Accuracy Score: 0.5309


In [13]:
X_test=testing_df.drop('outcome_categorical', axis='columns')
y_test=labels

In [15]:
from sklearn.tree import export_graphviz
export_graphviz(reg.best_estimator_, out_file="m5p_tree.dot", 
                feature_names=features.columns, rounded=True, filled=True)

# Now we convert this to a PNG for display and load it up here in a new cell
#!dot -Tpng m5p_tree.dot -o m5p_tree.png
from IPython.display import Image
display(Image('m5p_tree.png'))

FileNotFoundError: No such file or directory: 'm5p_tree.png'

FileNotFoundError: No such file or directory: 'm5p_tree.png'

<IPython.core.display.Image object>