# 1. Load dataset
#### In this notebook, we are expected to build a decision tree model that classifies a toy dataset.
#### We will need to read the data from the file (data.csv). It contains 15,000 samples, each with two features and a class label.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit

In [2]:
# The file has no header row (header=None), so pandas labels the columns 0, 1, 2.
df = pd.read_csv('data.csv', header=None)

In [3]:
# Preview the first rows to check that the file parsed into three numeric columns.
df.head()

Unnamed: 0,0,1,2
0,10182.554999,-371.830691,100.0
1,-8493.323486,7009.446179,0.0
2,21322.088204,-390.558362,100.0
3,5473.925002,-1878.223941,0.0
4,-7422.54071,5291.351276,0.0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(15000, 3)

# 2. Prepare dataset
#### Split the data into train and test sets.

In [5]:
# Columns 0 and 1 are used as the features; column 2 is used as the target.
X = df.loc[:, [0, 1]]
y = df.loc[:, 2]

# Hold out a quarter of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 3. Modeling
#### Train a decision tree classifier on the data. We will use DecisionTreeClassifier as well as grid search to tune the hyperparameters.

In [6]:
# Hyperparameter grid: both split criteria crossed with a range of depth limits.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 9, 11, 13, 15, 20, 30],
}

# Seed the tree: scikit-learn permutes the features at every split, so ties
# between equally good splits are otherwise broken differently on each run.
# This matches the random_state=42 already used for the other stochastic steps.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=params, cv=5)

#### Train the best model we found on the whole train set and evaluate the model on the test set.

In [7]:
# Cross-validates every grid combination on the training split; with GridSearchCV's
# default refit=True, the best combination is then refit on all of X_train.
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 9, 11, 13, 15, 20, 30]})

In [8]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

{'criterion': 'entropy', 'max_depth': 7}

In [9]:
# Score the refit best estimator on the held-out test split
# (accuracy, since no custom `scoring` was passed to GridSearchCV).
clf.score(X_test, y_test)

0.8477333333333333

#### Generate 1,200 subsets of the training set, each containing 100 randomly chosen instances using ShuffleSplit.

In [10]:
# Splitter that draws 1200 random subsets of 100 training rows each (seeded for repeatability).
rs = ShuffleSplit(
    n_splits=1200,
    train_size=100,
    random_state=42,
)

In [11]:
# Sanity check: the number of splits the splitter will generate.
rs.get_n_splits(X_train)

1200

#### Train one tree on each subset, using the best model we previously found. Evaluate the performance of the trees using the test set. Inspect whether the accuracy will increase or decrease.

In [13]:
total_score = 0
models_list = []

# ShuffleSplit also yields the complementary "test" positions; only the
# training positions are needed here, so the second item is discarded.
for train_index, _ in rs.split(X_train):
    # Each iteration provides 100 randomly chosen training-row positions.
    # A fixed random_state makes the tree's internal tie-breaking deterministic,
    # so re-running the notebook rebuilds the same trees.
    clf_ = DecisionTreeClassifier(**clf.best_params_, random_state=42)
    clf_.fit(X_train.iloc[train_index], y_train.iloc[train_index])

    # Keep each fitted tree so its predictions can be reused later
    models_list.append(clf_)

    # Accumulate the test accuracy of every small tree
    total_score += clf_.score(X_test, y_test)

In [14]:
# Divide by the number of trees whose test scores were summed into total_score.
avg_score = total_score / len(models_list)

# Average score decreased compared with the single tuned tree
avg_score

0.7889268888888888

#### For each instance in the test set, predict its class using 1200 trees, and keep only the most frequent prediction. Evaluate these predictions. Did you get lower or higher accuracy?

In [15]:
# Predict the whole test set with every stored tree
rows = [model.predict(X_test) for model in models_list]

# DataFrame that contains each model's predictions as its row
all_preds = pd.DataFrame(rows)

In [16]:
# One row per tree, one column per test instance.
all_preds.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3740,3741,3742,3743,3744,3745,3746,3747,3748,3749
0,100.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,0.0,...,100.0,100.0,0.0,100.0,100.0,0.0,100.0,100.0,0.0,100.0
1,100.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,...,0.0,100.0,100.0,100.0,100.0,0.0,100.0,100.0,0.0,100.0
2,100.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,0.0,...,100.0,100.0,0.0,100.0,100.0,0.0,100.0,100.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,...,0.0,100.0,0.0,100.0,100.0,0.0,100.0,100.0,0.0,100.0
4,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,...,0.0,100.0,0.0,100.0,100.0,0.0,100.0,100.0,0.0,100.0


In [17]:
# Column-wise mode: the most frequent prediction for each test instance across all trees.
all_preds.mode()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3740,3741,3742,3743,3744,3745,3746,3747,3748,3749
0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,...,100.0,100.0,0.0,100.0,100.0,0.0,100.0,0.0,0.0,0.0


In [18]:
# Take the first row of the mode table as the majority vote for each test instance.
# NOTE(review): if any column had a tie, mode() would return extra rows and only the
# first listed value is kept here; confirm that this tie-break is acceptable.
most_frequent_preds = all_preds.mode().iloc[0]

In [19]:
# Collect the entries where the true label and the majority vote disagree.
# The test labels get a fresh 0..n-1 index so they line up with the vote's index.
y_true_positional = y_test.reset_index(drop=True)
different_entries = y_true_positional.compare(other=most_frequent_preds)

In [20]:
# The row count is the number of test instances where the vote and the true label differ.
different_entries.shape

(552, 2)

In [21]:
# Accuracy of the majority vote: share of test instances that were not mismatched.
n_test = y_test.shape[0]
n_mismatched = different_entries.shape[0]
new_score = (n_test - n_mismatched) / n_test

# Score increased
new_score

0.8528

### We successfully built a majority-vote ensemble of 1,200 decision trees, each trained on 100 random training instances, and it reached an accuracy of 0.8528 on the test set (compared with 0.8477 for the single tuned tree).