In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("meta-dataset.csv")

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,beta_0,beta_1,beta_2,beta_3,beta_4,beta_5,beta_6,beta_7,label
0,0,0.116358,0.134907,0.278246,0.470489,0.0,0.0,0.0,0.0,1
1,1,0.096939,0.210459,0.269133,0.423469,0.0,0.0,0.0,0.0,1
2,2,0.067696,0.157957,0.483373,0.290974,0.0,0.0,0.0,0.0,1


In [4]:
# Split into X and y
X = df.drop(columns = ["Unnamed: 0", "label"])
y = df["label"]

In [5]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123, shuffle=True)

In [6]:
clf = DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Model accuracy score with criterion entropy: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with criterion entropy: 0.5882


In [7]:
# Hyperparameter tuning
params = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20, None],
    'criterion': ["gini", "entropy"],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 50],
    'class_weight' : ["balanced",None],
}

In [8]:
dt = DecisionTreeClassifier(random_state=42)

In [9]:
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [10]:
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 960 candidates, totalling 3840 fits


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                       14, 20, None],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 15, 20, 50]},
             scoring='accuracy', verbose=1)

In [11]:
grid_search.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=42)

In [12]:
clf_2 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=42)
clf_2.fit(X_train, y_train)
y_pred_2 = clf_2.predict(X_test)
print("Train Accuracy : {}", accuracy_score(y_train, grid_search.predict(X_train)));
print('Test accuracy  : {}', accuracy_score(y_test, y_pred_2))

Train Accuracy : {} 0.6666666666666666
Test accuracy  : {} 0.7647058823529411
