In [110]:
# library import
import pandas as pd
import numpy as np
from os.path import join as pjoin
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

# preprocessing / validation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
)
# ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import classification_report, f1_score
import seaborn as sns

In [111]:
# read data
DATA_DIR = 'data'
df_train = pd.read_csv(pjoin(DATA_DIR, '4-mushrooms-train.csv'), engine='c')
df_test = pd.read_csv(pjoin(DATA_DIR, '4-mushrooms-test.csv'), engine='c')
print(df_train.shape, df_test.shape)

(6499, 23) (1625, 22)


In [112]:
# let's see what data looks like
df_train.head()

Unnamed: 0,target,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0,convex,scaly,brown,bruises,pungent,free,close,narrow,white,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,brown,scattered,urban
1,1,flat,fibrous,gray,bruises,none,free,close,broad,brown,tapering,bulbous,smooth,smooth,white,white,partial,white,one,pendant,brown,several,woods
2,0,flat,smooth,brown,no,none,attached,close,broad,orange,enlarging,missing,smooth,smooth,orange,orange,partial,orange,one,pendant,brown,several,leaves
3,1,convex,fibrous,gray,bruises,none,free,close,broad,brown,tapering,bulbous,smooth,smooth,white,white,partial,white,one,pendant,black,solitary,woods
4,0,knobbed,smooth,brown,no,foul,free,close,narrow,buff,tapering,missing,silky,smooth,pink,pink,partial,white,one,evanescent,white,several,paths


In [113]:

# for convenient calculations, let us merge train with test
df = pd.concat([df_train, df_test], axis=0)
# add column for filtering train/test
df['is_train'] = True
df.loc[df.target.isnull(), 'is_train'] = False
# check shapes
print(df.shape)
# check labels
df.is_train.value_counts()

(8124, 24)


True     6499
False    1625
Name: is_train, dtype: int64

In [114]:
count_of_feature=[(column, len(df[column].value_counts())) for column in df.columns]
    

In [115]:
count_of_feature1=[(column, len(df[column].value_counts())) for column in xtest.columns]

In [116]:
count_of_feature.sort(key=lambda x: x[1], reverse=True)
count_of_feature1.sort(key=lambda x: x[1], reverse=True)

In [117]:
[(i[0],i[1])  for i in count_of_feature if i[1]>2]

[('gill_color', 12),
 ('cap_color', 10),
 ('odor', 9),
 ('spore_print_color', 9),
 ('stalk_color_above_ring', 9),
 ('stalk_color_below_ring', 9),
 ('habitat', 7),
 ('cap_shape', 6),
 ('population', 6),
 ('ring_type', 5),
 ('stalk_root', 5),
 ('cap_surface', 4),
 ('stalk_surface_above_ring', 4),
 ('stalk_surface_below_ring', 4),
 ('veil_color', 4),
 ('ring_number', 3)]

In [118]:
[(i[0],i[1])  for i in count_of_feature1]

[('gill_color', 12),
 ('cap_color', 10),
 ('odor', 9),
 ('stalk_color_above_ring', 9),
 ('stalk_color_below_ring', 9),
 ('spore_print_color', 9),
 ('habitat', 7),
 ('cap_shape', 6),
 ('population', 6),
 ('stalk_root', 5),
 ('ring_type', 5),
 ('cap_surface', 4),
 ('stalk_surface_above_ring', 4),
 ('stalk_surface_below_ring', 4),
 ('veil_color', 4),
 ('ring_number', 3),
 ('bruises', 2),
 ('gill_attachment', 2),
 ('gill_spacing', 2),
 ('gill_size', 2),
 ('stalk_shape', 2),
 ('veil_type', 1)]

In [119]:
for column in df.columns:
    print(df[column].value_counts())

no         4748
bruises    3376
Name: bruises, dtype: int64
brown       2284
gray        1840
red         1500
yellow      1072
white       1040
buff         168
pink         144
cinnamon      44
purple        16
green         16
Name: cap_color, dtype: int64
convex     3656
flat       3152
knobbed     828
bell        452
sunken       32
conical       4
Name: cap_shape, dtype: int64
scaly      3244
smooth     2556
fibrous    2320
grooves       4
Name: cap_surface, dtype: int64
free        7914
attached     210
Name: gill_attachment, dtype: int64
buff         1728
pink         1492
white        1202
brown        1048
gray          752
chocolate     732
purple        492
black         408
red            96
yellow         86
orange         64
green          24
Name: gill_color, dtype: int64
broad     5612
narrow    2512
Name: gill_size, dtype: int64
close      6812
crowded    1312
Name: gill_spacing, dtype: int64
woods      3148
grasses    2148
paths      1144
leaves      832
urban       

In [120]:
print(df.shape)
# your code/hardcoded list goes here


(8124, 24)


In [121]:
# ---------------------------------------------------------------
redundant_columns = [
    'veil_type'
]
# ---------------------------------------------------------------
# lets drop these columns from joint dataset
df = df.drop(redundant_columns, axis=1, errors='ignore')
print(df.shape)

(8124, 23)


In [122]:
ordinal_cols = sorted([
    'ring_number'
])
binary_cols = sorted([
    'bruises',
    'gill_attachment',
    'gill_size',
    'gill_spacing',
    'stalk_shape'
])
categorical_cols = sorted([
     'gill_color',
     'cap_color',
     'odor',
     'spore_print_color',
     'stalk_color_above_ring',
     'stalk_color_below_ring',
     'habitat',
     'cap_shape',
     'population',
     'ring_type',
     'stalk_root',
     'cap_surface',
     'stalk_surface_above_ring',
     'stalk_surface_below_ring',
     'veil_color'
])
# ---------------------------------------------------------------
print('categorical: {}\nordinal: {}\nbinary: {}'.format(
    len(categorical_cols), len(ordinal_cols), len(binary_cols)))

categorical: 15
ordinal: 1
binary: 5


In [123]:
# To be used in training, data must be properly encoded
from collections import defaultdict

# function to encode categorical data


def __encode_categorical(df_list, cat_cols):
    # initialize placeholder
    d = defaultdict(LabelEncoder)
    # fit and encode train/test,
    codes = pd.concat(
        [df[cat_cols] for df in df_list],
        axis=0
    ).fillna('').apply(
        lambda x: d[x.name].fit(x)
    ),
    # transform encodings to train/test etc
    for df in df_list:
        df[cat_cols] = df[cat_cols].fillna('').apply(
            lambda x: d[x.name].transform(x))


# label encode data (categorical + binary)
__encode_categorical(df_list=[df], cat_cols=categorical_cols+binary_cols)
# make sure you encode the only ordinal column in correct order
df[ordinal_cols[0]] = df[ordinal_cols[0]].map({'none': 0, 'one': 1, 'two': 2})

# define useful feature columns to be used for training
# (union of all columns discussed above)
columns_to_use = ordinal_cols + binary_cols + categorical_cols

In [124]:
from os import cpu_count

n_jobs = max(cpu_count()-1, 1)
# your code goes here
OHE_categorial=pd.get_dummies(df[categorical_cols[0]], prefix=categorical_cols[0])
#for column in categorical_cols[1:]:
#    OHE1=pd.get_dummies(df[column], prefix=column)
#    OHE_categorial=pd.concat([OHE, OHE1], axis=1)
#df1=pd.concat([df[binary_cols+ordinal_cols],OHE_categorial, df["is_train"]], axis=1)
# cross-validation iterator
kf = StratifiedKFold(shuffle=True)
# xtrain, ytrain, DataFrame-like
#xtrain = df1[df1["is_train"]].drop(["is_train", 'target'],axis=1)
#ytrain = df1[df1["is_train"]]['target']
xtrain = df[df["is_train"]].drop(["is_train", 'target'],axis=1)
ytrain = df[df["is_train"]]['target']
# ---------------------------------------------------------------

# create Decision Tree with default params, max_depth=4, random_state=42
dt = DecisionTreeClassifier(
    max_depth=3, random_state=42
)

# estimate its f1-score with cross-validation (cross_val_score)
# your code goes here
# ---------------------------------------------------------------
scores_dt = cross_val_score(
    estimator=dt,
    X=xtrain, # ...
    y=ytrain, # ...
    scoring='f1', # ...
    cv=kf, # cross-validation strategy
    n_jobs=n_jobs
).mean()
print('DT scoring: {:.4f}'.format(scores_dt))
# ---------------------------------------------------------------


# create Logistic Regression with default params, random_state=42
lr = LogisticRegression(
    random_state=42
)

# estimate its f1-score with cross-validation
# your code goes here
# ---------------------------------------------------------------
scores_lr = cross_val_score(
    estimator=lr,
    X=xtrain, # ...
    y=ytrain, # ...
    scoring='f1',
    cv=kf,
    n_jobs=n_jobs
).mean()
print('LR scoring: {:.2f}'.format(scores_lr))

DT scoring: 0.9019
LR scoring: 0.89


In [125]:
%%time
# your code goes here
# ---------------------------------------------------------------
# create base model (DT, random state = 42)
estimator = DecisionTreeClassifier(
    max_depth=4, random_state=42
)

# create parameter grid
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
params = {
    'max_depth':range(7,10),
    'min_samples_split':range(22,32,1),
    'max_features':[12],
    'max_leaf_nodes':range(18,25),
    'presort': [True, False]
}

# create grid search object
gs = GridSearchCV(
    estimator=estimator,  # base model
    param_grid=params,  # params grid to search within
    cv=kf,  # cross-validation strategy
    error_score=1,  # warnings only
    scoring='f1',  # f1-score
    # thread count, the higher count - the faster
    n_jobs=n_jobs,
    verbose=2,  # messages about performed actions
)

# perform grid search on TRAIN dataset ('is_train' filtering)
gs.fit(
    X=xtrain, # ...
    y=ytrain, # ...
);
# -------------------------------------------------------------
# extract best score on cross-validation
best_score = gs.best_score_;
# extract the estimator (DT) with best params on cross-validation
best_dt = gs.best_estimator_;
# check gain in f1-score
print('f1-score best: {:.4f}, +{:.4f} better than baseline'.format(
    best_score, (best_score - scores_dt))
)

Fitting 3 folds for each of 420 candidates, totalling 1260 fits
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=22, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samp

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=23, presort=False 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=18, min_samples_split=30, presort=False 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=19, min_samples_split=27, presort=False 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=24, presort=False 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True 
[CV]  max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True 
[CV]  max_depth=7, 

[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=False 
[CV]  max_depth=7, ma

[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=7, m

[CV]  max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=28, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=28, presort=False 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=28, presort=False, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True 
[CV]  max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=7, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False 
[CV]  max_depth=7, ma

[CV]  max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=8, m

[CV]  max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=8, m

[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=22, presort=False 
[CV]  max_depth=8, m

[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=30, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=20, min_samples_split=30, presort=True 
[CV]  max_depth=8, 

[CV]  max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=21, min_samples_split=28, presort=True 
[CV]  max_depth=8, 

[CV]  max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=22, min_samples_split=26, presort=True 
[CV]  max_depth=8, 

[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=24, presort=True 
[CV]  max_depth=8, 

[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=23, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=True 
[CV]  max_depth=8, 

[CV]  max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False 
[CV]  max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=8, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=True 
[CV]  max_depth=8, 

[CV] max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=18, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=19, min_samples_split=26, presort=True 
[CV]  max_depth=9, 

[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=23, presort=False, total=   0.0s
[CV] max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=20, min_samples_split=31, presort=False 
[CV]  max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=21, min_samples_split=29, presort=False 
[CV]  max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=22, min_samples_split=27, presort=False 
[CV]  max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=23, min_samples_split=25, presort=False 
[CV]  max_depth=9, m

[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=22, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=23, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=23, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=23, presort=True 
[CV]  max_depth=9, 

[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=True 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=True, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False 
[CV]  max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=30, presort=False, total=   0.0s
[CV] max_depth=9, max_features=12, max_leaf_nodes=24, min_samples_split=31, presort=True 
[CV]  max_depth=9, 

[Parallel(n_jobs=1)]: Done 1260 out of 1260 | elapsed:   24.9s finished


In [126]:
best_dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=12, max_leaf_nodes=21, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=25, min_weight_fraction_leaf=0.0,
            presort=True, random_state=42, splitter='best')

In [127]:
# check performance on holdout dataset, unseen before (filter 'is_train' == False)

# your code goes here
# ---------------------------------------------------------------
# appropriate df_test data subset from 'df' dataframe
xtest = df_test 
# fit baseline model 'dt' on xtrain, ytrain (because it's not fitted yet)
dt.fit(xtrain, ytrain)
# ---------------------------------------------------------------

# baseline model
y_true = pd.read_csv(pjoin(DATA_DIR, '4-mushrooms-y_test.csv'))
y_pred_baseline = dt.predict_proba(xtest)

print('Base on train:   {:.4f}\nBase on holdout: {:.4f}\ndiff: {:.4f}'.format(
    scores_dt, 
    f1_score(y_true, y_pred_baseline),
    scores_dt - f1_score(y_true, y_pred_baseline)
))
dt1=DecisionTreeClassifier(
    max_depth=9, 
    random_state=42,
    max_features=12,
    max_leaf_nodes=24,
    min_samples_split=29,
    presort=True
    
)
# best model
y_pred_best = dt1.predict_proba(xtest)

print('\nBest on train:   {:.4f}\nBest on holdout: {:.4f}\ndiff: {:.4f}'.format(
    best_score, 
    f1_score(y_true, y_pred_best),
    best_score - f1_score(y_true, y_pred_best)
))

ValueError: could not convert string to float: 'woods'