# Strojové učenie

# Zdroje
[Encoding] - https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931

[Stromy] - http://scikit-learn.org/stable/modules/tree.html

# Knižnice

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import pydotplus
import collections

import category_encoders as ce
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
#Normalizujeme si kategoricke atributy na cisla
def encodeByNum(data):
    metadata = data.drop(['CLASS'], axis=1)
    columns = metadata.select_dtypes('object').columns
    metadata = pd.get_dummies(metadata, prefix=columns)
    metadata['CLASS'] = data['CLASS']
    return metadata

In [4]:
def init_dataset2(file, sep):
    # Načítame si súbor s datasetom so separátom ',' a ako index nastavime prvý stĺpec
    data = pd.read_csv(file, sep=sep, index_col=0)
    
    metadata = data.drop(['DATE_OF_BIRTH', 'CLASS'], axis=1)
    col_names = metadata.select_dtypes('object').columns
    metadata['CLASS'] = data['CLASS']
    
    #print(metadata.info())
    return metadata

In [5]:
def init_dataset(file):
    # Načítame si súbor s datasetom so separátom ',' a ako index nastavime prvý stĺpec
    data = pd.read_csv(file, sep=',', index_col=0)
    
    metadata = data.drop(['DATE_OF_BIRTH', 'CLASS'], axis=1)
    col_names = metadata.select_dtypes('object').columns
    metadata['CLASS'] = data['CLASS']
    
    #print(metadata.info())
    return metadata

In [6]:
def getDataSets(train_file, test_file):

    train_data = init_dataset(train_file)
    test_data = init_dataset(test_file)

    #Ziskame si pocet trenovacich dat, aby sme vedeli podla, ktoreho indexu ich neskôr rozdelime
    train_rows_count = train_data.shape[0]

    data = pd.concat([train_data, test_data])
    data.reset_index(drop=True)

    #Transformacia kategorickych na numericke
    data = encodeByNum(data)
    
    #Rozdelenie dat naspat na vstupne datsety
    train_data = data.iloc[:train_rows_count,:]
    test_data = data.iloc[train_rows_count+1:,:]
    
    return train_data, test_data

In [7]:
def getFeaturesLabels(data):
    #Rozdelenie dat na atribúty, ktoré budú trénované na predikovanie triedy
    X = data.drop('CLASS', axis=1)
    y = data['CLASS']

    return X, y

In [8]:
#Zistime si dolezitost atributov
def get_important_features_names(data, clf):
    important_features = []
    for name, importance in zip(data.columns, clf.feature_importances_):
        if importance > 0:
            important_features.append(name)
    return important_features

In [9]:
def prediction(train_X, train_y, test_X, test_y, params):
    clf = DecisionTreeClassifier(**params)
    #Natrénujeme model
    clf = clf.fit(train_X, train_y)
    #Predikujeme triedu pre testovacie dáta
    y_pred = clf.predict(test_X)

    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))
    return clf

# Doplnkové čistenie
- V tejto časti opisujeme doplnkove cistenie dat na zaklade revizie predchadzajuceho odovzdania

In [10]:
classifiers = pd.DataFrame() # tu sa budu ukladat vysledky jednotlivých klasifikatorov

## Výstupné súbory z fáze predspracovania

In [11]:
file_median_train = 'data/clean_train_data/ctrain_median.csv'
file_modus_train = 'data/clean_train_data/ctrain_modus.csv'
file_mean_train = 'data/clean_train_data/ctrain_mean.csv'
file_knn_train = 'data/clean_train_data/ctrain_knn.csv'

file_median_test = 'data/clean_test_data/ctest_median.csv'
file_modus_test = 'data/clean_test_data/ctest_modus.csv'
file_mean_test = 'data/clean_test_data/ctest_mean.csv'
file_knn_test = 'data/clean_test_data/ctest_knn.csv'

## Inicializácia DataFramov

In [12]:
data_median_train, data_median_test = getDataSets(file_median_train, file_median_test)
data_modus_train, data_modus_test = getDataSets(file_modus_train, file_modus_test)
data_mean_train, data_mean_test = getDataSets(file_mean_train, file_mean_test)
data_knn_train, data_knn_test = getDataSets(file_knn_train, file_knn_test)

## Priprava dát na klasifikáciu

In [13]:
train_median_X, train_median_y = getFeaturesLabels(data_median_train)
test_median_X, test_median_y = getFeaturesLabels(data_median_test)

train_modus_X, train_modus_y = getFeaturesLabels(data_modus_train)
test_modus_X, test_modus_y = getFeaturesLabels(data_modus_test)

train_mean_X, train_mean_y = getFeaturesLabels(data_mean_train)
test_mean_X, test_mean_y = getFeaturesLabels(data_mean_test)

train_knn_X, train_knn_y = getFeaturesLabels(data_knn_train)
test_knn_X, test_knn_y = getFeaturesLabels(data_knn_test)

In [14]:
#Inicializacia datasetu so zjednotenými nazvami
test_file = "data/zadanie4/ctest_revision.csv"
train_file = "data/zadanie4/ctrain.csv"

train_data = init_dataset2(train_file, '\t')
test_data = init_dataset2(test_file, '\t')

#Ziskame si pocet trenovacich dat, aby sme vedeli podla, ktoreho indexu ich neskôr rozdelime
train_rows_count = train_data.shape[0]

data = pd.concat([train_data, test_data])
data.reset_index(drop=True)

#Transformacia kategorickych na numericke
data = encodeByNum(data)

#Rozdelenie dat naspat na vstupne datsety
train_data = data.loc[:train_rows_count+1,:]
test_data = data.loc[train_rows_count:,:]

In [15]:
#Rozdelenie dat na atribúty, ktoré budú trénované na predikovanie triedy
train_X = train_data.drop('CLASS', axis=1)
train_y = train_data['CLASS']
#Rozdelenie testovacich dat na predikciu a porovnanie vysledkov predikcie
test_X = test_data.drop('CLASS', axis=1)
test_y = test_data['CLASS']

# Zistujeme najlepsie parametre a atributy

In [16]:
#Vytvoríme Decision tree
clf = DecisionTreeClassifier(random_state=0)

#Nastavujeme hyperparametre
cv_params = {'max_depth': [1,2,3,4] + list(range(5,10,2)), 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1,2,3,4] + list(range(5,10,2))}
ind_params = {'random_state': 0}
optimization = GridSearchCV(clf,
                            cv_params, 
                             scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 

#Inicializacia grid searchu
optimization_median = GridSearchCV(clf, cv_params, scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 
optimization_modus = GridSearchCV(clf, cv_params, scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 
optimization_mean = GridSearchCV(clf, cv_params, scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 
optimization_knn = GridSearchCV(clf, cv_params, scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 

In [17]:
%%time
optimization.fit(train_X, train_y)

Fitting 5 folds for each of 98 candidates, totalling 490 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.7s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    6.6s


CPU times: user 1 s, sys: 288 ms, total: 1.29 s
Wall time: 8.35 s


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:    8.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 7, 9], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3, 4, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=True)

In [18]:
%%time
optimization_median.fit(train_median_X, train_median_y)
optimization_modus.fit(train_modus_X, train_modus_y)
optimization_mean.fit(train_mean_X, train_mean_y)
optimization_knn.fit(train_knn_X, train_knn_y)

Fitting 5 folds for each of 98 candidates, totalling 490 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.6s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    6.6s


Fitting 5 folds for each of 98 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:    8.4s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average,

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.6s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fitting 5 folds for each of 98 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:    8.3s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average,

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.5s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fitting 5 folds for each of 98 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:    7.9s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average,

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.6s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    6.3s


CPU times: user 3.44 s, sys: 866 ms, total: 4.31 s
Wall time: 33.2 s


[Parallel(n_jobs=-1)]: Done 490 out of 490 | elapsed:    8.1s finished


In [19]:
list(filter(lambda x: 'best' in x, dir(optimization)))

['best_estimator_', 'best_index_', 'best_params_', 'best_score_']

In [20]:
#Vypiseme si najlepsie zistene parametre a score
print('median')
print(optimization_median.best_params_)
print(optimization_median.best_score_)
print('modus')
print(optimization_modus.best_params_)
print(optimization_modus.best_score_)
print('mean')
print(optimization_mean.best_params_)
print(optimization_mean.best_score_)
print('knn')
print(optimization_knn.best_params_)
print(optimization_knn.best_score_)

median
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2}
0.8559691847057457
modus
{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1}
0.8595600601656542
mean
{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1}
0.8468154156657198
knn
{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3}
0.8582015526447934


In [21]:
# Zistime si najlepsie parametre pre DecisionTree
best_params = optimization.best_params_

In [22]:
# Zistime najlepsie skore pre nase testovacie data
best_score = optimization.best_score_

In [23]:
best_features_median = get_important_features_names(train_median_X, optimization_median.best_estimator_)
best_features_modus = get_important_features_names(train_modus_X, optimization_modus.best_estimator_)
best_features_mean = get_important_features_names(train_mean_X, optimization_mean.best_estimator_)
best_features_knn = get_important_features_names(train_knn_X, optimization_knn.best_estimator_)

### Model s doplnenými chybajúcimi hodnotami podľa Modus 

In [47]:
# Zistime si najlepsie atributy
best_features = best_features_modus
best_params = optimization_modus.best_params_

print("Best_features: ", best_features)
print("best_params: ", best_params)
#Natrenovanie modelu podla zistených
train_best_data = train_modus_X.loc[:, best_features]
test_best_data = test_modus_X.loc[:, best_features]

result = prediction(train_best_data, train_modus_y, test_best_data, test_modus_y, best_params)
result = prediction(train_modus_X, train_modus_y, test_modus_X, test_modus_y, {})

Best_features:  ['ON_THYROXINE', 'TSH', 'T4U_MEASURED', 'FTI', 'HOURS_PER_WEEK']
best_params:  {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1}
[[931   1   2   0]
 [  0  10   1   0]
 [  1   5   3   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.62      0.91      0.74        11
REPLACEMENT_THERAPY       0.50      0.33      0.40         9
   UNDERREPLACEMENT       1.00      1.00      1.00        17

        avg / total       0.99      0.99      0.99       971

[[932   1   1   0]
 [  0  10   1   0]
 [  1   4   4   0]
 [  1   0   0  16]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.67      0.91      0.77        11
REPLACEMENT_THERAPY       0.67      0.44      0.53         9
   UNDERREPLACEMENT       1.00      0.94      0.97        17

        avg

### Model s doplnenými chybajúcimi hodnotami podľa KNN 

In [46]:
# Zistime si najlepsie atributy
best_features = best_features_knn
best_params = optimization_knn.best_params_

print("Best_features: ", best_features)
print("best_params: ", best_params)

#Natrenovanie modelu podla zistených
train_best_data = train_knn_X.loc[:, best_features]
test_best_data = test_knn_X.loc[:, best_features]

result = prediction(train_best_data, train_knn_y, test_best_data, test_knn_y, best_params)
result = prediction(train_knn_X, train_knn_y, test_knn_X, test_knn_y, {})

Best_features:  ['TSH', 'T4U', 'FTI', 'ON_THYROXINE', 'T4U_MEASURED']
best_params:  {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3}
[[931   0   3   0]
 [  0  10   1   0]
 [  1   5   3   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.67      0.91      0.77        11
REPLACEMENT_THERAPY       0.43      0.33      0.38         9
   UNDERREPLACEMENT       1.00      1.00      1.00        17

        avg / total       0.99      0.99      0.99       971

[[932   1   1   0]
 [  0   9   2   0]
 [  1   4   4   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.64      0.82      0.72        11
REPLACEMENT_THERAPY       0.57      0.44      0.50         9
   UNDERREPLACEMENT       1.00      1.00      1.00        17

        avg / total   

### Model s doplnenými chybajúcimi hodnotami podľa Mean 

In [45]:
# Zistime si najlepsie atributy
best_features = best_features_mean
best_params = optimization_mean.best_params_

print("Best_features: ", best_features)
print("best_params: ", best_params)

#Natrenovanie modelu podla zistených
train_best_data = train_mean_X.loc[:, best_features]
test_best_data = test_mean_X.loc[:, best_features]

result = prediction(train_best_data, train_mean_y, test_best_data, test_mean_y, best_params)
result = prediction(train_mean_X, train_mean_y, test_mean_X, test_mean_y, {})

Best_features:  ['ON_THYROXINE', 'TSH', 'T4U_MEASURED', 'FTI', 'HOURS_PER_WEEK']
best_params:  {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 1}
[[931   1   2   0]
 [  0  10   1   0]
 [  1   5   3   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.62      0.91      0.74        11
REPLACEMENT_THERAPY       0.50      0.33      0.40         9
   UNDERREPLACEMENT       1.00      1.00      1.00        17

        avg / total       0.99      0.99      0.99       971

[[931   1   2   0]
 [  0  10   1   0]
 [  1   4   4   0]
 [  1   0   0  16]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       934
    OVERREPLACEMENT       0.67      0.91      0.77        11
REPLACEMENT_THERAPY       0.57      0.44      0.50         9
   UNDERREPLACEMENT       1.00      0.94      0.97        17

        avg

### Model s doplnenými chybajúcimi hodnotami podľa Median 

In [44]:
# Zistime si najlepsie atributy
best_features = best_features_median
best_params = optimization_median.best_params_

print("Best_features: ", best_features)
print("best_params: ", best_params)

#Natrenovanie modelu podla zistených
train_best_data = train_median_X.loc[:, best_features]
test_best_data = test_median_X.loc[:, best_features]

result = prediction(train_best_data, train_median_y, test_best_data, test_median_y, best_params)
result = prediction(train_median_X, train_median_y, test_median_X, test_median_y, {})

Best_features:  ['AGE', 'TSH', 'T3', 'TT4', 'FTI', 'HOURS_PER_WEEK', 'ON_THYROXINE', 'TSH_MEASURED']
best_params:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2}
[[928   2   2   8]
 [  9   0   0   2]
 [  9   1   0   0]
 [  9   0   1   0]]
                     precision    recall  f1-score   support

           NEGATIVE       0.97      0.99      0.98       940
    OVERREPLACEMENT       0.00      0.00      0.00        11
REPLACEMENT_THERAPY       0.00      0.00      0.00        10
   UNDERREPLACEMENT       0.00      0.00      0.00        10

        avg / total       0.94      0.96      0.95       971

[[926   3   2   9]
 [  8   0   0   3]
 [  9   1   0   0]
 [  8   0   1   1]]
                     precision    recall  f1-score   support

           NEGATIVE       0.97      0.99      0.98       940
    OVERREPLACEMENT       0.00      0.00      0.00        11
REPLACEMENT_THERAPY       0.00      0.00      0.00        10
   UNDERREPLACEMENT       0.08      0.10      0.09      

In [28]:
#Vypiseme si najlepsie atributy pre jednotlive predikovanie podla datasetov
print(best_features)

['AGE', 'TSH', 'T3', 'TT4', 'FTI', 'HOURS_PER_WEEK', 'ON_THYROXINE', 'TSH_MEASURED']


In [42]:
#Natrenovanie modelu podla zistených
train_best_data = train_X
test_best_data = test_X
# TODO
result = prediction(train_best_data, train_y, test_best_data, test_y, {})

[[932   2   1   2]
 [  0   9   2   0]
 [  1   4   4   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      0.99      1.00       937
    OVERREPLACEMENT       0.60      0.82      0.69        11
REPLACEMENT_THERAPY       0.57      0.44      0.50         9
   UNDERREPLACEMENT       0.89      1.00      0.94        17

        avg / total       0.99      0.99      0.99       974



In [31]:
test_best_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 974 entries, 2798 to 971
Data columns (total 8 columns):
AGE               974 non-null float64
TSH               974 non-null float64
T3                974 non-null float64
TT4               974 non-null float64
FTI               974 non-null float64
HOURS_PER_WEEK    974 non-null int64
ON_THYROXINE      974 non-null bool
TSH_MEASURED      974 non-null bool
dtypes: bool(2), float64(5), int64(1)
memory usage: 95.2 KB


In [32]:
print("Predikovanie na trenovacich datach: ",result.score(train_best_data, train_y))
print("Predikovanie na testovacich datach: ",result.score(test_best_data, test_y))
classifiers = classifiers.append([["Decision tree", result.score(train_best_data, train_y), result.score(test_best_data, test_y)]])

Predikovanie na trenovacich datach:  0.9960686204431737
Predikovanie na testovacich datach:  0.9887063655030801


In [33]:
# Visualize data
from sklearn.tree import export_graphviz
dot_data = export_graphviz(result, out_file=None, 
                         feature_names=best_features,  
                         class_names=train_y.unique(),  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render("img/tree_best") 
graph

NameError: name 'graphviz' is not defined

# KNN klasifikátor

In [None]:
cls = KNeighborsClassifier(6)
cls.fit(train_X, train_y)
y_pred_train = cls.predict(train_X)
y_pred_test = cls.predict(test_X)
print(cls)

In [None]:
print("Predikovanie na trenovacich datach: ",accuracy_score(train_y, y_pred_train))
print("Predikovanie na testovacich datach: ",accuracy_score(test_y, y_pred_test))
classifiers = classifiers.append([["KNeighbors", accuracy_score(train_y, y_pred_train), accuracy_score(test_y, y_pred_test)]])

In [None]:
print(confusion_matrix(test_y, y_pred_test))
print(classification_report(test_y, y_pred_test))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

cls = RandomForestClassifier(n_estimators=100, random_state=0)
cls.fit(train_X, train_y)
print(cls)

In [None]:
print("Accuracy na train zložke: {:f}".format(cls.score(train_X, train_y)))
print("Accuracy na test zložke: {:f}".format(cls.score(test_X, test_y)))


In [None]:
test_preds = cls.predict(test_X)

In [None]:
classifiers = classifiers.append([["Random Forest", accuracy_score(train_y, train_preds), cls.score(test_X, test_y)]])

In [None]:
print(classification_report(test_y, test_preds))

In [None]:
for est in cls.estimators_:
    dot_data = export_graphviz(est, out_file=None,  
                             class_names=train_y.unique(),  
                             filled=True, rounded=True,  
                             special_characters=True)  
    graph = graphviz.Source(dot_data)  
    graph.render("img/random_forest") 

In [None]:
graph

# Porovnanie klasifikátorov

V tejto časti sa budeme venovať porovnaniu výsledkov jednotlivých klasifikátorov. 

In [None]:
classifiers.columns = ['CLASSIFIER_NAME', 'TRAIN_ACCURACY', 'TEST_ACCURACY']
classifiers