# Strojové učenie

# Zdroje
[Encoding] - https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931

[Stromy] - http://scikit-learn.org/stable/modules/tree.html

# Knižnice

In [195]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import pydotplus
import collections

import category_encoders as ce
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm

In [196]:
from sklearn.preprocessing import LabelEncoder

In [197]:
#Normalizujeme si kategoricke atributy na cisla
def encodeByNum(data):
    metadata = data.drop(['CLASS'], axis=1)
    columns = metadata.select_dtypes('object').columns
    metadata = pd.get_dummies(metadata, prefix=columns)
    metadata['CLASS'] = data['CLASS']
    return metadata

In [198]:
def init_dataset(file):
    # Načítame si súbor s datasetom so separátom ',' a ako index nastavime prvý stĺpec
    data = pd.read_csv(file, sep='\t', index_col=0)
    
    metadata = data.drop(['DATE_OF_BIRTH', 'CLASS'], axis=1)
    col_names = metadata.select_dtypes('object').columns
    metadata['CLASS'] = data['CLASS']
    
    #print(metadata.info())
    return metadata

In [199]:
#Zistime si dolezitost atributov
def get_important_features_names(data, clf):
    important_features = []
    for name, importance in zip(data.columns, clf.feature_importances_):
        if importance > 0:
            important_features.append(name)
    return important_features

In [200]:
def prediction(train_X, train_y, test_X, test_y, params):
    clf = DecisionTreeClassifier(**params)
    #Natrénujeme model
    clf = clf.fit(train_X, train_y)
    #Predikujeme triedu pre testovacie dáta
    y_pred = clf.predict(test_X)

    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))
    return clf

# Doplnkové čistenie
- V tejto časti opisujeme doplnkove cistenie dat na zaklade revizie predchadzajuceho odovzdania

In [226]:
classifiers = pd.DataFrame() # tu sa budu ukladat vysledky jednotlivých klasifikatorov

## Transformacia kategorickych na numerické

In [227]:
#Inicializacia datasetu so zjednotenými nazvami
test_file = "data/zadanie4/ctest_revision.csv"
train_file = "data/zadanie4/ctrain.csv"

train_data = init_dataset(train_file)
test_data = init_dataset(test_file)

#Ziskame si pocet trenovacich dat, aby sme vedeli podla, ktoreho indexu ich neskôr rozdelime
train_rows_count = train_data.shape[0]

data = pd.concat([train_data, test_data])
data.reset_index(drop=True)

#Transformacia kategorickych na numericke
data = encodeByNum(data)

#Rozdelenie dat naspat na vstupne datsety
train_data = data.loc[:train_rows_count+1,:]
test_data = data.loc[train_rows_count:,:]

In [228]:
#Rozdelenie dat na atribúty, ktoré budú trénované na predikovanie triedy
train_X = train_data.drop('CLASS', axis=1)
train_y = train_data['CLASS']
#Rozdelenie testovacich dat na predikciu a porovnanie vysledkov predikcie
test_X = test_data.drop('CLASS', axis=1)
test_y = test_data['CLASS']

# Zistujeme najlepsie parametre a atributy

In [229]:
#Vytvoríme decision tree
clf = DecisionTreeClassifier(random_state=0)

#Nastavujeme hyperparametre
cv_params = {'max_depth': [num**2 for num in range(2,10)], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [num**2 for num in range(2,10)]}
ind_params = {'random_state': 0}
optimization = GridSearchCV(clf,
                            cv_params, 
                             scoring = 'f1_macro', cv = 5, n_jobs = -1, verbose=True) 


In [230]:
%%time
optimization.fit(train_X, train_y)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.1s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_j

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


CPU times: user 1.77 s, sys: 409 ms, total: 2.18 s
Wall time: 15.7 s


[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:   15.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [4, 9, 16, 25, 36, 49, 64, 81], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [4, 9, 16, 25, 36, 49, 64, 81]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=True)

In [231]:
list(filter(lambda x: 'best' in x, dir(optimization)))

['best_estimator_', 'best_index_', 'best_params_', 'best_score_']

In [232]:
# Zistime si najlepsie parametre pre DecisionTree
best_params = optimization.best_params_

In [233]:
# Zistime najlepsie skore pre nase testovacie data
best_score = optimization.best_score_

In [234]:
# Zistime si najlepsie atributy
best_features = get_important_features_names(train_X, optimization.best_estimator_)

In [235]:
#Natrenovanie modelu podla zistených
train_best_data = train_X.loc[:, best_features]
test_best_data = test_X.loc[:, best_features]
# TODO
result = prediction(train_best_data, train_y, test_best_data, test_y, best_params)

[[935   0   2   0]
 [  0  10   1   0]
 [  3   3   3   0]
 [  0   0   0  17]]
                     precision    recall  f1-score   support

           NEGATIVE       1.00      1.00      1.00       937
    OVERREPLACEMENT       0.77      0.91      0.83        11
REPLACEMENT_THERAPY       0.50      0.33      0.40         9
   UNDERREPLACEMENT       1.00      1.00      1.00        17

        avg / total       0.99      0.99      0.99       974



In [236]:
test_best_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 974 entries, 2798 to 971
Data columns (total 5 columns):
AGE             974 non-null float64
ON_THYROXINE    974 non-null bool
TSH             974 non-null float64
T4U_MEASURED    974 non-null bool
FTI             974 non-null float64
dtypes: bool(2), float64(3)
memory usage: 72.3 KB


In [237]:
print("Predikovanie na trenovacich datach: ",result.score(train_best_data, train_y))
print("Predikovanie na testovacich datach: ",result.score(test_best_data, test_y))
classifiers = classifiers.append([["Decision tree", result.score(train_best_data, train_y), result.score(test_best_data, test_y)]])

Predikovanie na trenovacich datach:  0.994639027877055
Predikovanie na testovacich datach:  0.9907597535934292


In [238]:
# Visualize data
import graphviz 
dot_data = tree.export_graphviz(result, out_file=None, 
                         feature_names=best_features,  
                         class_names=train_y.unique(),  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render("img/tree_best") 
graph

NameError: name 'tree' is not defined

# KNN klasifikátor

In [239]:
cls = KNeighborsClassifier(6)
model = cls.fit(train_X, train_y)
y_pred_train = cls.predict(train_X)
y_pred_test = cls.predict(test_X)


In [240]:
print("Predikovanie na trenovacich datach: ",accuracy_score(train_y, y_pred_train))
print("Predikovanie na testovacich datach: ",accuracy_score(test_y, y_pred_test))
classifiers = classifiers.append([["KNeighbors", accuracy_score(train_y, y_pred_train), accuracy_score(test_y, y_pred_test)]])

Predikovanie na trenovacich datach:  0.9689063616869192
Predikovanie na testovacich datach:  0.9620123203285421


In [241]:
print(confusion_matrix(test_y, y_pred_test))
print(classification_report(test_y, y_pred_test))

[[937   0   0   0]
 [ 11   0   0   0]
 [  9   0   0   0]
 [ 17   0   0   0]]
                     precision    recall  f1-score   support

           NEGATIVE       0.96      1.00      0.98       937
    OVERREPLACEMENT       0.00      0.00      0.00        11
REPLACEMENT_THERAPY       0.00      0.00      0.00         9
   UNDERREPLACEMENT       0.00      0.00      0.00        17

        avg / total       0.93      0.96      0.94       974



  'precision', 'predicted', average, warn_for)


# Random Forest Classifier

In [242]:
from sklearn.ensemble import RandomForestClassifier
cls = RandomForestClassifier(criterion="entropy")
model = cls.fit(train_X, train_y)

In [243]:
cls.score(train_X, train_y)

0.9985704074338814

In [244]:
from sklearn.metrics import accuracy_score
train_preds = model.predict(train_X)
accuracy_score(train_y, train_preds)

0.9985704074338814

In [245]:
cls.score(test_X, test_y)

0.9743326488706365

In [246]:
classifiers = classifiers.append([["Random Forest", accuracy_score(train_y, train_preds), cls.score(test_X, test_y)]])

# SVC

In [247]:
# classifier = KNeighborsClassifier(3)
classifier = svm.LinearSVR(kernel='linear', probability=True,
                     random_state=0)

TypeError: __init__() got an unexpected keyword argument 'kernel'

In [248]:
classifier.fit(train_X, train_y) # chcem pravdepodobnosti zaradenia do tried


NameError: name 'classifier' is not defined

In [249]:
probabilities = predict_proba(text_X)
print(classification_report(test_y, probabilities))

NameError: name 'predict_proba' is not defined

# Porovnanie klasifikátorov

V tejto časti sa budeme venovať porovnaniu výsledkov jednotlivých klasifikátorov. 

In [253]:
classifiers.columns = ['CLASSIFIER_NAME', 'TRAIN_ACCURACY', 'TEST_ACCURACY']
classifiers

Unnamed: 0,CLASSIFIER_NAME,TRAIN_ACCURACY,TEST_ACCURACY
0,Decision tree,0.994639,0.99076
0,KNeighbors,0.968906,0.962012
0,Random Forest,0.99857,0.974333
