# Metody ewolucyjne i uczenie się maszyn
## Na kilku zadaniach klasyfikacji z repozytorium UCI porównać metody xgboost i catboost
### Agnieszka Czaplicka, Bartosz Sowul

# test: instalacja potrzebnych modułów

In [11]:
!pip install catboost
!pip install xgboost
!pip install matplotlib
!pip install pandas
!pip install numpy
!pip install scikit_learn
!pip install xlrd

Collecting xlrd
[?25l  Downloading https://files.pythonhosted.org/packages/b0/16/63576a1a001752e34bf8ea62e367997530dc553b689356b9879339cf45a4/xlrd-1.2.0-py2.py3-none-any.whl (103kB)
[K    100% |████████████████████████████████| 112kB 1.1MB/s 
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.2.0


## Ładowanie potrzebnych modułów

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

## Funkcje pomocnicze

### Przygotowanie zbiorów

In [3]:
# zbiory Adult/Annealing/Breast/CTG

from sklearn.preprocessing import MinMaxScaler, Imputer

def prepare(filename):
    if filename[-7:] == 'CTG.xls':
        df = pd.read_excel(filename, sheet_name='Data', header=None, usecols='K:AE,AT', skiprows=2, nrows=2126)
    else:
        df = pd.read_csv(filename, header=None)
    
    if filename[-32:] == 'breast-cancer-wisconsin.data.txt':
        df[6] = df[6].replace('?', 0)
        df[6] = df[6].astype('int', errors='ignore')
        df[6] = df[6].replace(0, np.median(df[6]))
    #print(df.dtypes)

    array = df.values
    X = array[:, 0:df.shape[1]-1]
    print('Przetwarzanie wstępne...')
    for i in range(np.shape(X)[1]):
        if df[i].dtype == object:
            names_in_col = df[i].unique()
            names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
            if names_in_col[0].strip() == '?':
                names_in_col = names_in_col[1:]
            elif names_in_col[len(names_in_col) - 1].strip() == '?':
                names_in_col = names_in_col[:-1]
            #print(names_in_col)
            col_dict = dict(zip(names_in_col, range(1, len(names_in_col)+1)))
            X[:, i] = np.array([col_dict.get(elem, np.NaN) for elem in df[i]])
        else:
            df[i] = df[i].replace('?', np.NaN)
            # df[i] = df[i].fillna(value=0)
            X[:, i] = np.array(df[i])

    if df[df.shape[1]-1].dtype == object:
        names_in_col = df[df.shape[1]-1].unique()
        names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
        #print(names_in_col)
        col_dict = dict(zip(names_in_col, range(len(names_in_col))))
        y = np.array([col_dict.get(elem) for elem in df[df.shape[1]-1]])
    else:
        y = array[:, df.shape[1]-1]

    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    X = imputer.fit_transform(X)
    # scaler = MinMaxScaler(feature_range=(0, 1)) # skalowanie przy walidacji
    # X = scaler.fit_transform(X)
    print('Przetworzono.')
    return X, y

'''
print("Adult")
X, y = prepare('./data/adult/adult.data.txt')
print("\n------------------------------------------------\n\nAnnealing")
X2, y2 = prepare('./data/annealing/anneal.data.txt')
X2 = X2[:, np.concatenate((np.arange(5), [8, 26, 27, 28, 29, 31]), axis=None)]
print("\n------------------------------------------------\n\nBreast")
X3, y3 = prepare('./data/breast/breast-cancer-wisconsin.data.txt')
X3 = X3[:, 1:]
print("\n------------------------------------------------\n\nCTG")
X4, y4 = prepare('./data/cardiotocography/CTG.xls')
'''

'\nprint("Adult")\nX, y = prepare(\'./data/adult/adult.data.txt\')\nprint("\n------------------------------------------------\n\nAnnealing")\nX2, y2 = prepare(\'./data/annealing/anneal.data.txt\')\nX2 = X2[:, np.concatenate((np.arange(5), [8, 26, 27, 28, 29, 31]), axis=None)]\nprint("\n------------------------------------------------\n\nBreast")\nX3, y3 = prepare(\'./data/breast/breast-cancer-wisconsin.data.txt\')\nX3 = X3[:, 1:]\nprint("\n------------------------------------------------\n\nCTG")\nX4, y4 = prepare(\'./data/cardiotocography/CTG.xls\')\n'

### K-krotna walidacja krzyżowa

In [4]:
def cross_validation(X, y, K, clf):
    n = np.shape(X)[0]
    K_cv_ind = np.random.permutation(n)
    for k in range(K):
        print('\nk = ', k)
        ind_test = K_cv_ind[round(k * (1 / K) * n): round((k+1) * (1 / K) * n)]
        #print('test:', ind_test)
        X_test = X[ind_test, :]
        y_test = y[ind_test]
        if k != 0:
            ind_train = K_cv_ind[np.concatenate((np.arange(round(k * (1 / K) * n)),
                                                 np.arange(round((k+1) * (1 / K) * n), n)), axis=None)]
            X_train = X[ind_train, :]
            y_train = y[ind_train]
        else:
            ind_train = K_cv_ind[range(round((k+1) * (1 / K) * n), n)]
            X_train = X[ind_train, :]
            y_train = y[ind_train]
        #print('train:', ind_train)

        max_train = np.amax(X_train, axis=0)
        min_train = np.amin(X_train, axis=0)
        X_train = 2 * (X_train - min_train) / (max_train - min_train) - 1
        X_test = 2 * (X_test - min_train) / (max_train - min_train) - 1

        find_params(clf, X_train, y_train)
        #params = find_params(clf, X_train, y_train)
        #values = predict(X_test, y_test)
        
#cross_validation(X, y, 5, clf1)

### Optymalizacja hiperparametrów przy użyciu RandomizedSearchCV

### Funkcje pomocnicze do raportowania i zapisu wyników eksperymentów

In [47]:
import csv
from pathlib import Path

def save_results(results, filename='results.csv'):
    mode = 'w'
    path = Path(filename)
    if path.is_file():
         mode = 'a'
    with open(filename, mode) as f:
        w = csv.writer(f)
        if mode == 'w':
            w.writerow(results[0].keys())
        else:
            pass
        for result in results:
            w.writerow(result.values())

def report(optimizer, algo, scoring, seed, dataset, n_top=3):
    print('BEST')
    print(f"Mean validation score: {optimizer.cv_results_['mean_test_'+scoring[-1]][optimizer.best_index_]:.3f} \
          (std: {optimizer.cv_results_['std_test_'+scoring[-1]][optimizer.best_index_]:.3f})")
    print(f'Params: {optimizer.best_params_}')
    print()
    
    results = []

    for metric in scoring:
        print(f'Najlepsze hiperparametry ze względu na metrykę {metric}:')
        cv_results = [tup[0] for tup in sorted(enumerate(optimizer.cv_results_['mean_test_'+metric]), key=lambda x:x[1])]
        for idx in cv_results[::-1][:3]:
            results.append({'dataset': dataset, 'seed': seed, 'algo': type(algo).__name__,
                            'metric': metric, 'model_id': idx,
                            'mean_test_': optimizer.cv_results_['mean_test_'+metric][idx],
                            'std_test_': optimizer.cv_results_['std_test_'+metric][idx],
                            'params': optimizer.cv_results_['params'][idx]})
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      optimizer.cv_results_['mean_test_'+metric][idx],
                      optimizer.cv_results_['std_test_'+metric][idx]))
            print("Params: {0}".format(optimizer.cv_results_['params'][idx]))
            print()
    else:
        save_results(results)

In [48]:
#accuracy, F1 score i logloss
from time import time

from catboost import CatBoostClassifier
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

def find_params(clf, scoring, seed, X=None, y=None):
    params = None
    if type(clf) == XGBClassifier:
        print(f'Optymalizacja hiperparametrów {type(clf).__name__}...')
#learning_rate
#gamma
#max_depth
#min_child_weight
#subsample
#colsample_bytree
#reg_alpha
#reg_lambda
#n_estimators
        params = {
            'learning_rate': st.uniform(0.05, 1.0),
            'gamma': st.uniform(0, 10),
            'max_depth': st.randint(3, 17),
            'min_child_weight': st.expon(0, 50),
            'subsample': st.beta(10, 1),
            'colsample_bytree': st.beta(10, 1) ,
            'reg_alpha': st.expon(0, 50),
            'reg_lambda': st.randint(0, 100),
            'n_estimators': st.randint(3, 50),
        }
    elif type(clf) == CatBoostClassifier:
#learning_rate
#depth
#l2_leaf_reg
#rsm
#random_strength
#iterations
        print(f'Optymalizacja hiperparametrów {type(clf).__name__}...')
        params = {
            'learning_rate': st.uniform(0.05, 1.0),
            'depth': st.randint(3, 17),
            'l2_leaf_reg': st.randint(0, 100),
            'rsm': st.uniform(0.0, 1.0),
            'random_strength': st.uniform(0.05, 10),
            'iterations': st.randint(3, 50),
        }
    else:
        print(f'Błąd: {type(clf)} klasyfikator nie jest obsługiwany.')
    
    # run randomized search
    n_iter_search = 10
    random_search = RandomizedSearchCV(clf, param_distributions=params,
                                       n_iter=n_iter_search, cv=5, scoring=scoring,
                                       refit='neg_log_loss', return_train_score=True, random_state=seed)

    start = time()
    random_search.fit(X, y)
    print(f'\nRandomizedSearchCV trwało {time() - start:.2f} sekund dla {n_iter_search} kandydatów.')
    return random_search

## Testy na zbiorze Adult

In [49]:
print("Zbiór Adult")

scoring = ['accuracy', 'f1', 'neg_log_loss']
seeds = [111]

dataset = 'Adult'
X, y = prepare('./data/adult/adult.data.txt')

for seed in seeds:
    xgb = XGBClassifier()
    ctb = CatBoostClassifier(loss_function='MultiClass', verbose=False)
    
    optimizer_xgb1 = find_params(xgb, scoring, seed, X, y)
    report(optimizer_xgb1, xgb, scoring, seed, dataset)

    optimizer_ctb1 = find_params(ctb, scoring, seed, X, y)
    report(optimizer_ctb1, ctb, scoring, seed, dataset)

Zbiór Adult
Przetwarzanie wstępne...
Przetworzono.
Optymalizacja hiperparametrów XGBClassifier...

RandomizedSearchCV trwało 58.39 sekund dla 10 kandydatów.
BEST
Mean validation score: -0.293           (std: 0.006)
Params: {'colsample_bytree': 0.86900124541044066, 'gamma': 2.1049924581368176, 'learning_rate': 0.85206457718643547, 'max_depth': 3, 'min_child_weight': 8.1923283466419132, 'n_estimators': 39, 'reg_alpha': 11.619779341850629, 'reg_lambda': 20, 'subsample': 0.9003944179579163}

Najlepsze hiperparametry ze względu na metrykę accuracy:
Mean validation score: 0.867 (std: 0.006)
Params: {'colsample_bytree': 0.86900124541044066, 'gamma': 2.1049924581368176, 'learning_rate': 0.85206457718643547, 'max_depth': 3, 'min_child_weight': 8.1923283466419132, 'n_estimators': 39, 'reg_alpha': 11.619779341850629, 'reg_lambda': 20, 'subsample': 0.9003944179579163}

Mean validation score: 0.865 (std: 0.004)
Params: {'colsample_bytree': 0.96822823759893117, 'gamma': 3.6077620064287794, 'learning

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease l


RandomizedSearchCV trwało 357.91 sekund dla 10 kandydatów.
BEST
Mean validation score: -0.291           (std: 0.004)
Params: {'depth': 5, 'iterations': 46, 'l2_leaf_reg': 86, 'learning_rate': 0.84396256047962848, 'random_strength': 8.4556964872428129, 'rsm': 0.81520745745294065}

Najlepsze hiperparametry ze względu na metrykę accuracy:
Mean validation score: 0.867 (std: 0.003)
Params: {'depth': 5, 'iterations': 46, 'l2_leaf_reg': 86, 'learning_rate': 0.84396256047962848, 'random_strength': 8.4556964872428129, 'rsm': 0.81520745745294065}

Mean validation score: 0.865 (std: 0.004)
Params: {'depth': 6, 'iterations': 36, 'l2_leaf_reg': 38, 'learning_rate': 0.89380186955457119, 'random_strength': 7.3120188758759106, 'rsm': 0.27030569335406984}

Mean validation score: 0.858 (std: 0.004)
Params: {'depth': 7, 'iterations': 47, 'l2_leaf_reg': 84, 'learning_rate': 0.35178945091410735, 'random_strength': 8.5890645838724424, 'rsm': 0.29771597903464209}

Najlepsze hiperparametry ze względu na metr

## Testy na zbiorze Annealing

In [7]:
print("Zbiór Annealing")

scoring = ['accuracy', 'f1', 'neg_log_loss']
seeds = [111]

dataset = 'Annealing'
X, y = prepare('./data/annealing/anneal.data.txt')

for seed in seeds:
    xgb = XGBClassifier()
    ctb = CatBoostClassifier(loss_function='MultiClass', verbose=False)
    
    optimizer_xgb2 = find_params(xgb, scoring, seed, X, y)
    report(optimizer_xgb2, xgb, scoring, seed, dataset)

    optimizer_ctb2 = find_params(ctb, scoring, seed, X, y)
    report(optimizer_ctb2, ctb, scoring, seed, dataset)

Zbiór Annealing
Przetwarzanie wstępne...
Przetworzono.


## Testy na zbiorze Breast

In [8]:
print("Zbiór Breast")

scoring = ['accuracy', 'f1', 'neg_log_loss']
seeds = [111]

dataset = 'Breast'
X, y = prepare('./data/breast/breast-cancer-wisconsin.data.txt')

for seed in seeds:
    xgb = XGBClassifier()
    ctb = CatBoostClassifier(loss_function='MultiClass', verbose=False)
    
    optimizer_xgb3 = find_params(xgb, scoring, seed, X, y)
    report(optimizer_xgb3, xgb, scoring, seed, dataset)

    optimizer_ctb3 = find_params(ctb, scoring, seed, X, y)
    report(optimizer_ctb3, ctb, scoring, seed, dataset)

Zbiór Breast
Przetwarzanie wstępne...
Przetworzono.


## Testy na zbiorze CTG

In [9]:
print("Zbiór CTG")

scoring = ['accuracy', 'f1', 'neg_log_loss']
seeds = [111]

dataset = 'Breast'
X, y = prepare('./data/cardiotocography/CTG.xls')

for seed in seeds:
    xgb = XGBClassifier()
    ctb = CatBoostClassifier(loss_function='MultiClass', verbose=False)
    
    optimizer_xgb4 = find_params(xgb, scoring, seed, X, y)
    report(optimizer_xgb4, xgb, scoring, seed, dataset)

    optimizer_ctb4 = find_params(ctb, scoring, seed, X, y)
    report(optimizer_ctb4, ctb, scoring, seed, dataset)

Zbiór CTG


ModuleNotFoundError: No module named 'xlrd'

## Open In Colab Badge

Anybody can open a copy of any github-hosted notebook within Colab. To make it easier to give people access to live views of GitHub-hosted notebooks,
colab provides a [shields.io](http://shields.io/)-style badge, which appears as follows:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

The markdown for the above badge is the following:

```markdown
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)
```

The HTML equivalent is:

```HTML
<a href="https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```

Remember to replace the notebook URL in this template with the notebook you want to link to.