# Metody ewolucyjne i uczenie się maszyn
## Na kilku zadaniach klasyfikacji z repozytorium UCI porównać metody xgboost i catboost
### Agnieszka Czaplicka, Bartosz Sowul

# test: instalacja potrzebnych modułów

In [1]:
!pip install catboost
!pip install xgboost
!pip install matplotlib
!pip install pandas
!pip install numpy
!pip install scikit_learn
!pip install hyperopt



## Ładowanie potrzebnych modułów

In [1]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold

## Ładowanie i przygotowanie danych

In [3]:
# przyklad na zbiorze breast

from sklearn.preprocessing import MinMaxScaler, Imputer
def prepare(filename):
    df = pd.read_csv(filename, header=None)
    df = df.replace('?', np.NaN)
    df[[6]] = df[[6]].fillna(value=0)

    array = df.values
    X = array[:,1:10]
    y = np.array([1 if elem == 2 else 0 for elem in df[10]])

    imputer = Imputer()
    X = imputer.fit_transform(X)
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    return X, y

X, y = prepare('./data/breast/breast-cancer-wisconsin.data.txt')
X[:10]

array([[ 0.44444444,  0.        ,  0.        ,  0.        ,  0.11111111,
         0.1       ,  0.22222222,  0.        ,  0.        ],
       [ 0.44444444,  0.33333333,  0.33333333,  0.44444444,  0.66666667,
         1.        ,  0.22222222,  0.11111111,  0.        ],
       [ 0.22222222,  0.        ,  0.        ,  0.        ,  0.11111111,
         0.2       ,  0.22222222,  0.        ,  0.        ],
       [ 0.55555556,  0.77777778,  0.77777778,  0.        ,  0.22222222,
         0.4       ,  0.22222222,  0.66666667,  0.        ],
       [ 0.33333333,  0.        ,  0.        ,  0.22222222,  0.11111111,
         0.1       ,  0.22222222,  0.        ,  0.        ],
       [ 0.77777778,  1.        ,  1.        ,  0.77777778,  0.66666667,
         1.        ,  0.88888889,  0.66666667,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.11111111,
         1.        ,  0.22222222,  0.        ,  0.        ],
       [ 0.11111111,  0.        ,  0.11111111,  0.        ,  0

In [2]:
# zbiory Adult/Annealing/Breast/CTG

from sklearn.preprocessing import MinMaxScaler, Imputer


def prepare(filename):
    if filename[-7:] == 'CTG.xls':
        df = pd.read_excel(filename, sheet_name='Data', header=None, usecols='K:AE,AT', skiprows=2, nrows=2126)
    else:
        df = pd.read_csv(filename, header=None)
    
    if filename[-32:] == 'breast-cancer-wisconsin.data.txt':
        df[6] = df[6].replace('?', 0)
        df[6] = df[6].astype('int', errors='ignore')
        df[6] = df[6].replace(0, np.median(df[6]))
    #print(df.dtypes)

    array = df.values
    X = array[:, 0:df.shape[1]-1]

    for i in range(np.shape(X)[1]):
        if df[i].dtype == object:
            names_in_col = df[i].unique()
            names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
            if names_in_col[0].strip() == '?':
                names_in_col = names_in_col[1:]
            elif names_in_col[len(names_in_col) - 1].strip() == '?':
                names_in_col = names_in_col[:-1]
            print(names_in_col)
            col_dict = dict(zip(names_in_col, range(1, len(names_in_col)+1)))
            X[:, i] = np.array([col_dict.get(elem, np.NaN) for elem in df[i]])
        else:
            df[i] = df[i].replace('?', np.NaN)
            # df[i] = df[i].fillna(value=0)
            X[:, i] = np.array(df[i])

    if df[df.shape[1]-1].dtype == object:
        names_in_col = df[df.shape[1]-1].unique()
        names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
        print(names_in_col)
        col_dict = dict(zip(names_in_col, range(len(names_in_col))))
        y = np.array([col_dict.get(elem) for elem in df[df.shape[1]-1]])
    else:
        y = array[:, df.shape[1]-1]

    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    X = imputer.fit_transform(X)
    # scaler = MinMaxScaler(feature_range=(0, 1)) # skalowanie przy walidacji
    # X = scaler.fit_transform(X)
    return X, y


print("Adult")
X, y = prepare('./data/adult/adult.data.txt')
print("\n------------------------------------------------\n\nAnnealing")
X2, y2 = prepare('./data/annealing/anneal.data.txt')
X2 = X2[:, np.concatenate((np.arange(5), [8, 26, 27, 28, 29, 31]), axis=None)]
print("\n------------------------------------------------\n\nBreast")
X3, y3 = prepare('./data/breast/breast-cancer-wisconsin.data.txt')
X3 = X3[:, 1:]
print("\n------------------------------------------------\n\nCTG")
X4, y4 = prepare('./data/cardiotocography/CTG.xls')

Adult
[' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay']
[' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college']
[' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed']
[' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving']
[' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife']
[' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White']
[' Female', ' Male']
[' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecuador',

## Optymalizacja hiperparametrów przy użyciu RandomizedSearchCV

### (wykorzystano kod z dokumentacji scikit-learna, dostępny pod adresem https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html)

In [3]:
#accuracy, F1 score i logloss
from time import time

from catboost import CatBoostClassifier
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

def find_params(clf, X=None, y=None):
    # Utility function to report best scores
    def report(results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    params = None
    if type(clf) == XGBClassifier:
        print('ok')
#learning_rate
#gamma
#max_depth
#min_child_weight
#subsample
#colsample_bytree
#reg_alpha
#reg_lambda
#n_estimators
        params = {
            'learning_rate': st.uniform(0.05, 1.0),
            'gamma': st.uniform(0, 10),
            'max_depth': st.randint(3, 17),
            'min_child_weight': st.expon(0, 50),
            'subsample': st.beta(10, 1),
            'colsample_bytree': st.beta(10, 1) ,
            'reg_alpha': st.expon(0, 50),
            'reg_lambda': st.randint(0, 100),
            'n_estimators': st.randint(3, 50),
        }
    elif type(clf) == CatBoostClassifier:
#learning_rate
#depth
#l2_leaf_reg
#rsm
#random_strength
#iterations
        print('okok')
        params = {
            'learning_rate': st.uniform(0.05, 1.0),
            'depth': st.randint(3, 17),
            'l2_leaf_reg': st.randint(0, 100),
            'rsm': st.uniform(0.0, 1.0),
            'random_strength': st.uniform(0.05, 10),
            'iterations': st.randint(3, 50),
        }
    else:
        print('Błąd: {} klasyfikator nie jest obsługiwany.'.format(type(clf)))
    
    # run randomized search
    n_iter_search = 10
    random_search = RandomizedSearchCV(clf, param_distributions=params,
                                       n_iter=n_iter_search, cv=5,
                                       return_train_score=True, random_state=111)

    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)

def get_scores():
    pass

from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target

clf1 = XGBClassifier()
clf2 = CatBoostClassifier(loss_function='MultiClass', eval_metric='Accuracy', verbose=False)

find_params(clf2, X, y)

okok



Iteration with suspicious time 3.26 sec ignored in overall statistics.

Iteration with suspicious time 3.15 sec ignored in overall statistics.

Iteration with suspicious time 3.47 sec ignored in overall statistics.

Iteration with suspicious time 3.17 sec ignored in overall statistics.

Iteration with suspicious time 3.47 sec ignored in overall statistics.

Iteration with suspicious time 3.47 sec ignored in overall statistics.

Iteration with suspicious time 3.29 sec ignored in overall statistics.

Iteration with suspicious time 3.21 sec ignored in overall statistics.

Iteration with suspicious time 3.26 sec ignored in overall statistics.

Iteration with suspicious time 3.2 sec ignored in overall statistics.

Iteration with suspicious time 3.33 sec ignored in overall statistics.

Iteration with suspicious time 3.19 sec ignored in overall statistics.

Iteration with suspicious time 1.89 sec ignored in overall statistics.

Iteration with suspicious time 3.47 sec ignored in overall stati

RandomizedSearchCV took 468.86 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.199 (std: 0.007)
Parameters: {'depth': 11, 'iterations': 31, 'l2_leaf_reg': 14, 'learning_rate': 0.15403690664266384, 'random_strength': 5.329578051059891, 'rsm': 0.28181935600638564}

Model with rank: 2
Mean validation score: 0.198 (std: 0.006)
Parameters: {'depth': 7, 'iterations': 47, 'l2_leaf_reg': 84, 'learning_rate': 0.35178945091410735, 'random_strength': 8.589064583872442, 'rsm': 0.2977159790346421}

Model with rank: 2
Mean validation score: 0.198 (std: 0.004)
Parameters: {'depth': 6, 'iterations': 36, 'l2_leaf_reg': 38, 'learning_rate': 0.8938018695545712, 'random_strength': 7.312018875875911, 'rsm': 0.27030569335406984}



## K-krotna walidacja krzyżowa

In [None]:
def cross_validation(X, y, K, clf):
    n = np.shape(X)[0]
    K_cv_ind = np.random.permutation(n)
    for k in range(K):
        print('\nk = ', k)
        ind_test = K_cv_ind[round(k * (1 / K) * n): round((k+1) * (1 / K) * n)]
        #print('test:', ind_test)
        X_test = X[ind_test, :]
        y_test = y[ind_test]
        if k != 0:
            ind_train = K_cv_ind[np.concatenate((np.arange(round(k * (1 / K) * n)),
                                                 np.arange(round((k+1) * (1 / K) * n), n)), axis=None)]
            X_train = X[ind_train, :]
            y_train = y[ind_train]
        else:
            ind_train = K_cv_ind[range(round((k+1) * (1 / K) * n), n)]
            X_train = X[ind_train, :]
            y_train = y[ind_train]
        #print('train:', ind_train)

        max_train = np.amax(X_train, axis=0)
        min_train = np.amin(X_train, axis=0)
        X_train = 2 * (X_train - min_train) / (max_train - min_train) - 1
        X_test = 2 * (X_test - min_train) / (max_train - min_train) - 1

        find_params(clf, X_train, y_train)
        #params = find_params(clf, X_train, y_train)
        #values = predict(X_test, y_test)
        
cross_validation(X, y, 5, clf1)

## Open In Colab Badge

Anybody can open a copy of any github-hosted notebook within Colab. To make it easier to give people access to live views of GitHub-hosted notebooks,
colab provides a [shields.io](http://shields.io/)-style badge, which appears as follows:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

The markdown for the above badge is the following:

```markdown
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)
```

The HTML equivalent is:

```HTML
<a href="https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```

Remember to replace the notebook URL in this template with the notebook you want to link to.