# Metody ewolucyjne i uczenie się maszyn
## Na kilku zadaniach klasyfikacji z repozytorium UCI porównać metody xgboost i catboost
### Agnieszka Czaplicka, Bartosz Sowul

# test: instalacja potrzebnych modułów

In [2]:
!pip install catboost
!pip install xgboost
!pip install matplotlib
!pip install pandas
!pip install numpy
!pip install scikit_learn
!pip install hyperopt

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/97/83/d5667408fc36f9a83871ededf01ae4e49d4a1d5fed15cb973623975f2634/catboost-0.12.1.1-cp37-none-manylinux1_x86_64.whl (55.5MB)
[K    100% |████████████████████████████████| 55.5MB 498kB/s ta 0:00:01
Collecting enum34 (from catboost)
  Downloading https://files.pythonhosted.org/packages/af/42/cb9355df32c69b553e72a2e28daee25d1611d2c0d9c272aa1d34204205b2/enum34-1.1.6-py3-none-any.whl
[31mtwisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.[0m
Installing collected packages: enum34, catboost
Successfully installed catboost-0.12.1.1 enum34-1.1.6
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/54/21/8b2ec99862903a6d3aed62ce156d21d114b8666e669c46d9e54041df9496/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl (16.6MB)
[

## Ładowanie potrzebnych modułów

In [2]:
import catboost as cb
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost as xgb

## Ładowanie danych

In [7]:
# przyklad na zbiorze breast

from sklearn.preprocessing import MinMaxScaler, Imputer
def prepare(filename):
    df = pd.read_csv(filename, header=None)
    df = df.replace('?', np.NaN)
    df[[6]] = df[[6]].fillna(value=0)

    array = df.values
    X = array[:,1:10]
    y = np.array([1 if elem == 2 else 0 for elem in df[10]])

    imputer = Imputer()
    X = imputer.fit_transform(X)
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    return X, y

X, y = prepare('./data/breast/breast-cancer-wisconsin.data.txt')
X[:10]

array([[ 0.44444444,  0.        ,  0.        ,  0.        ,  0.11111111,
         0.1       ,  0.22222222,  0.        ,  0.        ],
       [ 0.44444444,  0.33333333,  0.33333333,  0.44444444,  0.66666667,
         1.        ,  0.22222222,  0.11111111,  0.        ],
       [ 0.22222222,  0.        ,  0.        ,  0.        ,  0.11111111,
         0.2       ,  0.22222222,  0.        ,  0.        ],
       [ 0.55555556,  0.77777778,  0.77777778,  0.        ,  0.22222222,
         0.4       ,  0.22222222,  0.66666667,  0.        ],
       [ 0.33333333,  0.        ,  0.        ,  0.22222222,  0.11111111,
         0.1       ,  0.22222222,  0.        ,  0.        ],
       [ 0.77777778,  1.        ,  1.        ,  0.77777778,  0.66666667,
         1.        ,  0.88888889,  0.66666667,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.11111111,
         1.        ,  0.22222222,  0.        ,  0.        ],
       [ 0.11111111,  0.        ,  0.11111111,  0.        ,  0

## XGBoost - optymalizacja hiperparametrów

In [None]:
#learning_rate
#gamma
#max_depth
#min_child_weight
#subsample
#colsample_bytree
#reg_alpha
#reg_lambda
#n_estimators

def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        n_jobs=4,
        **params
    )
    
    score = cross_val_score(clf, X, Y, scoring=, cv=StratifiedKFold()).mean()
    print("Score {:.3f} params {}".format(score, params))
    return score

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

print("Hyperopt estimated optimum {}".format(best))

## CatBoost - optymalizacja hiperparametrów

In [None]:
#learning_rate
#depth
#l2_leaf_reg
#rsm
#random_strength
#iterations

def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        n_jobs=4,
        **params
    )
    
    score = cross_val_score(clf, X, Y, scoring=, cv=StratifiedKFold()).mean()
    print("Score {:.3f} params {}".format(score, params))
    return score

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

print("Hyperopt estimated optimum {}".format(best))

## Open In Colab Badge

Anybody can open a copy of any github-hosted notebook within Colab. To make it easier to give people access to live views of GitHub-hosted notebooks,
colab provides a [shields.io](http://shields.io/)-style badge, which appears as follows:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

The markdown for the above badge is the following:

```markdown
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)
```

The HTML equivalent is:

```HTML
<a href="https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```

Remember to replace the notebook URL in this template with the notebook you want to link to.

# Ładowanie i przygotowanie zbiorów danych

In [4]:
# zbiory Adult/Annealing/Breast

from sklearn.preprocessing import MinMaxScaler, Imputer


def prepare(filename):
    df = pd.read_csv(filename, header=None)
    # df = df.replace('?', np.NaN)
    # df = df.fillna(value=0)
    # print(df.select_dtypes(exclude=['int', np.float]))
    if filename[-32:] == 'breast-cancer-wisconsin.data.txt':
        df[6] = df[6].replace('?', 0)
        df[6] = df[6].astype('int', errors='ignore')
        df[6] = df[6].replace(0, np.median(df[6]))
    #print(df.dtypes)

    array = df.values
    X = array[:, 0:df.shape[1]-1]

    for i in range(np.shape(X)[1]):
        if df[i].dtype == object:
            names_in_col = df[i].unique()
            names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
            if names_in_col[0].strip() == '?':
                names_in_col = names_in_col[1:]
            elif names_in_col[len(names_in_col) - 1].strip() == '?':
                names_in_col = names_in_col[:-1]
            print(names_in_col)
            col_dict = dict(zip(names_in_col, range(1, len(names_in_col)+1)))
            X[:, i] = np.array([col_dict.get(elem, np.NaN) for elem in df[i]])
        else:
            df[i] = df[i].replace('?', np.NaN)
            # df[i] = df[i].fillna(value=0)
            X[:, i] = np.array(df[i])

    if df[df.shape[1]-1].dtype == object:
        names_in_col = df[df.shape[1]-1].unique()
        names_in_col = sorted(names_in_col, key=lambda v: (v.lower(), v))
        print(names_in_col)
        col_dict = dict(zip(names_in_col, range(len(names_in_col))))
        y = np.array([col_dict.get(elem) for elem in df[df.shape[1]-1]])
    else:
        y = array[:, df.shape[1]-1]

    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    X = imputer.fit_transform(X)
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    return X, y

print("Adult")
X, y = prepare('./data/adult/adult.data.txt')
print("\n------------------------------------------------\n\nAnnealing")
X2, y2 = prepare('./data/annealing/anneal.data.txt')
X2 = X2[:, np.concatenate((np.arange(5), [8, 26, 27, 28, 29, 31]), axis=None)]
print("\n------------------------------------------------\n\nBreast")
X3, y3 = prepare('./data/breast/breast-cancer-wisconsin.data.txt')
X3 = X3[:, 1:]

Adult
[' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay']
[' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college']
[' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed']
[' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving']
[' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife']
[' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White']
[' Female', ' Male']
[' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecuador',