# Реализация модели классификации на примере библиотеки Catboost

## Краткая документация к ноутбуку

В ноутбуке представлена релизация модели catboost являющаяся частью полноценного pipeline для машинного обучения. Для примера, используется один из датасетов внутри библиотеки catboost, на котором продемонстрировано использование Catboost для последующей имплементации в пайлпайн.  
**Отличия библиотеки catboost от аналогов. Особенности реализации**  
1. Оптимизирована для работы с категориальными фичами
2. Хорошо оптимизирована для обучения на GPU
3. Использует решающие деревья глубины 1-2 в качестве базовых моделей
4. 

## Подключение необходимых библиотек

In [11]:
import catboost
import catboost.datasets
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
import pandas as pd

## Загрузим данные и подготовим их для обучения

### Подгрузим датасет из Catboost и соберем их в одно целое

In [15]:
dataset_name = "adult" 
data = catboost.datasets.adult()

In [55]:
first_part = data[0]
first_part.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [59]:
first_part.shape[0]

32561

In [61]:
second_part = data[1]
second_part.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [67]:
second_part.shape[0]

16281

In [63]:
df = pd.concat([first_part, second_part], ignore_index = True)

In [69]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [71]:
df.shape

(48842, 15)

### Разобъем датасет на обучающую и тестовую выборки

In [77]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [129]:
X = df.drop(columns = 'income')

In [131]:
y = df['income']

Переделаем столбец y. Закодируем заработок >50к как 1, а <=50к как 0

In [134]:
y = y.replace({'>50K': 1, '<=50K': 0})

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Соберем список категориальных фичей

In [139]:
categorical_features_indices = X.select_dtypes(include=['object']).columns

for col in categorical_features_indices:
    X[col] = X[col].astype('category')

In [141]:
categorical_features_indices

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

## Инициализируем простейшую модель catboost

In [145]:
model = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss', verbose=10)
model.fit(X_train, y_train, cat_features=categorical_features_indices)

CatBoostError: feature names should be a sequence, but got Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [None]:
accuracy = model.score(X_test, y_test)
print(f'Accuracy on test data: {accuracy:.2f}')