<a href="https://colab.research.google.com/github/Murcha1990/ML_AI24/blob/main/Lesson9_ClassificationBase/ClassificationAndEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  План семинара

1. Линейный классификатор в задаче бинарной классификации
2. Кодирование категориальных признаков

## Задача бинарной классификации

In [None]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m[31m2.5 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

In [None]:
# Seed NumPy's global random generator so that randomised steps repeat between runs.
np.random.seed(42)

In [None]:
# Load the bike-buyers dataset straight from the course repository.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Murcha1990/ML_AI24/refs/heads/main/Lesson9_ClassificationBase/bike_buyers_clean.csv',
)

In [None]:
data

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,No
1,24107,Married,Male,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,No
2,14177,Married,Male,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,No
3,24381,Single,Male,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Yes
4,25597,Single,Male,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,23731,Married,Male,60000,2,High School,Professional,Yes,2,2-5 Miles,North America,54,Yes
996,28672,Single,Male,70000,4,Graduate Degree,Professional,Yes,0,2-5 Miles,North America,35,Yes
997,11809,Married,Male,60000,2,Bachelors,Skilled Manual,Yes,0,0-1 Miles,North America,38,Yes
998,19664,Single,Male,100000,3,Bachelors,Management,No,3,1-2 Miles,North America,38,No


# Обзор данных

In [None]:
# check the dtype of every column in the dataset
data.dtypes

ID                   int64
Marital Status      object
Gender              object
Income               int64
Children             int64
Education           object
Occupation          object
Home Owner          object
Cars                 int64
Commute Distance    object
Region              object
Age                  int64
Purchased Bike      object
dtype: object

In [None]:
# Features: every column except the last one (the target) and the row identifier.
# Chaining .drop() returns a new frame instead of mutating an iloc slice in place,
# which avoids SettingWithCopyWarning and leaves `data` untouched.
X = data.iloc[:, :-1].drop(columns='ID')

# Target: the raw 'Yes' / 'No' purchase flag.
y = data['Purchased Bike']

In [None]:
# Split feature names by dtype: int64 columns are numeric, object columns are categorical.
column_dtypes = X.dtypes
num_cols = column_dtypes[column_dtypes == 'int64'].index.tolist()
cat_cols = column_dtypes[column_dtypes == 'object'].index

numeric_listing = ', '.join(num_cols)
categorical_listing = ', '.join(cat_cols)
print(f"We have {len(num_cols)} numeric columns: {numeric_listing}")
print(f"And {len(cat_cols)} categorical columns: {categorical_listing}")

We have 4 numeric columns: Income, Children, Cars, Age
And 7 categorical columns: Marital Status, Gender, Education, Occupation, Home Owner, Commute Distance, Region


In [None]:
# Show the share of each category for every categorical feature.
for column_name in cat_cols:
    category_shares = X[column_name].value_counts(normalize=True)
    print(column_name)
    display(category_shares)
    print()

Marital Status


Marital Status
Married    0.539
Single     0.461
Name: proportion, dtype: float64


Gender


Gender
Male      0.509
Female    0.491
Name: proportion, dtype: float64


Education


Education
Bachelors              0.306
Partial College        0.265
High School            0.179
Graduate Degree        0.174
Partial High School    0.076
Name: proportion, dtype: float64


Occupation


Occupation
Professional      0.276
Skilled Manual    0.255
Clerical          0.177
Management        0.173
Manual            0.119
Name: proportion, dtype: float64


Home Owner


Home Owner
Yes    0.685
No     0.315
Name: proportion, dtype: float64


Commute Distance


Commute Distance
0-1 Miles     0.366
5-10 Miles    0.192
1-2 Miles     0.169
2-5 Miles     0.162
10+ Miles     0.111
Name: proportion, dtype: float64


Region


Region
North America    0.508
Europe           0.300
Pacific          0.192
Name: proportion, dtype: float64




In [None]:
# we have categorical variables of different kinds!

# Recompute the object-typed columns from X instead of reading (and then overwriting)
# `cat_cols`: the cell stays idempotent and can be re-run without errors.
object_cols = X.columns[X.dtypes == 'object']

# binary: exactly two distinct values
binary_cols = object_cols[X[object_cols].nunique() == 2].tolist()
# ordinal: categories with a natural order (chosen by hand)
ordinal_cols = ['Commute Distance', 'Education']
# nominal: everything that is neither binary nor ordinal
cat_cols = object_cols.difference(binary_cols + ordinal_cols).tolist()

In [None]:
X.describe()

Unnamed: 0,Income,Children,Cars,Age
count,1000.0,1000.0,1000.0,1000.0
mean,56140.0,1.908,1.452,44.19
std,31081.609779,1.626094,1.124705,11.353537
min,10000.0,0.0,0.0,25.0
25%,30000.0,0.0,1.0,35.0
50%,60000.0,2.0,1.0,43.0
75%,70000.0,3.0,2.0,52.0
max,170000.0,5.0,4.0,89.0


In [None]:
X.describe(include='object')

Unnamed: 0,Marital Status,Gender,Education,Occupation,Home Owner,Commute Distance,Region
count,1000,1000,1000,1000,1000,1000,1000
unique,2,2,5,5,2,5,3
top,Married,Male,Bachelors,Professional,Yes,0-1 Miles,North America
freq,539,509,306,276,685,366,508


In [None]:
# classes are balanced !
y.value_counts(normalize=True)

Purchased Bike
No     0.519
Yes    0.481
Name: proportion, dtype: float64

In [None]:
y

0       No
1       No
2       No
3      Yes
4      Yes
      ... 
995    Yes
996    Yes
997    Yes
998     No
999    Yes
Name: Purchased Bike, Length: 1000, dtype: object

In [None]:
# transform y to numeric column (1 = 'Yes', 0 = 'No')
# Derive it from the raw column rather than from `y` itself: re-running
# `y = (y == 'Yes')` on an already encoded target would yield all zeros.
y = (data['Purchased Bike'] == 'Yes').astype(int)
y

0      0
1      0
2      0
3      1
4      1
      ..
995    1
996    1
997    1
998    0
999    1
Name: Purchased Bike, Length: 1000, dtype: int64

# Подготовка данных

## Кодирование категориальных признаков

In [None]:
# run if not installed yet

# !pip install category_encoders

In [None]:
from category_encoders.ordinal import OrdinalEncoder # integer code per category (LabelEncoder-like)
from category_encoders.one_hot import OneHotEncoder # one-hot (dummy) encoding
from category_encoders.target_encoder import TargetEncoder # target-mean counters + smoothing

In [None]:
X['Education'].unique()

array(['Bachelors', 'Partial College', 'High School',
       'Partial High School', 'Graduate Degree'], dtype=object)

In [None]:
# Ordinal: from categories to numbers
# NOTE(review): no explicit mapping is passed; in the output below the codes follow the
# order of first appearance (Bachelors=1, Partial College=2, ...), not the education level.

ord_enc = OrdinalEncoder()
ord_enc.fit_transform(X['Education'])

Unnamed: 0,Education
0,1
1,2
2,2
3,1
4,1
...,...
995,3
996,5
997,1
998,1


In [None]:
# One hot: from k categories to k dummy columns

one_hot_enc = OneHotEncoder()

# category_encoders' OneHotEncoder has no `drop` option (unknown keyword arguments given to
# fit_transform are ignored), so all k dummy columns are always produced.
one_hot_enc.fit_transform(X['Education'])
# * fit -> determine how many new columns are needed (one per category)
# * transform -> create the new columns
# * fit_transform = fit + transform

# Should one of the columns be dropped after this kind of encoding?

Unnamed: 0,Education_1,Education_2,Education_3,Education_4,Education_5
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
995,0,0,1,0,0
996,0,0,0,0,1
997,1,0,0,0,0
998,1,0,0,0,0


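Target encoding с аддитивным сглаживанием вычисляет значения по формуле (здесь $n_{rows}$ — число объектов данной категории, $mean(target)$ — среднее значение таргета по этой категории, $globalMean$ — среднее значение таргета по всей выборке, $\alpha$ — коэффициент сглаживания)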

$$\frac{mean(target)\cdot n_{rows} + \alpha \cdot globalMean}{n_{rows} + \alpha} $$

In [None]:
# target encoding: from k categories to posterior probabilities of y == 1 - P(y==1 | category == c1)

tgt_enc = TargetEncoder(smoothing=1)

# smoothing is the smoothing coefficient alpha: the larger it is, the stronger the regularisation
# NOTE(review): category_encoders' TargetEncoder appears to blend the category mean and the
# global mean with a sigmoid weight rather than the additive formula above — confirm in its docs.

tgt_enc.fit_transform(X['Education'], y)

Unnamed: 0,Education
0,0.552288
1,0.449057
2,0.449057
3,0.552288
4,0.552288
...,...
995,0.441341
996,0.540230
997,0.552288
998,0.552288


In [None]:
# an encoder can be applied to the whole DataFrame at once;
# only the columns listed in `cols` are encoded

tgt_enc = TargetEncoder(cols=['Education', 'Gender', 'Region'])
tgt_enc.fit_transform(X, y)

Unnamed: 0,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,Married,0.486762,40000,1,0.552288,Skilled Manual,Yes,0,0-1 Miles,0.493333,42
1,Married,0.475442,30000,3,0.449057,Clerical,Yes,1,0-1 Miles,0.493333,43
2,Married,0.475442,80000,5,0.449057,Professional,No,2,2-5 Miles,0.493333,60
3,Single,0.475442,70000,0,0.552288,Professional,Yes,1,5-10 Miles,0.588542,41
4,Single,0.475442,30000,0,0.552288,Clerical,No,0,0-1 Miles,0.493333,36
...,...,...,...,...,...,...,...,...,...,...,...
995,Married,0.475442,60000,2,0.441341,Professional,Yes,2,2-5 Miles,0.433071,54
996,Single,0.475442,70000,4,0.540230,Professional,Yes,0,2-5 Miles,0.433071,35
997,Married,0.475442,60000,2,0.552288,Skilled Manual,Yes,0,0-1 Miles,0.433071,38
998,Single,0.475442,100000,3,0.552288,Management,No,3,1-2 Miles,0.433071,38


Помимо сглаживания, для борьбы с переобучением при таргет энкодинге в лекции предлагались и другие методы

- Добавление случайного шума
- Вычисление счетчиков на кросс-валидации
- Expanding mean encoding

Первые две идеи реализованы в классе LeaveOneOutEncoder (библиотека category_encoders)

- значения считаются на основе кросс-валидации вида leave one out (то есть значение энкодинга для конкретного наблюдения будет считаться по всем наблюдениям, кроме этого)
- параметр sigma задаёт стандартное отклонение случайного шума, который добавляется к значению энкодинга (чем больше sigma, тем больше регуляризация)

In [None]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# sigma controls the amount of random noise added to the encoded values
loo_enc = LeaveOneOutEncoder(sigma=3.)

loo_enc.fit_transform(X['Education'], y)

Unnamed: 0,Education
0,1.379784
1,0.263787
2,1.326609
3,3.067564
4,0.163891
...,...
995,0.068666
996,3.436731
997,1.609786
998,-0.395370


## Масштабирование числовых признаков

In [None]:
X['Income']

0       40000
1       30000
2       80000
3       70000
4       30000
        ...  
995     60000
996     70000
997     60000
998    100000
999     60000
Name: Income, Length: 1000, dtype: int64

In [None]:
MinMaxScaler()

In [None]:
MaxAbsScaler()

In [None]:
# Fit a scaler right here so that the cell does not depend on one created in a later cell
# and therefore works after "Restart & Run All".
scaler = StandardScaler().fit(X[['Income']])
scaler.mean_

array([56140.])

In [None]:
# NOTE(review): needs an already fitted `scaler` in the kernel — run the cell that fits it first.
scaler.var_

array([9.651004e+08])

In [None]:
import numpy as np

In [None]:
np.nan

nan

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

scaler= StandardScaler() # x -> (x-mean) / std
scaler.fit_transform(X[['Income']])

# scaler.fit(Xtrain)

# scaler.transform(Xtrain)
# scaler.transform(Xtest)

# fit -> computes the transformation parameters: mean, std (on TRAIN)
# transform -> applies the formula to the column (on both TRAIN and TEST)

array([[-0.51953796],
       [-0.8414326 ],
       [ 0.76804062],
       [ 0.44614598],
       [-0.8414326 ],
       [-1.48522189],
       [ 3.34319779],
       [-0.51953796],
       [-1.16332725],
       [-1.16332725],
       [-0.8414326 ],
       [ 1.08993527],
       [ 3.66509243],
       [-0.51953796],
       [ 0.12425133],
       [-1.48522189],
       [-0.8414326 ],
       [-0.8414326 ],
       [-0.51953796],
       [-1.16332725],
       [-0.51953796],
       [ 0.76804062],
       [-0.51953796],
       [ 0.76804062],
       [-0.51953796],
       [-0.8414326 ],
       [-0.8414326 ],
       [ 1.41182991],
       [ 0.44614598],
       [-1.16332725],
       [-1.16332725],
       [-1.48522189],
       [-1.16332725],
       [ 0.76804062],
       [ 1.08993527],
       [-1.48522189],
       [-1.48522189],
       [-0.8414326 ],
       [-1.16332725],
       [-1.48522189],
       [-0.8414326 ],
       [-0.51953796],
       [-1.48522189],
       [ 3.66509243],
       [-1.16332725],
       [-1

In [None]:
MinMaxScaler()

In [None]:
MaxAbsScaler()

Есть две проблемы:
- класс StandardScaler не умеет работать только на части колонок датафрейма
- классы sklearn возвращают numpy arrays, а не pandas dataframe, что неудобно

In [None]:
num_cols

['Income', 'Children', 'Cars', 'Age']

In [None]:
from sklearn.compose import ColumnTransformer

# Scale only the numeric columns; remainder='passthrough' keeps the other columns as they are
# (the alternative value 'drop' would discard them).
ct = ColumnTransformer([('scaler', StandardScaler(), num_cols)], remainder='passthrough') # 'drop'

In [None]:
ct.fit_transform(X)

array([[-0.5195379574051056, -0.5586728696623785, -1.2916513760469168,
        ..., 'Yes', '0-1 Miles', 'Europe'],
       [-0.8414326026375131, 0.6718841119728166, -0.4020843126537234,
        ..., 'Yes', '0-1 Miles', 'Europe'],
       [0.7680406235245242, 1.9024410936080116, 0.48748275073947, ...,
        'No', '2-5 Miles', 'Europe'],
       ...,
       [0.12425133305970927, 0.05660562115521903, -1.2916513760469168,
        ..., 'Yes', '0-1 Miles', 'North America'],
       [1.4118299139893389, 0.6718841119728166, 1.3770498141326635, ...,
        'No', '1-2 Miles', 'North America'],
       [0.12425133305970927, 0.6718841119728166, 0.48748275073947, ...,
        'Yes', '10+ Miles', 'North America']], dtype=object)

In [None]:
# there is no convenient ready-made implementation - let's write our own!

from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(TransformerMixin, BaseEstimator):
    """Scale a subset of DataFrame columns and return a DataFrame.

    Inheriting from ``BaseEstimator`` (listed last, after the mixin) provides
    ``get_params`` / ``set_params``, so the transformer can be cloned and tuned
    inside pipelines, cross-validation and grid search.

    Parameters
    ----------
    cols : list of str
        Names of the columns to scale; every other column is returned unchanged.
    scaler : transformer, optional
        Object with ``fit`` / ``transform`` that is applied to ``cols``.
        A new ``StandardScaler()`` is used when nothing is passed.
    """

    def __init__(self, cols, scaler=None):
        self.cols = cols
        # Explicit None check: an estimator that defines __len__ could be "falsy"
        # and must not be silently replaced by the default.
        self.scaler = scaler if scaler is not None else StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.cols]`` and return ``self``."""
        # Column selection already yields a new frame; no need to copy all of X.
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` in which the ``self.cols`` columns are scaled."""
        X_res = X.copy()
        # np.asarray keeps positional indexing valid even if the wrapped scaler
        # is configured to return a DataFrame.
        scaled = np.asarray(self.scaler.transform(X_res[self.cols]))
        for i, col in enumerate(self.cols):
            X_res[col] = scaled[:, i]
        return X_res

In [None]:
sc = CustomScaler(num_cols)
X2 = sc.fit_transform(X)

In [None]:
X2

Unnamed: 0,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,Married,Female,-0.519538,-0.558673,Bachelors,Skilled Manual,Yes,-1.291651,0-1 Miles,Europe,-0.192988
1,Married,Male,-0.841433,0.671884,Partial College,Clerical,Yes,-0.402084,0-1 Miles,Europe,-0.104866
2,Married,Male,0.768041,1.902441,Partial College,Professional,No,0.487483,2-5 Miles,Europe,1.393214
3,Single,Male,0.446146,-1.173951,Bachelors,Professional,Yes,-0.402084,5-10 Miles,Pacific,-0.281110
4,Single,Male,-0.841433,-1.173951,Bachelors,Clerical,No,-1.291651,0-1 Miles,Europe,-0.721722
...,...,...,...,...,...,...,...,...,...,...,...
995,Married,Male,0.124251,0.056606,High School,Professional,Yes,0.487483,2-5 Miles,North America,0.864480
996,Single,Male,0.446146,1.287163,Graduate Degree,Professional,Yes,-1.291651,2-5 Miles,North America,-0.809844
997,Married,Male,0.124251,0.056606,Bachelors,Skilled Manual,Yes,-1.291651,0-1 Miles,North America,-0.545477
998,Single,Male,1.411830,0.671884,Bachelors,Management,No,1.377050,1-2 Miles,North America,-0.545477


# Соберем все преобразования данных в pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


def _with_scaler_and_model(*encoder_steps):
    """Build a pipeline: the given encoder steps, then numeric scaling, then logistic regression."""
    return Pipeline([
        *encoder_steps,
        ('scaler_', CustomScaler(num_cols)),
        ('model_', LogisticRegression()),
    ])


# p1: ordinal codes for every categorical column - bad!!!
p1 = _with_scaler_and_model(
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
)

# p2: one-hot for every categorical column
p2 = _with_scaler_and_model(
    ('one_hot_encoder_', OneHotEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
)

# p3: target encoding for every categorical column
p3 = _with_scaler_and_model(
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
)

# p4: ordinal codes for ordinal columns, one-hot for the rest
p4 = _with_scaler_and_model(
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols)),
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols + cat_cols)),
)

# p5: ordinal codes / one-hot / target encoding, one encoder per kind of column
p5 = _with_scaler_and_model(
    ('ordinal_encoder_', OrdinalEncoder(cols=ordinal_cols)),
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols)),
    ('target_encoder_', TargetEncoder(cols=cat_cols)),
)

# p6: one-hot for binary columns, target encoding for nominal and ordinal ones
p6 = _with_scaler_and_model(
    ('one_hot_encoder_', OneHotEncoder(cols=binary_cols)),
    ('target_encoder_', TargetEncoder(cols=cat_cols + ordinal_cols)),
)

In [None]:
# example of working with a pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fix random_state so that the split - and every score computed from it further down -
# is reproducible no matter how much of the global NumPy RNG earlier cells consumed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

p1.fit(X_train, y_train)

y_pred = p1.predict(X_test)

print(accuracy_score(y_test, y_pred))

  elif pd.api.types.is_categorical(cols):


0.632


# Сравнение качества классификации при разных пайплайнах преобразования данных

Вообще существует довольно большое количество метрик для задачи бинарной классификации.

Но для нашей задачи разберем самую простую и интуитивную метрику: accuracy

$accuracy = \frac{1}{n}\sum_{i=1}^{n} [\hat y_i = y_i]$

То есть доля правильных предсказаний

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score
import warnings

# NOTE(review): this silences every warning for the rest of the session, not only the
# library deprecation messages — consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [None]:
# Mean 5-fold cross-validated accuracy for each preprocessing pipeline.
candidate_pipelines = [p1, p2, p3, p4, p5, p6]
for number, pipeline in enumerate(candidate_pipelines, start=1):
    fold_scores = cross_validate(pipeline, X, y, cv=5, scoring='accuracy')['test_score']
    print(f"Pipeline {number}: mean cv accuracy = {fold_scores.mean()}")

Pipeline 1: mean cv accuracy = 0.606
Pipeline 2: mean cv accuracy = 0.616
Pipeline 3: mean cv accuracy = 0.629
Pipeline 4: mean cv accuracy = 0.617
Pipeline 5: mean cv accuracy = 0.619
Pipeline 6: mean cv accuracy = 0.6140000000000001


In [None]:
# Fit the target-encoding pipeline on the training split and score it on the hold-out split.
pred = p3.fit(X_train, y_train).predict(X_test)

accuracy_score(y_test, pred)

0.652

## Подбор порога

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Confusion matrix, precision and recall of the predictions stored in `pred`.
(
    confusion_matrix(y_test, pred),
    precision_score(y_test, pred),
    recall_score(y_test, pred),
)

(array([[98, 42],
        [45, 65]], dtype=int64),
 0.6074766355140186,
 0.5909090909090909)

А чего хотим?

Пусть хотим максимизировать полноту при accuracy >= 0.6

In [None]:
# Predicted probability of class 1 for every test object.
probs = p3.predict_proba(X_test)[:,1]

# Predict class 1 whenever its probability exceeds 0.25.
classes = probs > 0.25

In [None]:
confusion_matrix(y_test, classes), precision_score(y_test, classes), recall_score(y_test, classes), accuracy_score(y_test, classes)

(array([[  6, 134],
        [  0, 110]], dtype=int64),
 0.45081967213114754,
 1.0,
 0.464)

In [None]:
# Search for the threshold that maximises recall subject to accuracy >= 0.6.
# NOTE(review): the threshold is tuned on the test split, so the reported metrics are
# optimistic — a separate validation split is needed for an honest estimate.
max_recall = -1
best_thr = -1
acc = -1

for thr in np.arange(0, 1, 0.01):
    classes = probs > thr

    # Compute each metric once per threshold instead of twice.
    thr_recall = recall_score(y_test, classes)
    thr_acc = accuracy_score(y_test, classes)

    # `>=` matches the stated constraint "accuracy >= 0.6"; the strict `>` on recall
    # keeps the lowest threshold among those with equal recall.
    if thr_recall > max_recall and thr_acc >= 0.6:
        max_recall = thr_recall
        best_thr = thr
        acc = thr_acc

max_recall, best_thr, acc

(0.8454545454545455, 0.4, 0.616)

## Улучшаем качество модели

In [None]:
from sklearn.svm import SVC

# Same preprocessing as p3, with a linear-kernel SVM as the classifier.
p3_svm = Pipeline([
    ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
    ('scaler_', CustomScaler(num_cols)),
    ('model_', SVC(kernel='linear')),
])

pred = p3_svm.fit(X_train, y_train).predict(X_test)

accuracy_score(y_test, pred)

0.652

In [None]:
# Compare SVM kernels on the same preprocessing and the same train/test split.
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    p3_svm = Pipeline([
        ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
        ('scaler_', CustomScaler(num_cols)),
        ('model_', SVC(kernel=kernel)),
    ])
    pred = p3_svm.fit(X_train, y_train).predict(X_test)
    print(kernel, accuracy_score(y_test, pred))

linear 0.652
rbf 0.672
poly 0.684
sigmoid 0.408


In [None]:
# Try polynomial kernels of degree 2..9 on the same preprocessing and split.
# NOTE(review): the degree is compared on the test split; a validation split or
# cross-validation would give a less optimistic comparison.
for degree in np.arange(2,10):
    p3_svm = Pipeline([
        ('target_encoder_', TargetEncoder(cols=ordinal_cols + binary_cols + cat_cols)),
        ('scaler_', CustomScaler(num_cols)),
        ('model_', SVC(kernel='poly', degree=degree)),
    ])
    pred = p3_svm.fit(X_train, y_train).predict(X_test)
    print(degree, accuracy_score(y_test, pred))

2 0.656
3 0.684
4 0.66
5 0.704
6 0.684
7 0.692
8 0.684
9 0.696
