## Lesson 6 HomeWork

In [89]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import catboost as ctb

### 1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [90]:
df_import = pd.read_csv('bank-full.csv', sep=';')

In [91]:
df = df_import.copy()

In [92]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [93]:
df.shape

(45211, 17)

In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [95]:
df.y.replace(('yes', 'no'), (1, 0), inplace=True)

In [96]:
df.y.value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [97]:
df.rename({'y': 'target'}, axis=1, inplace=True)

Пропусков нет, присутствует дисбаланс классов целевой переменной.

### 2. Сделать feature engineering

In [98]:
for i in ['housing', 'loan', 'default']:
    df[i].replace(('yes', 'no'), (1, 0), inplace=True)

In [99]:
df = df.drop(columns=['poutcome', 'day', 'contact'])

In [100]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,month,duration,campaign,pdays,previous,target
0,58,management,married,tertiary,0,2143,1,0,may,261,1,-1,0,0
1,44,technician,single,secondary,0,29,1,0,may,151,1,-1,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,may,76,1,-1,0,0
3,47,blue-collar,married,unknown,0,1506,1,0,may,92,1,-1,0,0
4,33,unknown,single,unknown,0,1,0,0,may,198,1,-1,0,0
5,35,management,married,tertiary,0,231,1,0,may,139,1,-1,0,0
6,28,management,single,tertiary,0,447,1,1,may,217,1,-1,0,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,may,380,1,-1,0,0
8,58,retired,married,primary,0,121,1,0,may,50,1,-1,0,0
9,43,technician,single,secondary,0,593,1,0,may,55,1,-1,0,0


In [101]:
# Делим на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], stratify=df['target'], test_size=0.3, random_state=42)

### 3. Обучить любой классификатор (какой вам нравится)

In [102]:
cat_feats = ['job', 'marital', 'education', 'month']

In [103]:
model = ctb.CatBoostClassifier(cat_features=cat_feats)
model.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x2301d756160>

In [104]:
y_pred = model.predict(X_test)
    
f1 = f1_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='binary')
rec = recall_score(y_test, y_pred, average='binary')

In [105]:
metrics = pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]})

In [106]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.505096,0.694369,0.629944,0.42155


### 4. Далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [107]:
df_sample = df.copy()

In [108]:
pos_ind = df_sample.loc[df_sample['target'] == 1].sample(frac=0.2).index
unlab_ind = df_sample[~df.index.isin(pos_ind)].index

df_sample.loc[pos_ind, 'is_labeled'] = 1
df_sample.loc[unlab_ind, 'is_labeled'] = 0
df_sample['is_labeled'] = df_sample['is_labeled'].astype(int)

In [109]:
df_sample.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,month,duration,campaign,pdays,previous,target,is_labeled
0,58,management,married,tertiary,0,2143,1,0,may,261,1,-1,0,0,0
1,44,technician,single,secondary,0,29,1,0,may,151,1,-1,0,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,may,76,1,-1,0,0,0
3,47,blue-collar,married,unknown,0,1506,1,0,may,92,1,-1,0,0,0
4,33,unknown,single,unknown,0,1,0,0,may,198,1,-1,0,0,0
5,35,management,married,tertiary,0,231,1,0,may,139,1,-1,0,0,0
6,28,management,single,tertiary,0,447,1,1,may,217,1,-1,0,0,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,may,380,1,-1,0,0,0
8,58,retired,married,primary,0,121,1,0,may,50,1,-1,0,0,0
9,43,technician,single,secondary,0,593,1,0,may,55,1,-1,0,0,0


### 5. Применить random negative sampling для построения классификатора в новых условиях

In [110]:
rns_df = df_sample.sample(frac=1)

pos_sample = rns_df[rns_df['is_labeled'] == 1]
neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]

train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]

In [111]:
model = ctb.CatBoostClassifier(cat_features=cat_feats)
model.fit(train_samples.iloc[:, :-2],
          train_samples['is_labeled'], verbose=False)

<catboost.core.CatBoostClassifier at 0x2301d756220>

In [112]:
y_pred = model.predict(test_samples.iloc[:, :-2])
    
f1 = f1_score(test_samples['target'], y_pred)
roc = roc_auc_score(test_samples['target'], y_pred)
prec = precision_score(test_samples['target'], y_pred, average='binary')
rec = recall_score(test_samples['target'], y_pred, average='binary')

In [113]:
metrics = metrics.append(pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]}))

### 6. Сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [114]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.505096,0.694369,0.629944,0.42155
0,0.463695,0.82809,0.31936,0.846079


По roc-auc RNS оказался лучше чем обычная модель. У RNS сильно увеличился recall и упал precision.

### 7. Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [115]:
rns_metrics = pd.DataFrame(columns=['f1', 'roc-auc', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.9, 9)

for frac in fracs:
    df_sample = df.copy()

    pos_ind = df_sample.loc[df_sample['target'] == 1].sample(frac=frac).index
    unlab_ind = df_sample[~df.index.isin(pos_ind)].index

    df_sample.loc[pos_ind, 'is_labeled'] = 1
    df_sample.loc[unlab_ind, 'is_labeled'] = 0
    df_sample['is_labeled'] = df_sample['is_labeled'].astype(int)

    rns_df = df_sample.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]

    model = ctb.CatBoostClassifier(cat_features=cat_feats)
    model.fit(train_samples.iloc[:, :-2],
              train_samples['is_labeled'], verbose=False)
    
    y_pred = model.predict(test_samples.iloc[:, :-2])
    
    f1 = f1_score(test_samples['target'], y_pred)
    roc = roc_auc_score(test_samples['target'], y_pred)
    prec = precision_score(test_samples['target'], y_pred, average='binary')
    rec = recall_score(test_samples['target'], y_pred, average='binary')

    rns_metrics = rns_metrics.append(pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]}))

In [116]:
rns_metrics.index = fracs
rns_metrics

Unnamed: 0,f1,roc-auc,precision,recall
0.1,0.482636,0.8197,0.339766,0.832838
0.2,0.472353,0.827973,0.328965,0.837322
0.3,0.439076,0.833972,0.294763,0.860245
0.4,0.425044,0.838206,0.283779,0.846358
0.5,0.392347,0.843497,0.254942,0.851012
0.6,0.336358,0.848307,0.208359,0.872123
0.7,0.272244,0.843999,0.16146,0.867403
0.8,0.203222,0.844475,0.115136,0.864979
0.9,0.116659,0.843036,0.062587,0.857447


Roc-auc меняется незначительно. F1 выше всего при сэмплинге 10-30%.