# **Задача look-alike**



1.   взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 21 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import catboost
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, precision_recall_curve
%matplotlib inline

Загрузка датасета <br>
https://archive.ics.uci.edu/ml/datasets/Adult

In [7]:
df = pd.read_csv("/content/drive/MyDrive/adult.data", header = None)
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


Сведения о типах полей и пропусках в датасете

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Привидение целевой переменной к бинарному виду

In [9]:
df[14] = df[14].map({' <=50K': 0, ' >50K': 1}) 

Дисбаланс классов

In [10]:
df[14].value_counts()

0    24720
1     7841
Name: 14, dtype: int64

In [12]:
disbalance = y_train.value_counts()[0] / y_train.value_counts()[1]

Разбиение на test и train

In [11]:
X = df.copy()
X.drop(columns=14, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, df[14], test_size=0.3, shuffle=True, stratify=df[14], random_state=42)

Обучение модели catboost

In [14]:
cb = catboost.CatBoostClassifier(class_weights = [1, disbalance], silent = True, random_state = 42, cat_features = [1, 3, 5, 6, 7, 8, 9, 13], one_hot_max_size = 42 )
cb.fit(X_train, y_train)
y_pred_proba = cb.predict_proba(X_test)[:, 1]

Расчёт метрик

In [15]:
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
fscore = (2 * precision[:-10] * recall[:-10]) / (precision[:-10] + recall[:-10])
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, ROC_AUC=%.3f' % (thresholds[ix], 
                                                                                      fscore[ix],
                                                                                      precision[ix],
                                                                                      recall[ix],
                                                                                     roc_auc_score(y_test, y_pred_proba)))

Best Threshold=0.653434, F-Score=0.737, Precision=0.713, Recall=0.763, ROC_AUC=0.930


**Random Negative Sampling**

Разделение набора данных на множество P и U

In [25]:
np.random.seed(42)
X = df.copy()
X.drop(columns=14, inplace=True)
y_true = df[14]
tp_index = np.array(y_true.loc[y_true==1].index)
np.random.shuffle(tp_index)

In [26]:
metrics = {
    'P_sample_size': [],
    'roc_auc': [],
    'best_threshold': [],
    'precision': [],
    'recall': [],
    'f1-score': []
}

In [27]:
for i in np.linspace(.1, .9, 9):
    p_index = tp_index[:int(i*len(tp_index))]
    y_pu = pd.Series([0]*len(y_true))
    y_pu.loc[p_index] = 1
    u_index = np.array(y_true.loc[~y_true.index.isin(p_index)].index)
    np.random.shuffle(u_index)
    test_index = u_index[:int(0.3*len(y_true))]
    train_index = np.concatenate((u_index[int(0.33*len(y_true)):], p_index))
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_pu_train = y_pu.loc[train_index]
    y_true_test = y_true.loc[test_index]

    cb = catboost.CatBoostClassifier(class_weights = [1, disbalance], silent = True, random_state = 42, cat_features = [1, 3, 5, 6, 7, 8, 9, 13], one_hot_max_size = 42 )
    cb.fit(X_train, y_pu_train)
    y_pred_proba = cb.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_true_test, y_pred_proba)
    fscore = (2 * precision[:-10] * recall[:-10]) / (precision[:-10] + recall[:-10])
    ix = np.argmax(fscore)

    metrics['P_sample_size'].append(i)
    metrics['roc_auc'].append(roc_auc_score(y_true_test, y_pred_proba))
    metrics['best_threshold'].append(thresholds[ix])
    metrics['precision'].append(precision[ix])
    metrics['recall'].append(recall[ix])
    metrics['f1-score'].append(fscore[ix])


Составление таблицы с метриками

In [28]:
report = pd.DataFrame(metrics)
report

Unnamed: 0,P_sample_size,roc_auc,best_threshold,precision,recall,f1-score
0,0.1,0.873741,0.064676,0.508973,0.798707,0.621743
1,0.2,0.89789,0.218096,0.588857,0.726908,0.65064
2,0.3,0.903312,0.324612,0.572072,0.722412,0.638512
3,0.4,0.920368,0.470245,0.608132,0.678947,0.641592
4,0.5,0.913977,0.564082,0.587198,0.657164,0.620214
5,0.6,0.923093,0.656181,0.571652,0.664845,0.614737
6,0.7,0.916178,0.803675,0.66562,0.508393,0.576479
7,0.8,0.922565,0.855505,0.59465,0.463884,0.52119
8,0.9,0.932833,0.902549,0.459854,0.456522,0.458182
