### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/ionosphere

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import  precision_score, roc_auc_score, accuracy_score, f1_score

from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, classification_report, precision_recall_curve


class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]


class RenameKey(BaseEstimator, TransformerMixin):
    def __init__(self, new_old_keys_dict):
        self.new_old_keys_dict = new_old_keys_dict

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.rename(self.new_old_keys_dict, axis='columns')
        return X
    
    
class BinEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key, neg_class=None):
        self.key = key
        self.neg_class = neg_class

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.neg_class in X[self.key].unique():
            X.loc[(X[self.key] != self.neg_class), self.key ] = 1
            X.loc[(X[self.key] == self.neg_class), self.key ] = 0
            X[self.key] = pd.to_numeric(X[self.key])
            return X[[self.key]]

        

def evaluate_results(y_test, y_predict, fstr=True):
    f1 = f1_score(y_test, y_predict)
    roc = roc_auc_score(y_test, y_predict)
    prc = precision_score(y_test, y_predict, average='binary')
    rec = recall_score(y_test, y_predict, average='binary')
    if fstr:
        return (f'Classification results:\n'
                f'F1_Score: \t{(f1*100.0):.3f}%\n'
                f'Roc_AUC: \t{(roc*100.0):.3f}%\n'
                f'Precision: \t{(prc*100.0):.3f}%\n'
                f'Recall: \t{(rec*100.0):.3f}%')
    else:
        return None, f1, roc, prc, rec
        

    
def get_metrics(y_test, probs, fstr=True):
    """
    Функция перехода от вероятностей к меткам классов.
    Для этого нужно подобрать порог - Best_Threshold={thresholds[ix]:.3f},
    после которого мы считаем,
    что объект можно отнести к классу 1 
    (если вероятность больше порога -
    размечаем объект как класс 1,
    если нет - класс 0)

    Args:
        y_test ([type]): [Истинные классы]
        probs ([type]): [Предсказанные вероятности принадлежности к классу]
        fstr (bool, optional): [флаг вывода]. Defaults to True.

    Returns:
        if fstr is True:
            [f'str']: [Выводиться f-string в виде: 
                        f'Best_Threshold={thresholds[ix]:.3f},\n'
                        f'F_Score={fscore[ix]:.3f},\n'
                        f'Precision={precision[ix]:.3f},\n'
                        f'Recall={recall[ix]:.3f},\n'
                        f'Roc_AUC={roc_auc_score(y_test, probs)}']
        else:
            [tuple]: [(
                       thresholds[ix]: float,
                       fscore[ix]: float,
                       precision[ix]: float,
                       recall[ix]: float,
                       roc_auc_score(y_test, probs): float
                       )]
    """
    precision, recall, thresholds = precision_recall_curve(y_test, probs)

    fscore = (2 * precision * recall) / (precision + recall)
    roc = roc_auc_score(y_test, probs)
    ix = np.argmax(fscore)
    if fstr:
        return(f'Best_Threshold:\t{thresholds[ix]:.3f},\n'
               f'F1_Score:\t{(fscore[ix]*100.0):.3f}%,\n'
               f'Roc_AUC:\t{(roc*100.0):.3f}%,\n'
               f'Precision:\t{(precision[ix]*100.0):.3f}%,\n'
               f'Recall: \t{(recall[ix]*100.0):.3f}%')
    else:
        return thresholds[ix], fscore[ix], roc, precision[ix], recall[ix]

In [2]:
with open("datasets/ionosphere_description.txt") as file:
    data_description = file.read()
print(data_description)

1. Title: Johns Hopkins University Ionosphere database

2. Source Information:
   -- Donor: Vince Sigillito (vgs@aplcen.apl.jhu.edu)
   -- Date: 1989
   -- Source: Space Physics Group
              Applied Physics Laboratory
              Johns Hopkins University
              Johns Hopkins Road
              Laurel, MD 20723 

3. Past Usage:
   -- Sigillito, V. G., Wing, S. P., Hutton, L. V., \& Baker, K. B. (1989).
      Classification of radar returns from the ionosphere using neural 
      networks. Johns Hopkins APL Technical Digest, 10, 262-266.

      They investigated using backprop and the perceptron training algorithm
      on this database.  Using the first 200 instances for training, which
      were carefully split almost 50% positive and 50% negative, they found
      that a "linear" perceptron attained 90.7%, a "non-linear" perceptron
      attained 92%, and backprop an average of over 96% accuracy on the 
      remaining 150 test instances, consisting of 123 "good" and 

In [3]:
data = pd.read_csv("datasets/ionosphere_data.txt", header=None)
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,-0.17755,0.59755,-0.44945,0.60536,-0.38223,0.84356,-0.38542,0.58212,-0.32192,0.56971,-0.29674,0.36946,-0.47357,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,-0.67743,0.34432,-0.69707,-0.51685,-0.97515,0.05499,-0.62237,0.33109,-1.0,-0.13151,-0.453,-0.18056,-0.35734,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,0.05346,0.85443,0.00827,0.54591,0.00299,0.83775,-0.13644,0.75535,-0.0854,0.70887,-0.27502,0.43385,-0.12062,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       351 non-null    int64  
 1   1       351 non-null    int64  
 2   2       351 non-null    float64
 3   3       351 non-null    float64
 4   4       351 non-null    float64
 5   5       351 non-null    float64
 6   6       351 non-null    float64
 7   7       351 non-null    float64
 8   8       351 non-null    float64
 9   9       351 non-null    float64
 10  10      351 non-null    float64
 11  11      351 non-null    float64
 12  12      351 non-null    float64
 13  13      351 non-null    float64
 14  14      351 non-null    float64
 15  15      351 non-null    float64
 16  16      351 non-null    float64
 17  17      351 non-null    float64
 18  18      351 non-null    float64
 19  19      351 non-null    float64
 20  20      351 non-null    float64
 21  21      351 non-null    float64
 22  22

In [5]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,0.891738,0.0,0.641342,0.044372,0.601068,0.115889,0.550095,0.11936,0.511848,0.181345,0.476183,0.15504,0.400801,0.093414,0.344159,0.071132,0.381949,-0.003617,0.35939,-0.024025,0.336695,0.008296,0.362475,-0.057406,0.396135,-0.071187,0.541641,-0.069538,0.378445,-0.027907,0.352514,-0.003794,0.349364,0.01448
std,0.311155,0.0,0.497708,0.441435,0.519862,0.46081,0.492654,0.52075,0.507066,0.483851,0.563496,0.494817,0.622186,0.494873,0.652828,0.458371,0.61802,0.496762,0.626267,0.519076,0.609828,0.518166,0.603767,0.527456,0.578451,0.508495,0.516205,0.550025,0.575886,0.507974,0.571483,0.513574,0.522663,0.468337
min,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,1.0,0.0,0.472135,-0.064735,0.41266,-0.024795,0.21131,-0.05484,0.08711,-0.048075,0.02112,-0.065265,0.0,-0.073725,0.0,-0.081705,0.0,-0.22569,0.0,-0.23467,0.0,-0.24387,0.0,-0.366885,0.0,-0.33239,0.286435,-0.443165,0.0,-0.236885,0.0,-0.242595,0.0,-0.16535
50%,1.0,0.0,0.87111,0.01631,0.8092,0.0228,0.72873,0.01471,0.68421,0.01829,0.66798,0.02825,0.64407,0.03027,0.60194,0.0,0.59091,0.0,0.57619,0.0,0.49909,0.0,0.53176,0.0,0.55389,-0.01505,0.70824,-0.01769,0.49664,0.0,0.44277,0.0,0.40956,0.0
75%,1.0,0.0,1.0,0.194185,1.0,0.334655,0.96924,0.445675,0.95324,0.534195,0.957895,0.482375,0.955505,0.37486,0.91933,0.308975,0.935705,0.195285,0.899265,0.13437,0.894865,0.18876,0.911235,0.16463,0.90524,0.156765,0.999945,0.153535,0.883465,0.154075,0.85762,0.20012,0.813765,0.17166
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Наш таргет
    "good" or "bad"
    g - good
    b - bad

In [6]:
data[34].unique()

array(['g', 'b'], dtype=object)

In [7]:
df_base = data.copy()
df_base.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,-0.17755,0.59755,-0.44945,0.60536,-0.38223,0.84356,-0.38542,0.58212,-0.32192,0.56971,-0.29674,0.36946,-0.47357,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,-0.67743,0.34432,-0.69707,-0.51685,-0.97515,0.05499,-0.62237,0.33109,-1.0,-0.13151,-0.453,-0.18056,-0.35734,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,0.05346,0.85443,0.00827,0.54591,0.00299,0.83775,-0.13644,0.75535,-0.0854,0.70887,-0.27502,0.43385,-0.12062,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g


In [8]:
rename_feture = RenameKey({34: 'target'})

In [9]:
df_base = rename_feture.fit_transform(df_base)

In [10]:
target_bin = BinEncoder(key='target', neg_class='b')
target_bin.fit_transform(df_base).head(3)

Unnamed: 0,target
0,1
1,0
2,1


У нас есть 34 признака и 1 целевая переменная (бинарная) - нужно определить хороший сигнал или нет

In [11]:
print(df_base.shape)

(351, 35)


Всего 351 сигнал

Посмотрим на соотношение классов

In [12]:
df_base.iloc[:, -1].value_counts()

1    225
0    126
Name: target, dtype: int64

In [13]:
continuos_features = Pipeline([
                ('selector', ColumnSelector(key=[_ for _ in range(34)]))
            ])

feats = FeatureUnion([('continuos_features', continuos_features)])

feature_processing = Pipeline([('feats', feats)])

pipeline = Pipeline([
    ('features', feats),
    ('classifier', xgb.XGBClassifier(random_state = 21)),
])

In [14]:
X_data = df_base.iloc[:,:-1]
y_data = df_base.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=21)

In [15]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('continuos_features',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=[0,
                                                                                      1,
                                                                                      2,
                                                                                      3,
                                                                                      4,
                                                                                      5,
                                                                                      6,
                                                                                      7,
                                                                                      8,
                     

In [16]:
y_predict = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)
print(evaluate_results(y_test, y_predict))

Classification results:
F1_Score: 	93.023%
Roc_AUC: 	91.751%
Precision: 	95.238%
Recall: 	90.909%


In [17]:
thresholds, fscore, roc, precision, recall = evaluate_results(y_test, y_predict, fstr=False)
xgbc_score = {"XGBClassifier":pd.Series([thresholds,
                                         fscore,
                                         precision,
                                         recall,
                                         roc,],
                                        index=['Threshold',
                                               'F-Score',
                                               'ROC_AUC',
                                               'Precision',
                                               'Recall'
                                              ])}
df_score = pd.DataFrame(xgbc_score)

In [18]:
df_score

Unnamed: 0,XGBClassifier
Threshold,
F-Score,0.930233
ROC_AUC,0.952381
Precision,0.909091
Recall,0.917508


In [19]:
print(get_metrics(y_test, y_pred_proba[:, 1]))

Best_Threshold:	0.148,
F1_Score:	94.382%,
Roc_AUC:	98.401%,
Precision:	93.333%,
Recall: 	95.455%


In [20]:
thresholds, fscore, roc, precision, recall = get_metrics(y_test, y_pred_proba[:, 1], fstr=False)
xgbc_score_prob = {"XGBClassifier_probs":pd.Series([thresholds,
                                                    fscore,
                                                    roc,
                                                    precision,
                                                    recall,
                                                   ],
                                                   index=['Threshold',
                                                          'F-Score',
                                                          'ROC_AUC',
                                                          'Precision',
                                                          'Recall'
                                                         ])}
df_score = df_score.join(pd.DataFrame(xgbc_score_prob))
df_score

Unnamed: 0,XGBClassifier,XGBClassifier_probs
Threshold,,0.148428
F-Score,0.930233,0.94382
ROC_AUC,0.952381,0.984007
Precision,0.909091,0.933333
Recall,0.917508,0.954545


Разбиваем выборку на тренировочную и тестовую части и обучаем модель (в примере - градиентный бустинг)

### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [21]:
mod_data = df_base.copy()
# получаем индексы положительных сигналов
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
# перемешиваем их
np.random.shuffle(pos_ind)

# оставляем только 25% отмеченными положительным классом
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
pos_sample = pos_ind[:pos_sample_len]

print(f'Используем {pos_sample_len}/{len(pos_ind)} как положительные и остальные как немаркированных')

Используем 57/225 как положительные и остальные как немаркированных


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [22]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    294
 1     57
Name: class_test, dtype: int64


* Мы получили 57 положительных(1) сигналов в созданой новой целевой колонке 'class_test' остальные сигналы отмечены как немаркированные(-1)

* колонка target остается в датасете для сравнения

In [23]:
mod_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,target,class_test
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,-0.17755,0.59755,-0.44945,0.60536,-0.38223,0.84356,-0.38542,0.58212,-0.32192,0.56971,-0.29674,0.36946,-0.47357,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1,-1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,-0.67743,0.34432,-0.69707,-0.51685,-0.97515,0.05499,-0.62237,0.33109,-1.0,-0.13151,-0.453,-0.18056,-0.35734,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0,-1
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,0.05346,0.85443,0.00827,0.54591,0.00299,0.83775,-0.13644,0.75535,-0.0854,0.70887,-0.27502,0.43385,-0.12062,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1,-1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.14516,0.54094,-0.3933,-1.0,-0.54467,-0.69975,1.0,0.0,0.0,1.0,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0,-1
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,-0.20275,0.56409,-0.00712,0.34395,-0.27457,0.5294,-0.2178,0.45107,-0.17813,0.05982,-0.35575,0.02309,-0.52879,0.03286,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1,-1
5,1,0,0.02337,-0.00592,-0.09924,-0.11949,-0.00763,-0.11824,0.14706,0.06637,0.03786,-0.06302,0.0,0.0,-0.04572,-0.1554,-0.00343,-0.10196,-0.11575,-0.05414,0.01838,0.03669,0.01519,0.00888,0.03513,-0.01535,-0.0324,0.09223,-0.07859,0.00732,0.0,0.0,-0.00039,0.12011,0,-1
6,1,0,0.97588,-0.10602,0.94601,-0.208,0.92806,-0.2835,0.85996,-0.27342,0.79766,-0.47929,0.78225,-0.50764,0.74628,-0.61436,0.57945,-0.68086,0.37852,-0.73641,0.36324,-0.76562,0.31898,-0.79753,0.22792,-0.81634,0.13659,-0.8251,0.04606,-0.82395,-0.04262,-0.81318,-0.13832,-0.80975,1,-1
7,0,0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,-1
8,1,0,0.96355,-0.07198,1.0,-0.14333,1.0,-0.21313,1.0,-0.36174,0.9257,-0.43569,0.9451,-0.40668,0.90392,-0.46381,0.98305,-0.35257,0.84537,-0.6602,0.75346,-0.60589,0.69637,-0.64225,0.85106,-0.6544,0.57577,-0.69712,0.25435,-0.63919,0.45114,-0.72779,0.38895,-0.7342,1,1
9,1,0,-0.01864,-0.08459,0.0,0.0,0.0,0.0,0.1147,-0.2681,-0.45663,-0.38172,0.0,0.0,-0.33656,0.38602,-0.37133,0.15018,0.63728,0.22115,0.0,0.0,0.0,0.0,-0.14803,-0.01326,0.20645,-0.02294,0.0,0.0,0.16595,0.24086,-0.08208,0.38065,0,-1


в слайсах датафрейма :

    [: -2] - это исходная метка класса для положительных и отрицательных данных,
    [: -1] - это новый класс для положительных и немаркированных данных. 

In [24]:
x_data = mod_data.iloc[:,:-2].values # признаки датасета
y_labeled = mod_data.iloc[:,-1].values # Новый таргет класс(только P & U)
y_positive = mod_data.iloc[:,-2].values # изначальный таргет класс
n_uclass = len(mod_data[mod_data['class_test']==-1]) # колличество немаркированых сигналов(строк)
n_pclass = len(mod_data[mod_data['class_test']==1]) # колличество положительных сигналов(строк)
n_uclass, n_pclass

(294, 57)

### 1. random negative sampling

In [25]:
# перемешивает датасет
mod_data = mod_data.sample(frac=1)
# к отрицательному классу относим срез по немаркированому датасету
# в количестве равном количеству положительных сигналов(строк)
neg_sample = mod_data[mod_data['class_test']==-1][:n_pclass]
# выделяем остальной немаркированный датасет для теста 
sample_test = mod_data[mod_data['class_test']==-1][n_pclass:]
X_sample_test = sample_test.iloc[:,:-2]
y_sample_test = sample_test.iloc[:,-2]
# положительный датасет
pos_sample = mod_data[mod_data['class_test']==1]
# соединяем положительный и отрицательный датасеты с перемешиванием
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
X_sample_train = sample_train.iloc[:,:-2]
y_sample_train= sample_train.iloc[:,-2]

print(neg_sample.shape, pos_sample.shape)

(57, 36) (57, 36)


In [26]:
pipeline_pu = Pipeline([
    ('features', feats),
    ('classifier', xgb.XGBClassifier(random_state = 21)),
])

In [27]:
pipeline_pu.fit(X_sample_train, y_sample_train)

y_predict_pu = pipeline_pu.predict(X_sample_test)
y_pred_proba_pu = pipeline_pu.predict_proba(X_sample_test)


In [28]:
print(evaluate_results(y_sample_test, y_predict_pu))

Classification results:
F1_Score: 	89.610%
Roc_AUC: 	83.824%
Precision: 	81.657%
Recall: 	99.281%


In [29]:
thresholds, fscore, roc, precision, recall = evaluate_results(y_sample_test, y_predict_pu, fstr=False)
xgbc_score_pu = {"XGBClassifier_rand_neg_sampl":pd.Series([thresholds,
                                                    fscore,
                                                    roc,
                                                    precision,
                                                    recall,
                                                   ],
                                                   index=['Threshold',
                                                          'F-Score',
                                                          'ROC_AUC',
                                                          'Precision',
                                                          'Recall'
                                                         ])}
df_score = df_score.join(pd.DataFrame(xgbc_score_pu))
df_score

Unnamed: 0,XGBClassifier,XGBClassifier_probs,XGBClassifier_rand_neg_sampl
Threshold,,0.148428,
F-Score,0.930233,0.94382,0.896104
ROC_AUC,0.952381,0.984007,0.83824
Precision,0.909091,0.933333,0.816568
Recall,0.917508,0.954545,0.992806


In [30]:
print(get_metrics(y_sample_test, y_pred_proba_pu[:, 1]))

Best_Threshold:	0.855,
F1_Score:	91.289%,
Roc_AUC:	93.804%,
Precision:	88.514%,
Recall: 	94.245%


In [31]:
thresholds, roc, fscore, precision, recall = get_metrics(y_sample_test, y_pred_proba_pu[:, 1], fstr=False)
xgbc_score_prob = {"XGBClassifier_probs_rand_neg_sampl":pd.Series([thresholds,
                                                    fscore,
                                                    roc,
                                                    precision,
                                                    recall,
                                                   ],
                                                   index=['Threshold',
                                                          'F-Score',
                                                          'ROC_AUC',
                                                          'Precision',
                                                          'Recall'
                                                         ])}
df_score = df_score.join(pd.DataFrame(xgbc_score_prob))
df_score

Unnamed: 0,XGBClassifier,XGBClassifier_probs,XGBClassifier_rand_neg_sampl,XGBClassifier_probs_rand_neg_sampl
Threshold,,0.148428,,0.855133
F-Score,0.930233,0.94382,0.896104,0.938041
ROC_AUC,0.952381,0.984007,0.83824,0.912892
Precision,0.909091,0.933333,0.816568,0.885135
Recall,0.917508,0.954545,0.992806,0.942446


<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Ваш ответ здесь:

### random negative sampling  - для быстрого расчета
### 2-step approach - для более точного