Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/Audit+Data#

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report,\
precision_recall_curve, confusion_matrix

%matplotlib inline

In [2]:
data = pd.read_csv("audit_risk.csv")
data.head(3)

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,...,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,...,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,...,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,...,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0


У нас есть 1 категориальный признак, который я хочу удалить

In [3]:
data = data.drop(['LOCATION_ID'], axis=1)

In [4]:
print(data.shape)

(776, 26)


Посмотрим на соотношение классов

In [5]:
data.iloc[:, -1].value_counts()

0    471
1    305
Name: Risk, dtype: int64

Разбиваем выборку на тренировочную и тестовую части и обучаем модель (в примере - градиентный бустинг)

In [6]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [7]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)





Проверяем качество

In [8]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 100.00%
roc: 100.00%
recall: 100.00%
precision: 100.00%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [27]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 77/305 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [28]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    699
 1     77
Name: class_test, dtype: int64


* We now have just 77 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

In [23]:
mod_data.head(10)

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,...,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk,class_test
0,3.89,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,0.2,...,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1,-1
1,3.89,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,0.2,...,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0,-1
2,3.89,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,...,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0,-1
3,3.89,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,0.6,...,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1,-1
4,3.89,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,0.2,...,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0,-1
5,3.89,0.0,0.2,0.0,0.83,0.2,0.166,0.83,5.0,0.2,...,0,0.2,0.0,2.0,2.156,0.4,0.5,0.4312,0,-1
6,3.89,1.1,0.4,0.44,7.41,0.4,2.964,8.51,5.0,0.2,...,0,0.2,0.0,3.2,31.774,0.4,0.5,6.3548,1,-1
7,3.89,8.5,0.6,5.1,12.03,0.6,7.218,20.53,5.5,0.4,...,0,0.2,0.0,4.2,18.034,0.4,0.5,3.6068,1,-1
8,3.89,8.4,0.6,5.04,11.05,0.6,6.63,19.45,5.5,0.4,...,0,0.2,0.0,4.2,17.206,0.4,0.5,3.4412,1,-1
9,3.89,3.98,0.6,2.388,0.99,0.2,0.198,4.97,5.0,0.2,...,0,0.2,0.0,2.4,4.372,0.4,0.5,0.8744,0,-1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [29]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [30]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(77, 27) (77, 27)


In [31]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 99.75%
roc: 99.88%
recall: 100.00%
precision: 99.50%




### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

Думаю, что взял не самый удачный датасет для экспериментов.