Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/banknote+authentication#

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

import xgboost as xgb
from catboost import CatBoostClassifier

In [None]:
pip install catboost

In [None]:
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


In [None]:
data = pd.read_csv('/content/citrus.csv')
data.head(5)

Unnamed: 0,name,diameter,weight,red,green,blue
0,orange,2.96,86.76,172,85,2
1,orange,3.91,88.05,166,78,3
2,orange,4.42,95.17,156,81,2
3,orange,4.47,95.6,163,81,4
4,orange,4.48,95.76,161,72,9


Взял достаточно простой датасет, который можно найти тут https://www.kaggle.com/joshmcadams/oranges-vs-grapefruit

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      10000 non-null  int64  
 1   diameter  10000 non-null  float64
 2   weight    10000 non-null  float64
 3   red       10000 non-null  int64  
 4   green     10000 non-null  int64  
 5   blue      10000 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 468.9 KB


In [None]:
data.describe()

Unnamed: 0,diameter,weight,red,green,blue
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,9.975685,175.050792,153.8478,76.0106,11.3632
std,1.947844,29.212119,10.432954,11.708433,9.061275
min,2.96,86.76,115.0,31.0,2.0
25%,8.46,152.22,147.0,68.0,2.0
50%,9.98,174.985,154.0,76.0,10.0
75%,11.48,197.7225,161.0,84.0,17.0
max,16.45,261.51,192.0,116.0,56.0


In [None]:
data.loc[data['name'] == 'orange', 'name'] = 1
data.loc[data['name'] == 'grapefruit', 'name'] = 0

In [None]:
data.name = data.name.astype(int)

In [None]:
x_data = data.drop(['name'], 1)
y_data = data.name

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [None]:
y_data.value_counts()

True     5000
False    5000
Name: name, dtype: int64

## XGBoost

In [None]:
model_xgb = xgb.XGBClassifier()

model_xgb.fit(x_train, y_train)
y_predict_xgb = model_xgb.predict(x_test)

## CatBoost

In [None]:
model_cat = CatBoostClassifier(silent=True)

model_cat.fit(x_train, y_train)
y_predict_cat = model_cat.predict(x_test)

Проверяем качество

In [None]:
def evaluate_results(y_test, y_predict):
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
print('XGB results:')
evaluate_results(y_test, y_predict_xgb)

print('\nCAT results:')
evaluate_results(y_test, y_predict_cat)

XGB results:
f1: 93.10%
roc: 93.10%
recall: 92.73%
precision: 93.47%

CAT results:
f1: 98.34%
roc: 98.35%
recall: 97.41%
precision: 99.29%


In [None]:
data = data[['diameter', 'weight', 'red', 'green', 'blue', 'name']]

### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [None]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.name.values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1250/5000 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [None]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    8750
 1    1250
Name: class_test, dtype: int64


In [None]:
mod_data.head(10)

Unnamed: 0,name,diameter,weight,red,green,blue,class_test
0,1,2.96,86.76,172,85,2,-1
1,1,3.91,88.05,166,78,3,-1
2,1,4.42,95.17,156,81,2,-1
3,1,4.47,95.6,163,81,4,-1
4,1,4.48,95.76,161,72,9,-1
5,1,4.59,95.86,142,100,2,-1
6,1,4.64,97.94,156,85,2,1
7,1,4.65,98.5,142,74,2,-1
8,1,4.68,100.2,159,90,16,1
9,1,4.69,100.31,161,76,6,-1


In [None]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [None]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1250, 7) (1250, 7)


In [None]:
model_xgb_pu = xgb.XGBClassifier()

model_xgb_pu.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict_xgb_pu = model_xgb_pu.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict_xgb_pu)

f1: 90.88%
roc: 92.28%
recall: 96.74%
precision: 85.68%


In [None]:
model_cat_pu = CatBoostClassifier(silent=True)

model_cat_pu.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict_cat_pu = model_cat_pu.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict_cat_pu)

f1: 92.27%
roc: 93.50%
recall: 96.90%
precision: 88.07%


Точность упала, как ни странно, ведь мы дали меньше данных для обучения. Однако, модели все равно показывают довольно хороший результат

### Возьмем другую долю P

In [None]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.name.values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2500/5000 as positives and unlabeling the rest


In [None]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    7500
 1    2500
Name: class_test, dtype: int64


In [None]:
mod_data.head(10)

Unnamed: 0,diameter,weight,red,green,blue,name,class_test
0,2.96,86.76,172,85,2,1,1
1,3.91,88.05,166,78,3,1,1
2,4.42,95.17,156,81,2,1,-1
3,4.47,95.6,163,81,4,1,1
4,4.48,95.76,161,72,9,1,1
5,4.59,95.86,142,100,2,1,-1
6,4.64,97.94,156,85,2,1,-1
7,4.65,98.5,142,74,2,1,1
8,4.68,100.2,159,90,16,1,1
9,4.69,100.31,161,76,6,1,1


In [None]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [None]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(2500, 7) (2500, 7)


In [None]:
model_xgb_pu_2 = xgb.XGBClassifier()

model_xgb_pu_2.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict_xgb_pu_2 = model_xgb_pu_2.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict_xgb_pu_2)

f1: 88.57%
roc: 93.06%
recall: 96.10%
precision: 82.14%


In [None]:
model_cat_pu_2 = CatBoostClassifier(silent=True)

model_cat_pu_2.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict_cat_pu_2 = model_cat_pu_2.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict_cat_pu_2)

f1: 93.84%
roc: 96.31%
recall: 97.59%
precision: 90.38%


## Дав модели больше данных p - видим увеличение метрик качетсва. (На xgb почему-то упал f1).