1. взять любой набор данных для бинарной классификации

In [69]:
import pandas as pd
import numpy as np
models_results = {
    'approach': [],
    'f1' : [], 
    'roc' : [],
    'recall': [],
    'precision': []
}
data = pd.read_csv("biodeg.csv", sep=";", header=None)
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,0,2.949,1.591,0,7.253,0,0,RB
1,4.17,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,0,3.315,1.967,0,7.257,0,0,RB
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,0,1,3.076,2.417,0,7.601,0,0,RB


In [70]:
print(data.shape)

(1055, 42)


In [71]:
data.iloc[:, -1].value_counts()

NRB    699
RB     356
Name: 41, dtype: int64

2. сделать feature engineering

Тут я бы мог сумничать, но не буду: этот датасет уже прекрасен. Просто приведём target в бинарный вид:

In [72]:
data.loc[data.iloc[:, -1] == "RB", 41] = 1
data.loc[data.iloc[:, -1] == "NRB", 41] = 0
data.iloc[:, -1].value_counts()

0    699
1    356
Name: 41, dtype: int64

3. обучить любой классификатор (какой вам нравится)

In [73]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=23)

In [74]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(random_state = 23, verbose = False)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [75]:
y_test = y_test.astype(int)
y_test

568    0
512    0
334    0
207    1
768    0
      ..
38     1
983    0
497    0
154    1
406    0
Name: 41, Length: 211, dtype: int32

In [76]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return f1, roc, rec, prc
    
eval_res = evaluate_results(y_test, y_predict)

models_results['approach'].append('True Label')
models_results['f1'].append(f'{eval_res[0]:.4f}')
models_results['roc'].append(f'{eval_res[1]:.4f}')
models_results['recall'].append(f'{eval_res[2]:.4f}')
models_results['precision'].append(f'{eval_res[3]:.4f}')
models_results

Classification results:
f1: 75.50%
roc: 79.96%
recall: 67.06%
precision: 86.36%


{'approach': ['True Label'],
 'f1': ['0.7550'],
 'roc': ['0.7996'],
 'recall': ['0.6706'],
 'precision': ['0.8636']}

4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [77]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 89/356 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [78]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    966
 1     89
Name: class_test, dtype: int64


In [79]:
mod_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,class_test
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,2.949,1.591,0,7.253,0,0,1,-1
1,4.17,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,3.315,1.967,0,7.257,0,0,1,-1
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,1,3.076,2.417,0,7.601,0,0,1,-1
3,3.0,2.7098,0,0,0,0,0,20.0,0,2,...,0,1,3.046,5.0,0,6.69,0,0,1,-1
4,4.236,3.3944,0,0,0,0,0,29.4,2,4,...,0,0,3.351,2.405,0,8.003,0,0,1,-1
5,4.236,3.4286,0,0,0,0,0,28.6,2,4,...,0,0,3.351,2.556,0,7.904,0,0,1,1
6,5.0,5.0476,1,0,0,0,0,11.1,0,3,...,0,1,4.712,4.583,0,9.303,0,0,1,-1
7,4.525,3.8301,0,0,0,0,0,31.6,3,2,...,0,0,3.379,2.143,0,7.95,0,0,1,-1
8,4.596,3.0777,0,0,0,0,2,44.4,2,0,...,0,0,3.626,1.917,0,7.939,0,0,1,-1
9,5.04,3.6112,0,0,1,0,2,41.2,0,4,...,2,1,3.888,3.5,1,8.706,0,0,1,-1


Вижу, вижу, что кто-то сдался на половине перевода материала с английского =Ъ

Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [80]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

5. применить random negative sampling для построения классификатора в новых условиях

### 1. random negative sampling

In [81]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(89, 43) (89, 43)


6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [82]:
model = CatBoostClassifier(random_state = 23)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values, verbose = False)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
sample_test.iloc[:,-2] = sample_test.iloc[:,-2].astype(int)
eval_res = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

models_results['approach'].append('RNS')
models_results['f1'].append(f'{eval_res[0]:.4f}')
models_results['roc'].append(f'{eval_res[1]:.4f}')
models_results['recall'].append(f'{eval_res[2]:.4f}')
models_results['precision'].append(f'{eval_res[3]:.4f}')
models_results

Classification results:
f1: 66.47%
roc: 79.72%
recall: 92.62%
precision: 51.83%


{'approach': ['True Label', 'RNS'],
 'f1': ['0.7550', '0.6647'],
 'roc': ['0.7996', '0.7972'],
 'recall': ['0.6706', '0.9262'],
 'precision': ['0.8636', '0.5183']}

7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [84]:
# leave 50% of the positives marked
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1

x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

model = CatBoostClassifier(random_state = 23)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values, verbose = False)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
sample_test.iloc[:,-2] = sample_test.iloc[:,-2].astype(int)
eval_res = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

models_results['approach'].append('RNS_50%')
models_results['f1'].append(f'{eval_res[0]:.4f}')
models_results['roc'].append(f'{eval_res[1]:.4f}')
models_results['recall'].append(f'{eval_res[2]:.4f}')
models_results['precision'].append(f'{eval_res[3]:.4f}')
models_results

Using 178/356 as positives and unlabeling the rest
(178, 43) (178, 43)
Classification results:
f1: 59.90%
roc: 81.69%
recall: 89.63%
precision: 44.98%


{'approach': ['True Label', 'RNS', 'RNS_50%'],
 'f1': ['0.7550', '0.6647', '0.5990'],
 'roc': ['0.7996', '0.7972', '0.8169'],
 'recall': ['0.6706', '0.9262', '0.8963'],
 'precision': ['0.8636', '0.5183', '0.4498']}

In [87]:
# leave 10% of the positives marked
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1

x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

model = CatBoostClassifier(random_state = 23)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values, verbose = False)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
sample_test.iloc[:,-2] = sample_test.iloc[:,-2].astype(int)
eval_res = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

models_results['approach'].append('RNS_10%')
models_results['f1'].append(f'{eval_res[0]:.4f}')
models_results['roc'].append(f'{eval_res[1]:.4f}')
models_results['recall'].append(f'{eval_res[2]:.4f}')
models_results['precision'].append(f'{eval_res[3]:.4f}')
models_results

Using 36/356 as positives and unlabeling the rest
(36, 43) (36, 43)
Classification results:
f1: 67.22%
roc: 77.29%
recall: 92.56%
precision: 52.77%


{'approach': ['True Label', 'RNS', 'RNS_50%', 'RNS_10%'],
 'f1': ['0.7550', '0.6647', '0.5990', '0.6722'],
 'roc': ['0.7996', '0.7972', '0.8169', '0.7729'],
 'recall': ['0.6706', '0.9262', '0.8963', '0.9256'],
 'precision': ['0.8636', '0.5183', '0.4498', '0.5277']}

При увеличении P: recall - растёт, precision - падает. Можно было бы поискать оптимальное значение "методом локтя", но мне лень.