### Урок 6. #Задача lookalike (Positive Unlabeled Learning)#

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

Описание данных - https://archive.ics.uci.edu/ml/datasets/Wilt

In [190]:
import pandas as pd
import numpy as np

data_train = pd.read_csv("./training.csv")
data_train.head(5)

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [191]:
data_test = pd.read_csv("./testing.csv")
data_test.head(5)

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,n,109.828571,183.7,82.95,251.75,16.079412
1,n,130.284483,212.637931,96.896552,482.396552,21.210295
2,n,131.386555,185.466667,85.466667,419.666667,13.339998
3,n,141.345098,180.875,81.5,348.0625,18.213577
4,w,121.383408,218.357143,112.017857,426.607143,19.083196


In [192]:
print(data_train.shape)
print(data_test.shape)
print(data_train['class'].value_counts())
print(data_test['class'].value_counts())

(4339, 6)
(500, 6)
n    4265
w      74
Name: class, dtype: int64
n    313
w    187
Name: class, dtype: int64


In [193]:
data = pd.concat([data_train, data_test])
data.index = range(len(data))
print(data.shape)

(4839, 6)


In [194]:
data.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [195]:
data['class'] = data['class'].map({'w': 1, 'n': 0})
data.head(10)

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,1,120.362774,205.5,119.395349,416.581395,20.676318
1,1,124.739583,202.8,115.333333,354.333333,16.707151
2,1,134.691964,199.285714,116.857143,477.857143,22.496712
3,1,127.946309,178.368421,92.368421,278.473684,14.977453
4,1,135.431548,197.0,112.690476,532.952381,17.604193
5,1,118.347962,226.15,138.85,608.9,29.072797
6,1,135.436282,184.5,95.142857,309.190476,13.055264
7,1,121.169643,226.0,146.214286,595.571429,22.808542
8,1,131.127161,232.784314,144.588235,563.843137,11.948563
9,1,134.498092,210.212121,116.909091,594.848485,27.937685


In [196]:
data['class'].value_counts()

0    4578
1     261
Name: class, dtype: int64

In [197]:
data['SD_pan'].describe()

count    4839.000000
mean       24.482007
std        10.726997
min         0.000000
25%        17.679972
50%        23.198396
75%        29.326445
max       156.508431
Name: SD_pan, dtype: float64

In [198]:
from sklearn.model_selection import train_test_split

features = ['GLCM_pan', 'Mean_Green', 'Mean_Red', 'Mean_NIR', 'SD_pan']
target = 'class'

X = pd.DataFrame(data, columns = features)
y = pd.DataFrame(data[target])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [199]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

model.fit(X_train, y_train)
y_predict = model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [200]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 86.79%
roc: 90.85%
recall: 82.14%
precision: 92.00%


In [201]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data['class'].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]
print(mod_data.shape)

Using 66/261 as positives and unlabeling the rest
(4839, 6)


In [178]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    4773
 1      66
Name: class_test, dtype: int64


In [179]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(66, 7) (66, 7)


In [184]:
model = GradientBoostingClassifier()

X_sample_train = pd.DataFrame(sample_train, columns = features)
y_sample_train = pd.DataFrame(sample_train[target])

model.fit(X_sample_train.values, 
          y_sample_train.values)

X_sample_test = pd.DataFrame(sample_test, columns = features)
y_sample_test = pd.DataFrame(sample_test[target])

y_predict = model.predict(X_sample_test.values)

evaluate_results(y_sample_test.values, y_predict)

Classification results:
f1: 47.03%
roc: 93.38%
recall: 95.88%
precision: 31.16%


  y = column_or_1d(y, warn=True)


In [186]:
from prettytable import PrettyTable

data = [['GradtBoostClass',92.00,82.14,90.85,86.79], ['GradtBoostClass_PU',31.16,95.88,93.38,47.03]]
df = pd.DataFrame(data, columns=['name', 'precision,%', 'recall,%', 'roc_auc,%', 'f_score,%'])

def generate_ascii_table(df):
    x = PrettyTable()
    x.field_names = df.columns.tolist()
    for row in df.values:
        x.add_row(row)
    print(x)
    return x

generate_ascii_table(df)

+--------------------+-------------+----------+-----------+-----------+
|        name        | precision,% | recall,% | roc_auc,% | f_score,% |
+--------------------+-------------+----------+-----------+-----------+
|  GradtBoostClass   |     92.0    |  82.14   |   90.85   |   86.79   |
| GradtBoostClass_PU |    31.16    |  95.88   |   93.38   |   47.03   |
+--------------------+-------------+----------+-----------+-----------+


<prettytable.prettytable.PrettyTable at 0x1d6e4f845c8>

In [202]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:, 0].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]
print(mod_data.shape)

Using 131/261 as positives and unlabeling the rest
(4839, 6)


In [203]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    4708
 1     131
Name: class_test, dtype: int64


In [204]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(131, 7) (131, 7)


In [205]:
model = GradientBoostingClassifier()

X_sample_train = pd.DataFrame(sample_train, columns = features)
y_sample_train = pd.DataFrame(sample_train[target])

model.fit(X_sample_train.values, 
          y_sample_train.values)

X_sample_test = pd.DataFrame(sample_test, columns = features)
y_sample_test = pd.DataFrame(sample_test[target])

y_predict = model.predict(X_sample_test.values)

evaluate_results(y_sample_test.values, y_predict)

Classification results:
f1: 43.95%
roc: 93.53%
recall: 93.65%
precision: 28.71%


  y = column_or_1d(y, warn=True)


In [206]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:, 0].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]
print(mod_data.shape)

Using 27/261 as positives and unlabeling the rest
(4839, 6)


In [207]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    4812
 1      27
Name: class_test, dtype: int64


In [208]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(27, 7) (27, 7)


In [209]:
model = GradientBoostingClassifier()

X_sample_train = pd.DataFrame(sample_train, columns = features)
y_sample_train = pd.DataFrame(sample_train[target])

model.fit(X_sample_train.values, 
          y_sample_train.values)

X_sample_test = pd.DataFrame(sample_test, columns = features)
y_sample_test = pd.DataFrame(sample_test[target])

y_predict = model.predict(X_sample_test.values)

evaluate_results(y_sample_test.values, y_predict)

Classification results:
f1: 26.96%
roc: 81.09%
recall: 84.98%
precision: 16.02%


  y = column_or_1d(y, warn=True)
