### Урок 6. Задача lookalike (Positive Unlabeled Learning)

https://www.kaggle.com/sajidsaifi/prostate-cancer

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("Prostate_Cancer.csv")

In [2]:
df.head()

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [3]:
from sklearn.preprocessing import LabelEncoder
label_encode = LabelEncoder()
labels = label_encode.fit_transform(df['diagnosis_result'])
df['target'] = labels
df.drop(columns=['id','diagnosis_result'], axis=1, inplace=True)

In [4]:
df.describe()

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension,target
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,16.85,18.23,96.78,702.88,0.10273,0.1267,0.19317,0.06469,0.62
std,4.879094,5.192954,23.676089,319.710895,0.014642,0.061144,0.030785,0.008151,0.487832
min,9.0,11.0,52.0,202.0,0.07,0.038,0.135,0.053,0.0
25%,12.0,14.0,82.5,476.75,0.0935,0.0805,0.172,0.059,0.0
50%,17.0,17.5,94.0,644.0,0.102,0.1185,0.19,0.063,1.0
75%,21.0,22.25,114.25,917.0,0.112,0.157,0.209,0.069,1.0
max,25.0,27.0,172.0,1878.0,0.143,0.345,0.304,0.097,1.0


In [5]:
from sklearn.model_selection import train_test_split

x_data = df.iloc[:,:-1]
y_data = df.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [6]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)



In [7]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 83.33%
roc: 78.79%
recall: 90.91%
precision: 76.92%


### PU learning

In [8]:
mod_df = df.copy()
pos_ind = np.where(mod_df.iloc[:,-1].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(0.20 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 13/62 as positives and unlabeling the rest


In [9]:
mod_df['class_test'] = -1
mod_df.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_df.iloc[:,-1].value_counts())

target variable:
 -1    87
 1    13
Name: class_test, dtype: int64


In [10]:
mod_df.head(10)

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension,target,class_test
0,23,12,151,954,0.143,0.278,0.242,0.079,1,-1
1,9,13,133,1326,0.143,0.079,0.181,0.057,0,-1
2,21,27,130,1203,0.125,0.16,0.207,0.06,1,-1
3,14,16,78,386,0.07,0.284,0.26,0.097,1,1
4,9,19,135,1297,0.141,0.133,0.181,0.059,1,-1
5,25,25,83,477,0.128,0.17,0.209,0.076,0,-1
6,16,26,120,1040,0.095,0.109,0.179,0.057,1,-1
7,15,18,90,578,0.119,0.165,0.22,0.075,1,1
8,19,24,88,520,0.127,0.193,0.235,0.074,1,-1
9,25,11,84,476,0.119,0.24,0.203,0.082,1,-1


In [11]:
x_data = mod_df.iloc[:,:-2].values
y_labeled = mod_df.iloc[:,-1].values 
y_positive = mod_df.iloc[:,-2].values

In [12]:
mod_df = mod_df.sample(frac=1)
neg_sample = mod_df[mod_df['class_test']==-1][:len(mod_df[mod_df['class_test']==1])]
sample_test = mod_df[mod_df['class_test']==-1][len(mod_df[mod_df['class_test']==1]):]
pos_sample = mod_df[mod_df['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(13, 10) (13, 10)


In [13]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 89.89%
roc: 86.66%
recall: 97.56%
precision: 83.33%
