Описание данных - https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score


In [2]:
df = pd.read_csv("breast-cancer.csv")
df.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
367,9011495,B,12.21,18.02,78.31,458.4,0.09231,0.07175,0.04392,0.02027,...,14.29,24.04,93.85,624.6,0.1368,0.217,0.2413,0.08829,0.3218,0.0747
502,91505,B,12.54,16.32,81.25,476.3,0.1158,0.1085,0.05928,0.03279,...,13.57,21.4,86.67,552.0,0.158,0.1751,0.1889,0.08411,0.3155,0.07538
235,88249602,B,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,...,15.33,30.28,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617
414,905680,M,15.13,29.81,96.71,719.5,0.0832,0.04605,0.04686,0.02739,...,17.26,36.91,110.1,931.4,0.1148,0.09866,0.1547,0.06575,0.3233,0.06165
85,8612399,M,18.46,18.52,121.1,1075.0,0.09874,0.1053,0.1335,0.08795,...,22.93,27.68,152.2,1603.0,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579


In [3]:
df.drop(columns="id", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
fractal_dimension_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


In [5]:
df.replace({"M": 1, "B": 0}, inplace=True)
df.diagnosis

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [6]:
df.diagnosis.value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [7]:
df.nunique()

diagnosis                    2
radius_mean                456
texture_mean               479
perimeter_mean             522
area_mean                  539
smoothness_mean            474
compactness_mean           537
concavity_mean             537
concave points_mean        542
symmetry_mean              432
fractal_dimension_mean     499
radius_se                  540
texture_se                 519
perimeter_se               533
area_se                    528
smoothness_se              547
compactness_se             541
concavity_se               533
concave points_se          507
symmetry_se                498
fractal_dimension_se       545
radius_worst               457
texture_worst              511
perimeter_worst            514
area_worst                 544
smoothness_worst           411
compactness_worst          529
concavity_worst            539
concave points_worst       492
symmetry_worst             500
fractal_dimension_worst    535
dtype: int64

Все признаки кроме diagnosis являются непрерывными.

In [8]:
x_data = df.drop(columns="diagnosis")
y_data = df["diagnosis"]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [9]:
model = RandomForestClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [10]:
def evaluate_results(y_test, y_predict):
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

f1: 98.73%
roc: 98.75%
recall: 97.50%
precision: 100.00%


### Теперь очередь за PU learning

In [11]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,0].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 53/212 as positives and unlabeling the rest


In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    516
 1     53
Name: class_test, dtype: int64


In [13]:
x_data = mod_data.iloc[:,1:-1].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,0].values # original class

In [14]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(53, 32) (53, 32)


In [15]:
model = RandomForestClassifier()

model.fit(sample_train.iloc[:,1:-1].values, 
          sample_train.iloc[:,0].values)
y_predict = model.predict(sample_test.iloc[:,1:-1].values)
evaluate_results(sample_test.iloc[:,0].values, y_predict)

f1: 90.97%
roc: 94.07%
recall: 93.79%
precision: 88.31%


In [28]:
def get_results():
    neg_samples_num = [0.15, 0.25, 0.5, 0.75]
    print('Classification results:\n')
    for el in neg_samples_num:
        print(f"Random negative sampling: {el*100}%\n")
        mod_data = df.copy()
        #get the indices of the positives samples
        pos_ind = np.where(mod_data.iloc[:,0].values == 1)[0]
        #shuffle them
        np.random.shuffle(pos_ind)

        pos_sample_len = int(np.ceil(el* len(pos_ind)))
        pos_sample = pos_ind[:pos_sample_len]

        mod_data['class_test'] = -1
        mod_data.loc[pos_sample,'class_test'] = 1

        x_data = mod_data.iloc[:,1:-1].values # just the X 
        y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
        y_positive = mod_data.iloc[:,0].values # original class

        mod_data = mod_data.sample(frac=1)
        neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
        sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
        pos_sample = mod_data[mod_data['class_test']==1]

        sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

        model = RandomForestClassifier()

        model.fit(sample_train.iloc[:,1:-1].values, 
                  sample_train.iloc[:,0].values)
        y_predict = model.predict(sample_test.iloc[:,1:-1].values)
        
        evaluate_results(sample_test.iloc[:,0].values, y_predict)
        print("\n")
get_results()

Classification results:

Random negative sampling: 15.0%

f1: 93.29%
roc: 94.99%
recall: 93.57%
precision: 93.02%


Random negative sampling: 25.0%

f1: 89.87%
roc: 94.41%
recall: 97.93%
precision: 83.04%


Random negative sampling: 50.0%

f1: 77.78%
roc: 90.72%
recall: 96.25%
precision: 65.25%


Random negative sampling: 75.0%

f1: 79.01%
roc: 92.47%
recall: 91.43%
precision: 69.57%


