<a href="https://colab.research.google.com/github/OlegV12/ML_in_business/blob/Lesson_6/HW6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Домашнее задание
1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [456]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score


In [457]:
def evaluate_results(y_test, y_predict):
    f1 = f1_score(y_test, y_predict)
    roc = roc_auc_score(y_test, y_predict)
    rec = recall_score(y_test, y_predict, average='binary')
    prc = precision_score(y_test, y_predict, average='binary')
    
    return f1, roc, rec, prc

In [458]:
results = {
    'method': [],
    'f1': [],
    'roc': [],
    'rec': [],
    'prc': [],
}

In [459]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [460]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['target'], axis=1), df['target'], random_state=42)

In [461]:
tree = DecisionTreeClassifier(max_depth=3, random_state=42)

In [462]:
tree.fit(X_train, y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [463]:
preds = tree.predict(X_test)

In [464]:
f1, roc, rec, prc = evaluate_results(y_test, preds)

In [465]:
results['method'].append('Decision Tree')
results['f1'].append(f1)
results['roc'].append(roc)
results['rec'].append(rec)
results['prc'].append(prc)


# Positive Unlabeled

In [466]:
pu_df = df.copy()

In [467]:
# label 50% of 1st class as positive
pos_samples = pu_df.loc[df['target'] == 1].sample(frac=0.5, random_state=5)
pu_df['label'] = -1
pu_df.loc[pos_samples.index, 'label'] = 1

In [468]:
# get random negative sampling (150 samples marked as negative)

neg_sample = pu_df[pu_df['label']==-1].sample(150, random_state=5)
sample_train = pd.concat([neg_sample, pos_samples]).sample(frac=1)
print(neg_sample.shape, pos_samples.shape, sample_test.shape)


(150, 32) (178, 31) (50, 32)


In [469]:
neg_inx = pu_df.index.isin([neg_sample.index])

In [470]:
sample_df = pu_df.copy()
sample_df.drop(neg_sample.index, inplace=True)
sample_df.drop(pos_samples.index, inplace=True)
sample_test = sample_df[sample_df['label']==-1].sample(50, random_state=5)

In [471]:
# fit/predict Decision Tree
model = DecisionTreeClassifier(max_depth=3, random_state=42)

model.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-2])
y_predict = model.predict(sample_test.iloc[:,:-2])
f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2], y_predict)

In [472]:
results['method'].append('P_U Decision Tree')
results['f1'].append(f1)
results['roc'].append(roc)
results['rec'].append(rec)
results['prc'].append(prc)

In [473]:
pd.DataFrame(results)

Unnamed: 0,method,f1,roc,rec,prc
0,Decision Tree,0.966667,0.951727,0.977528,0.956044
1,P_U Decision Tree,0.857143,0.87931,1.0,0.75


Получили некоторое улучшение по recall, по сравнению с обычным решением.

In [474]:
pu_df = df.copy()


In [475]:
for i in np.arange(0.1, 1, 0.1):
      
    pos_samples = pu_df.loc[df['target'] == 1].sample(frac=i, random_state=5)
    pu_df['label'] = -1
    pu_df.loc[pos_samples.index, 'label'] = 1
    neg_sample = pu_df[pu_df['label']==-1].sample(150, random_state=5)
    sample_df = pu_df.copy()
    sample_df.drop(neg_sample.index, inplace=True)
    sample_df.drop(pos_samples.index, inplace=True)
    sample_test = sample_df[sample_df['label']==-1].sample(50, random_state=5)
    sample_train = pd.concat([neg_sample, pos_samples]).sample(frac=1)
    model = DecisionTreeClassifier(max_depth=3, random_state=42)

    model.fit(sample_train.iloc[:,:-2], 
              sample_train.iloc[:,-2])
    y_predict = model.predict(sample_test.iloc[:,:-2])
    f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2], y_predict)
    results['method'].append(f'{np.round(i, 1)}% P_U Decision Tree')
    results['f1'].append(f1)
    results['roc'].append(roc)
    results['rec'].append(rec)
    results['prc'].append(prc)

In [476]:
pd.DataFrame(results)

Unnamed: 0,method,f1,roc,rec,prc
0,Decision Tree,0.966667,0.951727,0.977528,0.956044
1,P_U Decision Tree,0.857143,0.87931,1.0,0.75
2,0.1% P_U Decision Tree,0.955224,0.939338,0.941176,0.969697
3,0.2% P_U Decision Tree,0.877193,0.86,1.0,0.78125
4,0.3% P_U Decision Tree,0.909091,0.897436,0.961538,0.862069
5,0.4% P_U Decision Tree,0.863636,0.891667,0.95,0.791667
6,0.5% P_U Decision Tree,0.857143,0.87931,1.0,0.75
7,0.6% P_U Decision Tree,0.842105,0.911765,1.0,0.727273
8,0.7% P_U Decision Tree,0.914286,0.940285,0.941176,0.888889
9,0.8% P_U Decision Tree,0.774194,0.907895,1.0,0.631579


Наилучшие показатели получены при доле P 10% и 70%

Думаю датасет не совсем подходит для решения через Random Negative Sampling из-за болшого количества наблюдений первого класса. 