<a href="https://colab.research.google.com/github/SovetovAleksey/ML_in_business/blob/6_quest/6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. взять любой набор данных для бинарной классификации (можно скачать один с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 6 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [None]:
!pip install catboost

In [3]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('adult.data')
data.head(3)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K


In [5]:
data.rename(columns={data.columns[-1]: 'target'}, inplace=True)
data['target'] = data['target'].replace({' <=50K': 0, ' >50K': 1})
data.head(3)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0


In [6]:
for col in data.columns:
    print(f'{col}: {np.sort(data[col].unique())}\n')

39: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
 90]

 State-gov: [' ?' ' Federal-gov' ' Local-gov' ' Never-worked' ' Private'
 ' Self-emp-inc' ' Self-emp-not-inc' ' State-gov' ' Without-pay']

 77516: [  12285   13769   14878 ... 1366120 1455435 1484705]

 Bachelors: [' 10th' ' 11th' ' 12th' ' 1st-4th' ' 5th-6th' ' 7th-8th' ' 9th'
 ' Assoc-acdm' ' Assoc-voc' ' Bachelors' ' Doctorate' ' HS-grad'
 ' Masters' ' Preschool' ' Prof-school' ' Some-college']

 13: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]

 Never-married: [' Divorced' ' Married-AF-spouse' ' Married-civ-spouse'
 ' Married-spouse-absent' ' Never-married' ' Separated' ' Widowed']

 Adm-clerical: [' ?' ' Adm-clerical' ' Armed-Forces' ' Craft-repair' ' Exec-managerial'
 ' Farming-fishing' ' Handlers-cleaners' ' Machine-op-inspct'
 ' Other-service' ' P

In [7]:
data['target'].value_counts()

0    24719
1     7841
Name: target, dtype: int64

In [8]:
X = data.drop('target', 1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

In [9]:
num_features = ['39', ' 77516', ' 13', ' 2174', ' 0', ' 40']
cat_features = [' State-gov', ' Bachelors', ' Never-married', ' Adm-clerical', ' Not-in-family', ' White', ' United-States', ' Male']

In [10]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [11]:
final_transformers = list()

for cat_feature in cat_features:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_feature)),
                ('ohe', OHEEncoder(key=cat_feature))
            ])
    final_transformers.append((cat_feature, cat_transformer))
    
for nun_feature in num_features:
    num_transformer = Pipeline([
                ('selector', NumberSelector(key=nun_feature)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((nun_feature, num_transformer))

In [12]:
feats = FeatureUnion(final_transformers)

pipeline = Pipeline([
        ('features',feats),
        ('classifier', CatBoostClassifier(depth=3, iterations=450, learning_rate=0.11, random_state=1, verbose=False))
        ])

In [13]:
metrics = pd.DataFrame(columns=['Proportion', 'Best Threshold', 'F-Score', 'Precision', 'Recall'])

pipeline.fit(X_train, y_train)
    
preds = pipeline.predict_proba(X_test)[:, 1]
    
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])
    
print('Best Threshold=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
metrics = metrics.append({'Proportion': 'no RNS',
                          'Best Threshold': thresholds[ix],
                          'F-Score': fscore[ix],
                          'Precision': precision[ix],
                          'Recall': recall[ix]},
                         ignore_index=True)

Best Threshold=0.400, F-Score=0.733, Precision=0.726, Recall=0.739


In [14]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)

props = np.linspace(0.1, 0.9, 5)
for prop in props:
    pos_sample_len = int(np.ceil(prop * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]

    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1

    x_data = mod_data.iloc[:,:-2].values # just the X 
    y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
    y_positive = mod_data.iloc[:,-2].values # original class

    mod_data = mod_data.sample(frac=1)
    neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
    sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
    pos_sample = mod_data[mod_data['class_test']==1]

    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

    X_train = sample_train.iloc[:,:-2]
    y_train = sample_train.iloc[:,-2]

    X_test = sample_test.iloc[:,:-2]
    y_test = sample_test.iloc[:,-2]

    pipeline.fit(X_train, y_train)

    preds = pipeline.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])

    print('Proportion= %f, Best Threshold=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (prop,
                                                                                              thresholds[ix],
                                                                                              fscore[ix],
                                                                                              precision[ix],
                                                                                              recall[ix]))
    metrics = metrics.append({'Proportion': prop,
                              'Best Threshold': thresholds[ix],
                              'F-Score': fscore[ix],
                              'Precision': precision[ix],
                              'Recall': recall[ix]},
                             ignore_index=True)

Proportion= 0.100000, Best Threshold=0.745, F-Score=0.674, Precision=0.598, Recall=0.771
Proportion= 0.300000, Best Threshold=0.772, F-Score=0.667, Precision=0.630, Recall=0.709
Proportion= 0.500000, Best Threshold=0.796, F-Score=0.623, Precision=0.587, Recall=0.664
Proportion= 0.700000, Best Threshold=0.805, F-Score=0.561, Precision=0.519, Recall=0.611
Proportion= 0.900000, Best Threshold=0.922, F-Score=0.452, Precision=0.579, Recall=0.371


In [15]:
metrics

Unnamed: 0,Proportion,Best Threshold,F-Score,Precision,Recall
0,no RNS,0.399902,0.732673,0.726096,0.739371
1,0.1,0.745438,0.673526,0.597813,0.7712
2,0.3,0.771823,0.667286,0.630461,0.708679
3,0.5,0.796485,0.623359,0.587087,0.664408
4,0.7,0.804723,0.561369,0.519457,0.610638
5,0.9,0.921898,0.451754,0.578652,0.370504
