1.Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2.Обучить любой классификатор (какой вам нравится)
3.Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
4.Применить random negative sampling для построения классификатора в новых условиях
5.Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)
6.*Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pylab as plt
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, precision_recall_curve
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

import warnings
warnings.filterwarnings('ignore')

In [22]:
df = pd.read_csv("Churn_Modelling.csv")
df.head(35)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [24]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

Разделим на тест\трейн

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Exited', axis = 1), df['Exited'], test_size=0.2, random_state=15)

Напишем пайплайн для обработки

In [26]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key, drop_first=True).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key, drop_first=True)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

Разделим категориальные и числовые признаки

In [27]:
continuous_columns = X_train.select_dtypes(include='number').columns.to_list()
categorical_columns = X_train.select_dtypes(exclude='number').columns.to_list()

In [28]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [29]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

Будем использовать xgboost

In [30]:
model = xgb.XGBClassifier(random_state=15)

Соберем конечный пайплайн

In [31]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', model)
])

Обучим нашу изначальную модель

In [32]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Surname',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Surname')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Surname'))])),
                                                ('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
        

In [33]:
y_predict = pipeline.predict(X_test)

Результаты соберем в одну таблицу

In [34]:
results = {'model' : [], 'f1' : [], 'recall' : [], 'precision' : [] }

In [35]:
results['model'].append('commonXGB')
results['f1'].append(f1_score(y_test, y_predict))
results['recall'].append(recall_score(y_test, y_predict, average='binary'))
results['precision'].append(precision_score(y_test, y_predict, average='binary'))

Теперь будем использовать RNS, для сравнения сделаем несколько итераций и каждый раз будем брать разное количество размеченных данных

In [36]:
samples = np.linspace(0.1, 1, 10)

In [37]:
for i in samples:
    mod_data = X_train.copy()
    mod_data['label'] = y_train
    mod_data = mod_data.reset_index(drop=True)


    pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]

    # shuffle them
    np.random.shuffle(pos_ind)
    
    perc = i
    pos_sample_len = int(np.ceil(perc * len(pos_ind)))

    
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
   
    mod_data = mod_data.sample(frac=1)


    data_N = mod_data[mod_data['class_test'] == -1]
    data_P = mod_data[mod_data['class_test'] == 1]

    neg_sample = data_N[:data_P.shape[0]]
    sample_test = data_N[data_P.shape[0]:]
    pos_sample = data_P.copy()

    
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

    sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0
    X_sample_train = sample_train.drop(columns=['class_test', 'label'])
    y_sample_train = sample_train['class_test'] 

    pipeline.fit(X_sample_train, y_sample_train)

    y_predict = pipeline.predict(X_test)

    results['model'].append(f'commonXGB+RNS_{i:.1f}sample')
    results['f1'].append(f1_score(y_test, y_predict))
    results['recall'].append(recall_score(y_test, y_predict, average='binary'))
    results['precision'].append(precision_score(y_test, y_predict, average='binary'))

Итоговые метрики

In [38]:
pd.DataFrame(results)

Unnamed: 0,model,f1,recall,precision
0,commonXGB,0.545185,0.455446,0.678967
1,commonXGB+RNS_0.1sample,0.462489,0.648515,0.359396
2,commonXGB+RNS_0.2sample,0.490251,0.653465,0.392273
3,commonXGB+RNS_0.3sample,0.504314,0.65099,0.411581
4,commonXGB+RNS_0.4sample,0.535104,0.69802,0.433846
5,commonXGB+RNS_0.5sample,0.520776,0.69802,0.415317
6,commonXGB+RNS_0.6sample,0.526417,0.678218,0.430141
7,commonXGB+RNS_0.7sample,0.55706,0.712871,0.457143
8,commonXGB+RNS_0.8sample,0.565476,0.705446,0.471854
9,commonXGB+RNS_0.9sample,0.54333,0.690594,0.447833
