# Урок 6. Задача look-alike

1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных <a href="https://archive.ics.uci.edu/ml/datasets.php"> отсюда</a>)
2. Обучить любой классификатор (какой вам нравится)
3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
4. Применить random negative sampling для построения классификатора в новых условиях
5. Сравнить качество с решением из пункта 2 (построить отчет - таблицу метрик)
6. *Поэкспериментировать с долей P на шаге 3 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [None]:
# !./bash/hw5.sh

In [1]:
import numpy as np
import pandas as pd
import itertools

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [2]:
from lib.transformers import *
from lib.reports import PrecisionReport
from lib.pipelines import LookLikePipeline

In [3]:
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
# Load the churn dataset and preview five randomly picked rows
# (no seed is passed to sample, so the preview differs between runs).
churn_csv = "./data/churn_data.csv"
df = pd.read_csv(churn_csv)
df.sample(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1514,1515,15788676,Riley,539,Spain,Male,38,8,71460.67,2,1,1,10074.05,0
1930,1931,15627262,Soto,536,Germany,Male,23,6,92366.72,2,1,0,120661.71,0
2302,2303,15736656,H?,723,France,Female,49,4,0.0,2,0,1,89972.25,0
6483,6484,15621195,Ch'eng,619,Germany,Male,41,3,147974.16,2,1,0,170518.83,0
8930,8931,15665759,Russell,724,France,Female,69,5,117866.92,1,1,1,62280.91,0


In [5]:
# Class balance of the binary target column 'Exited'.
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [6]:
# Hold-out split: every column except 'Exited' is a candidate feature,
# 'Exited' is the target; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Exited'),
    df['Exited'],
    random_state=42,
)

In [7]:
# Column groups handed to LookLikePipeline.
# NOTE(review): the key spelling "continuos_cols" is kept exactly as written;
# presumably the pipeline looks the group up under this name - confirm in lib.pipelines.
params = dict(
    cat_cols=['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember'],
    continuos_cols=['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary'],
)

In [8]:
# Build the project's LookLikePipeline wrapper from the column groups,
# fit it on the training split and score the hold-out split.
# NOTE(review): the recorded run of this cell ends with
# "AttributeError: 'NoneType' object has no attribute 'predict_proba'",
# and the following cell shows rf_pipeline.model is None - it looks like
# the wrapper never assigns its model; verify in lib.pipelines.
rf_pipeline = LookLikePipeline(params=params)
rf_pipeline.fit(X_train, y_train)
rf_pipeline.predict(X_test, y_test)

AttributeError: 'NoneType' object has no attribute 'predict_proba'

In [9]:
# Debug check: inspect what the wrapper holds in `.model`
# (the recorded output is NoneType).
type(rf_pipeline.model)

NoneType

In [10]:
# Features that get one-hot encoded.
categorical_columns = [
    'Geography',
    'Gender',
    'Tenure',
    'HasCrCard',
    'IsActiveMember',
]
# Features that are passed through as numbers.
continuous_columns = [
    'CreditScore',
    'Age',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

In [11]:
# One (name, sub-pipeline) pair per feature column, categorical columns first.
# Categorical: select the column, then one-hot encode it.
categorical_branches = [
    (
        column_name,
        Pipeline([
            ('selector', FeatureSelector(column=column_name)),
            ('ohe', OHEEncoder(key=column_name)),
        ]),
    )
    for column_name in categorical_columns
]

# Continuous: only select the column.
continuous_branches = [
    (
        column_name,
        Pipeline([
            ('selector', NumberSelector(key=column_name)),
        ]),
    )
    for column_name in continuous_columns
]

final_transformers = categorical_branches + continuous_branches

In [12]:
# Combine the per-column branches into a single feature-building step.
feats = FeatureUnion(transformer_list=final_transformers)

In [13]:
# Baseline: random forest trained on the fully labelled training split.
baseline_steps = [
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
]
model = Pipeline(steps=baseline_steps)
model.fit(X_train, y_train)

# Column 1 of predict_proba holds the scores for class 1.
preds = model.predict_proba(X_test)[:, 1]

# Metrics report for the hold-out scores (displayed as the cell result).
report_obj = PrecisionReport(y_test, preds)
report = report_obj.report()
report

{'model': None,
 'thresh': 0.36,
 'F-Score': 0.6171548117154813,
 'Precision': 0.6427015250544662,
 'Recall': 0.5935613682092555,
 'ROC AUC': 0.8542005904623949}

In [15]:
# Working copy of the training features with the true label attached as 'Exited'.
mod_data = X_train.assign(Exited=y_train)

In [16]:
# share of the true positives that stays labelled
perc = 0.25

# shuffled index labels of all true positives in the training data
pos_ind = mod_data.loc[mod_data['Exited'] == 1].sample(frac=1, random_state=42).index

# number of positives kept, rounded up
pos_sample_len = int(np.ceil(len(pos_ind) * perc))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')

# the first pos_sample_len shuffled positives form the labelled set P
pos_sample = pos_ind[:pos_sample_len]

Using 385/1540 as positives and unlabeling the rest


In [17]:
# PU target: -1 marks an unlabelled row, 1 marks a row from the labelled positives.
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1

# 'class_test' was just added as the last column, so address it by name.
pu_counts = mod_data['class_test'].value_counts()
print('target variable:\n', pu_counts)

target variable:
 -1    7115
 1     385
Name: class_test, dtype: int64


In [18]:
# Shuffle first, so that taking the head of the unlabelled rows is a random draw.
mod_data = mod_data.sample(frac=1, random_state=42)

data_P = mod_data[mod_data['class_test'] == 1]
data_N = mod_data[mod_data['class_test'] == -1]

# Random negative sampling: as many unlabelled rows as there are labelled positives.
pos_sample = data_P.copy()
neg_sample = data_N.head(len(data_P))

print(neg_sample.shape, pos_sample.shape)

# Balanced training sample, shuffled once more.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(385, 15) (385, 15)


In [21]:
from copy import deepcopy

# Unlabelled rows drawn as pseudo-negatives get label 0, so the target is binary {0, 1}.
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

# Use an independent copy of the feature union. A Pipeline fits its steps
# in place, so passing `feats` itself would refit the very transformer
# objects that the already-fitted baseline `model` pipeline holds and
# silently change that model's feature step.
pipeline = Pipeline([
    ('features', deepcopy(feats)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train on the PU target; both label columns are dropped from the inputs.
pipeline.fit(sample_train.drop(columns=['class_test','Exited']),
             sample_train['class_test'])

# Column 1 of predict_proba holds the scores for class 1.
preds2 = pipeline.predict_proba(X_test)[:, 1]

In [22]:
# Score the PU-trained pipeline against the true hold-out labels and
# display the resulting metrics report.
report_obj2 = PrecisionReport(y_test, preds2)
report2 = report_obj2.report()
report2

{'model': None,
 'thresh': 0.56,
 'F-Score': 0.5560619872379216,
 'Precision': 0.5083333333333333,
 'Recall': 0.613682092555332,
 'ROC AUC': 0.8056978917940997}

In [34]:
# Side-by-side metrics: fully labelled baseline vs. random-negative-sampling model.
for metric, baseline_value in report.items():
    print(metric, baseline_value, report2[metric])

model None None
thresh 0.36 0.56
F-Score 0.6171548117154813 0.5560619872379216
Precision 0.6427015250544662 0.5083333333333333
Recall 0.5935613682092555 0.613682092555332
ROC AUC 0.8542005904623949 0.8056978917940997
