## Задача look-alike

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [2]:
import numpy as np
import pandas as pd

Использовал данный датасет - https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

Цель классификации - предсказать, будет ли клиент подписываться (да/нет) на срочный депозит (переменная y).

In [3]:
df = pd.read_csv('bank-full.csv', delimiter=';')
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Сделать feature engineering

In [4]:
df.shape

(45211, 17)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
# Изменение целевой переменной
df['y'] = df['y'].map({'no': 0, 'yes': 1})

In [7]:
df['y'].unique()

array([0, 1], dtype=int64)

In [8]:
df.select_dtypes(include=['object']).head(10)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,management,married,tertiary,no,yes,no,unknown,may,unknown
1,technician,single,secondary,no,yes,no,unknown,may,unknown
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown
4,unknown,single,unknown,no,no,no,unknown,may,unknown
5,management,married,tertiary,no,yes,no,unknown,may,unknown
6,management,single,tertiary,no,yes,yes,unknown,may,unknown
7,entrepreneur,divorced,tertiary,yes,yes,no,unknown,may,unknown
8,retired,married,primary,no,yes,no,unknown,may,unknown
9,technician,single,secondary,no,yes,no,unknown,may,unknown


In [9]:
df["month"].value_counts()

may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: month, dtype: int64

In [10]:
cleanup_nums = {"month": {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, 
                          "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12
                         }
               }

In [11]:
df = df.replace(cleanup_nums)

In [12]:
df[['marital']] = df[['marital']].replace({'married': 1, 'single': 0, 'divorced': 0})
df[['education']] = df[['education']].replace({'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 1})
df[['default']] = df[['default']].replace({'yes': 1, 'no': 0})
df[['housing']] = df[['housing']].replace({'yes': 1, 'no': 0})
df[['loan']] = df[['loan']].replace({'yes': 1, 'no': 0})
df[['contact']] = df[['contact']].replace({'cellular': 2, 'telephone': 1, 'unknown': 0})
df[['poutcome']] = df[['poutcome']].replace({'failure': 2, 'success': 1, 'other': 0, 'unknown': 0})

In [13]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,1,3,0,2143,1,0,0,5,5,261,1,-1,0,0,0
1,44,technician,0,2,0,29,1,0,0,5,5,151,1,-1,0,0,0
2,33,entrepreneur,1,2,0,2,1,1,0,5,5,76,1,-1,0,0,0
3,47,blue-collar,1,1,0,1506,1,0,0,5,5,92,1,-1,0,0,0
4,33,unknown,0,1,0,1,0,0,0,5,5,198,1,-1,0,0,0


In [14]:
df["job"] = df["job"].astype('category')

In [15]:
df.dtypes

age             int64
job          category
marital         int64
education       int64
default         int64
balance         int64
housing         int64
loan            int64
contact         int64
day             int64
month           int64
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome        int64
y               int64
dtype: object

In [16]:
df["job"] = df["job"].cat.codes

In [17]:
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,3,0,2143,1,0,0,5,5,261,1,-1,0,0,0
1,44,9,0,2,0,29,1,0,0,5,5,151,1,-1,0,0,0
2,33,2,1,2,0,2,1,1,0,5,5,76,1,-1,0,0,0
3,47,1,1,1,0,1506,1,0,0,5,5,92,1,-1,0,0,0
4,33,11,0,1,0,1,0,0,0,5,5,198,1,-1,0,0,0


In [18]:
df["job"] = df["job"].astype('int64')

#### Обучить любой классификатор (какой вам нравится)

In [19]:
from sklearn.model_selection import train_test_split

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [20]:
import xgboost as xgb

In [21]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)



In [22]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    
    f1 = f1_score(y_test, y_predict)
    roc = roc_auc_score(y_test, y_predict)
    precision = precision_score(y_test, y_predict, average='binary')
    recall = recall_score(y_test, y_predict, average='binary')
    return f1, precision, recall, roc

In [23]:
evaluate_results(y_test, y_predict)

(0.5291181364392679,
 0.6060991105463787,
 0.46948818897637795,
 0.7154342651621644)

#### Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [24]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 10% of the positives marked
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

#Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())


x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

Using 529/5289 as positives and unlabeling the rest
target variable:
 -1    44682
 1      529
Name: class_test, dtype: int64


In [25]:
x_data = mod_data.drop(['y', 'class_test'], axis=1).values  # just the X
y_labeled = mod_data.loc[:, 'class_test'].values  # new class (just the P & U)
y_positive = mod_data.loc[:, 'y'].values  # original class

#### Применить random negative sampling для построения классификатора в новых условиях

In [26]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(529, 18) (529, 18)


In [27]:
model_rns = xgb.XGBClassifier()
model.fit(sample_train.drop(['y', 'class_test'], axis=1).values,
          sample_train.loc[:, 'y'].values)
y_predict_rns = model.predict(sample_test.drop(['y', 'class_test'], axis=1).values)

evaluate_results(sample_test.loc[:, 'y'].values, y_predict_rns)



(0.4820926966292135, 0.332687181972377, 0.8750796685787126, 0.8328143868422133)

In [29]:
pd.DataFrame([
    evaluate_results(y_test, y_predict),
    evaluate_results(sample_test.loc[:, 'y'].values, y_predict_rns)
], columns=['f1', 'precision', 'recall', 'roc'], index=['Simple xgboost', 'random negative sampling']).round(2)

Unnamed: 0,f1,precision,recall,roc
Simple xgboost,0.53,0.61,0.47,0.72
random negative sampling,0.48,0.33,0.88,0.83


<b>Бонусный вопрос:</b>

Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?

Я думаю, что метод __random negative sampling__  на практике используют больше.