In [2]:
import numpy as np
import pandas as pd

import catboost as ctb

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [3]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Обучить и оценить модель.
    """
    model = ctb.CatBoostClassifier(cat_features=cat_feats)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='binary')
    rec = recall_score(y_test, y_pred, average='binary')
    
    return {'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]}

In [4]:
# Домашнее задание.
# 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных 
# с https://archive.ics.uci.edu/ml/datasets.php)

In [5]:
df = pd.read_csv("aug_train.csv")
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [6]:
df.shape

(19158, 14)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [8]:
# Ненужный признак
df = df.drop(columns=['enrollee_id'])

In [9]:
# Конвертируем таргет в int
df['target'] = df['target'].astype(int)

In [10]:
# Смотрим баланс таргета
df['target'].value_counts()

0    14381
1     4777
Name: target, dtype: int64

In [11]:
# 2. Сделать feature engineering.

In [12]:
# Просто заменяем все на самое частое значение (моду), т.к. все признаки с пропущенными значениями категориальные.

In [13]:
for col in df.select_dtypes('object').columns:
    df[col] = df[col].fillna(df[col].value_counts().index[0])

In [14]:
# Делим данные на трейн и тест.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)

In [15]:
# 3. Обучить любой классификатор (какой вам нравится).

In [16]:
cat_feats = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level',
             'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

In [17]:
metrics = pd.DataFrame(evaluate_model(ctb.CatBoostClassifier(cat_features=cat_feats),
                                      X_train,
                                      y_train,
                                      X_test,
                                      y_test))

In [18]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.499113,0.666604,0.571042,0.443277


In [19]:
# 4. Далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные
# (класс 1) примеры, а только лишь часть.

In [20]:
def create_unlabeled(df, pos_frac=0.2):
    """
    Сэмплирует долю pos_frac наблюдений класса 1 как positive, остальные как unlabeled.
    """
    sdf = df.copy()
    pos_mask = (df['target'] == 1)
    pos_ind = df[pos_mask].sample(frac=pos_frac).index
    unlab_ind = df[~df.index.isin(pos_ind)].index
    
    # Помечаем данные признаком is_labeled - Positive = 1, Unlabeled = 0
    df.loc[pos_ind, 'is_labeled'] = 1
    df.loc[unlab_ind, 'is_labeled'] = 0
    df['is_labeled'] = df['is_labeled'].astype(int)
    return df

In [21]:
# Возьмем 20% наблюдений положительного класса как positive, остальные возьмем как unlabeled.
rns_df = create_unlabeled(df, pos_frac=0.2)
rns_df.head(3)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,is_labeled
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1,36,1,1
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0,0
2,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,50-99,Pvt Ltd,never,83,0,0


In [22]:
# 5. Применить random negative sampling для построения классификатора в новых условиях.

In [23]:
def get_rns_samples(rns_df):
    """
    Создает тренировочную и тестовую выборки для RNS на основе признака is_labeled.
    """
    rns_df = rns_df.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]
    
    return train_samples, test_samples

In [24]:
train_samples, test_samples = get_rns_samples(rns_df)

In [25]:
metrics_task5 = evaluate_model(ctb.CatBoostClassifier(cat_features=cat_feats),
                             train_samples.iloc[:, :-2],
                             train_samples['is_labeled'],
                             test_samples.iloc[:, :-2],
                             test_samples['target'])

In [26]:
metrics = metrics.append(pd.DataFrame(metrics_task5))

In [27]:
# 6. Сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик).

In [28]:
metrics.index = ['normal', 'RNS']

In [29]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
normal,0.499113,0.666604,0.571042,0.443277
RNS,0.518877,0.722968,0.395975,0.752407


In [30]:
# RNS справился даже немного лучше, чем обычная модель. Интересно то, что повысился recall и понизилась precision -
# модели стало сложнее различать между классами, поэтому она начала относить больше наблюдений к положительному классу.

In [31]:
# 7. Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P).

In [32]:
rns_metrics = pd.DataFrame(columns=['f1', 'roc-auc', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.9, 9)
for frac in fracs:
    train_samples, test_samples = get_rns_samples(create_unlabeled(df, pos_frac=frac))
    frac_metrics = evaluate_model(ctb.CatBoostClassifier(cat_features=cat_feats),
                                 train_samples.iloc[:, :-2],
                                 train_samples['is_labeled'],
                                 test_samples.iloc[:, :-2],
                                 test_samples['target'])
    rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))

In [33]:
rns_metrics.index = fracs

In [34]:
rns_metrics

Unnamed: 0,f1,roc-auc,precision,recall
0.1,0.546467,0.721714,0.443473,0.711772
0.2,0.533992,0.733324,0.41614,0.744968
0.3,0.501519,0.727131,0.383213,0.725497
0.4,0.489155,0.737998,0.370266,0.720502
0.5,0.460386,0.750974,0.331536,0.753062
0.6,0.396987,0.736703,0.272946,0.727681
0.7,0.349065,0.746277,0.228755,0.736323
0.8,0.269012,0.754565,0.163112,0.766949
0.9,0.164005,0.751142,0.09191,0.760684
