### Задание

1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. Обучить любой классификатор (какой вам нравится)
3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
4. Применить random negative sampling для построения классификатора в новых условиях
5. Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)
 6*. Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pylab as plt
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, precision_recall_curve
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Student_Behaviour.csv')
df.head(5)

Unnamed: 0,"Have you completed any certification courses, or are you currently enrolled in any?",Gender,Department,Height(CM),Weight(KG),10th Mark,12th Mark,college mark,hobbies,daily studing time,prefer to study in,salary expectation,Do you like your degree?,possibility of choosing their career based on their degree :,social medai & video games spending Time,Travelling Time,Stress Level,Financial Status,Are you doing a part-time job right now?
0,No,Male,BCA,100.0,58.0,79.0,65.0,80.0,Video Games,0 - 30 minute,Morning,40000,No,50%,1.30 - 2 hour,30 - 60 minutes,Bad,Bad,No
1,No,Female,BCA,90.0,40.0,70.0,80.0,70.0,Cinema,30 - 60 minute,Morning,15000,Yes,75%,1 - 1.30 hour,0 - 30 minutes,Bad,Bad,No
2,Yes,Male,BCA,159.0,78.0,69.5,61.0,55.0,Cinema,1 - 2 Hour,Anytime,13000,Yes,50%,More than 2 hour,30 - 60 minutes,Awful,Bad,No
3,Yes,Female,BCA,147.0,20.0,70.0,59.0,58.0,Reading books,1 - 2 Hour,Anytime,1500000,No,50%,1.30 - 2 hour,0 - 30 minutes,Bad,good,No
4,No,Male,BCA,170.0,54.0,40.0,65.0,30.0,Video Games,30 - 60 minute,Morning,50000,Yes,25%,1.30 - 2 hour,30 - 60 minutes,Good,good,No


Заменим значения "Yes" и "No" на бинарные значения 1 и 0 соответственно

In [3]:
for col in ['Have you completed any certification courses, or are you currently enrolled in any?', 'Do you like your degree?', 'Are you doing a part-time job right now?']:
    df[col] = df[col].map({'Yes': 1, 'No': 0})
df.head()

Unnamed: 0,"Have you completed any certification courses, or are you currently enrolled in any?",Gender,Department,Height(CM),Weight(KG),10th Mark,12th Mark,college mark,hobbies,daily studing time,prefer to study in,salary expectation,Do you like your degree?,possibility of choosing their career based on their degree :,social medai & video games spending Time,Travelling Time,Stress Level,Financial Status,Are you doing a part-time job right now?
0,0,Male,BCA,100.0,58.0,79.0,65.0,80.0,Video Games,0 - 30 minute,Morning,40000,0,50%,1.30 - 2 hour,30 - 60 minutes,Bad,Bad,0
1,0,Female,BCA,90.0,40.0,70.0,80.0,70.0,Cinema,30 - 60 minute,Morning,15000,1,75%,1 - 1.30 hour,0 - 30 minutes,Bad,Bad,0
2,1,Male,BCA,159.0,78.0,69.5,61.0,55.0,Cinema,1 - 2 Hour,Anytime,13000,1,50%,More than 2 hour,30 - 60 minutes,Awful,Bad,0
3,1,Female,BCA,147.0,20.0,70.0,59.0,58.0,Reading books,1 - 2 Hour,Anytime,1500000,0,50%,1.30 - 2 hour,0 - 30 minutes,Bad,good,0
4,0,Male,BCA,170.0,54.0,40.0,65.0,30.0,Video Games,30 - 60 minute,Morning,50000,1,25%,1.30 - 2 hour,30 - 60 minutes,Good,good,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 19 columns):
 #   Column                                                                               Non-Null Count  Dtype  
---  ------                                                                               --------------  -----  
 0   Have you completed any certification courses, or are you currently enrolled in any?  235 non-null    int64  
 1   Gender                                                                               235 non-null    object 
 2   Department                                                                           235 non-null    object 
 3   Height(CM)                                                                           235 non-null    float64
 4   Weight(KG)                                                                           235 non-null    float64
 5   10th Mark                                                                            235 non

Разделим на тест и трейн

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Do you like your degree?', axis = 1), df['Do you like your degree?'], test_size=0.2, random_state=7)

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key, drop_first=True).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key, drop_first=True)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [7]:
continuous_columns = X_train.select_dtypes(include='number').columns.to_list()
categorical_columns = X_train.select_dtypes(exclude='number').columns.to_list()

In [8]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [9]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [10]:
model = xgb.XGBClassifier(random_state=7)

In [11]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', model)
])

In [12]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Department',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Department')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Department'))])),
                                                ('hobbies',
                                                 Pipeline(steps=[('selector',
       

In [13]:
y_predict = pipeline.predict(X_test)

In [14]:
results = {'model' : [], 'f1' : [], 'recall' : [], 'precision' : [] }

In [15]:
results['model'].append('commonXGB')
results['f1'].append(f1_score(y_test, y_predict))
results['recall'].append(recall_score(y_test, y_predict, average='binary'))
results['precision'].append(precision_score(y_test, y_predict, average='binary'))

4. Применить random negative sampling для построения классификатора в новых условиях

In [16]:
samples = np.linspace(0.1, 1, 10)

In [17]:
for i in samples:
    mod_data = X_train.copy()
    mod_data['label'] = y_train
    mod_data = mod_data.reset_index(drop=True)


    pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]

    # shuffle them
    np.random.shuffle(pos_ind)
    
    perc = i
    pos_sample_len = int(np.ceil(perc * len(pos_ind)))

    
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
   
    mod_data = mod_data.sample(frac=1)


    data_N = mod_data[mod_data['class_test'] == -1]
    data_P = mod_data[mod_data['class_test'] == 1]

    neg_sample = data_N[:data_P.shape[0]]
    sample_test = data_N[data_P.shape[0]:]
    pos_sample = data_P.copy()

    
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

    sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0
    X_sample_train = sample_train.drop(columns=['class_test', 'label'])
    y_sample_train = sample_train['class_test'] 

    pipeline.fit(X_sample_train, y_sample_train)

    y_predict = pipeline.predict(X_test)

    results['model'].append(f'commonXGB+RNS_{i:.1f}sample')
    results['f1'].append(f1_score(y_test, y_predict))
    results['recall'].append(recall_score(y_test, y_predict, average='binary'))
    results['precision'].append(precision_score(y_test, y_predict, average='binary'))

In [18]:
pd.DataFrame(results)

Unnamed: 0,model,f1,recall,precision
0,commonXGB,0.965517,1.0,0.933333
1,commonXGB+RNS_0.1sample,0.676923,0.52381,0.956522
2,commonXGB+RNS_0.2sample,0.566667,0.404762,0.944444
3,commonXGB+RNS_0.3sample,0.65625,0.5,0.954545
4,commonXGB+RNS_0.4sample,0.724638,0.595238,0.925926
5,commonXGB+RNS_0.5sample,0.625,0.47619,0.909091
6,commonXGB+RNS_0.6sample,0.853333,0.761905,0.969697
7,commonXGB+RNS_0.7sample,0.810811,0.714286,0.9375
8,commonXGB+RNS_0.8sample,0.860759,0.809524,0.918919
9,commonXGB+RNS_0.9sample,0.941176,0.952381,0.930233
