In [22]:
import numpy as np
import pandas as pd
import dill
# import random

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score

# random.seed = 42

In [23]:
def get_metrics(y_test, preds):
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.nanargmax(fscore)
    
    return roc_auc_score(y_test, preds), precision[ix], recall[ix], fscore[ix], thresholds[ix]

### Данные:

https://www.kaggle.com/datasets/uciml/adult-census-income

Бинарная классификация: превысит ли доход 50 000 долларов, или нет.

**Описание датасета**

* **age** - возраст
* **workclass** - тип занятости
* **fnlwgt** - конечный вес (получается путем создания «взвешенных сумм» любых указанных социально-экономических характеристик населения. Люди со схожими демографическими характеристиками должны иметь одинаковый вес)
* **education** - образование, тип
* **education.num** - образование, лет
* **marital.status** - семейный статус
* **occupation** - род занятий
* **relationship** - отношения
* **race** - раса
* **sex** - пол
* **capital.gain** - прирост капитала
* **capital.loss** - убыток капитала
* **hours.per.week** - рабочих часов в неделю
* **native.country** - родина
* **income** - целевая переменная, доход <=50 или >50

In [24]:
path = 'app_data/'

In [25]:
df = pd.read_csv(path + 'adult.csv.zip')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [26]:
# есть пропуски - '?'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Целевая переменная:

In [27]:
df['target'] = df['income'].replace({'<=50K': 0, '>50K': 1})
# и уберём признаки, которые не вошли в ффинальную модель
df.drop(['income', 'sex', 'native.country'], axis=1, inplace=True)

Разбиение на test и train и сохранение на диск

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), 
                                                    df['target'], 
                                                    random_state=42)

X_test.to_csv(path + 'X_test.csv', index=None)
y_test.to_csv(path + 'y_test.csv', index=None)

X_train.to_csv(path + 'X_train.csv', index=None)
y_train.to_csv(path + 'y_train.csv', index=None)

### Pipeline

In [29]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
     
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

    
class PrunOutlier(BaseEstimator, TransformerMixin):
    def __init__(self, key, low=0, high=1, replace='median'):
        """
        варианты для замены: median, rnd_median, min, max
        """
        self.key = key
        self.low = low
        self.high = high
        self.replace = replace
        self.q_low = 0
        self.q_high = 0
        self.median = 0

    def fit(self, X, y=None):
        self.q_low = X[self.key].quantile(self.low)
        self.q_high = X[self.key].quantile(self.high)
        self.median = np.median(X[self.key])
        return self

    def transform(self, X):
        X_new = X[[self.key]].copy()
        if self.replace == 'median':
            X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high), 
                                self.median)
#         elif self.replace == 'rnd_median':
#             random.seed = 42
#             X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high), 
#                        self.median + random.randint(-100000, 100000))
        elif self.replace == 'min':
            X_new = X_new.where((X_new >= self.q_low), self.q_low)
        else:
            X_new = X_new.where((X_new <= self.q_high), self.q_high)                
        return X_new
 
 
class NumericDiff(BaseEstimator, TransformerMixin):
    """
    вычитает столбцы
    """
    def __init__(self, key):
        self.key = key
        self.columns = []
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        Xp = X[self.key].copy()
        self.columns = f'{self.key[0]}-{self.key[1]}'
    
        Xp[self.columns] = Xp.loc[:, self.key[0]].values - Xp.loc[:, self.key[1]].values        
        return Xp[[self.columns]]
    
    
class NumericDiffLog(BaseEstimator, TransformerMixin):
    """
    разность логарифмов
    """
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        Xp = X[self.key].copy()
        col = f'{self.key[0]}-{self.key[1]}_log'
    
        Xp[col] = np.log10((Xp.loc[:, self.key[0]].values + 1) 
                                     / (Xp.loc[:, self.key[1]].values + 1))        
        return Xp[[col]]

    
class NumericBins(BaseEstimator, TransformerMixin):
    """
    бины на основе квантилей
    """
    def __init__(self, key, q):
        self.key = key
        self.q = q
        self.bins = []
        
    def fit(self, X, y=None):
        self.bins = pd.qcut(X[self.key], self.q, duplicates='drop', retbins=True)
        self.bins = self.bins[-1][1:].astype('int')
        return self

    def transform(self, X):
        col = self.key.join('_bins')
        Xb = X[[self.key]].copy()
    
        Xb[col] = pd.cut(Xb[self.key], bins=self.bins, labels=False) + 1
        Xb[col].fillna(0, inplace=True)
        return Xb[[col]]

In [30]:
def to_repl(X, repl_dict):
    return pd.DataFrame(X.replace(repl_dict))

In [31]:
# словари для замены
workclass_dict = {'Self-emp-not-inc': 'Not_inc',
                  'Without-pay': 'Not_inc',
                  'Never-worked': 'Not_inc',
                  'Local-gov': 'Goverment',
                  'State-gov': 'Goverment',
                  'Federal-gov': 'Goverment'
                 }    
    
education_dict = {'Preschool': 'Shool', 
                  '1st-4th': 'Shool', 
                  '5th-6th': 'Shool',
                  '7th-8th': 'Shool',
                  '9th': 'Shool',
                  '10th': 'Shool',
                  '11th': 'Shool',
                  '12th': 'Shool',
                  'Assoc-voc': 'to_Bachelors',
                  'Assoc-acdm': 'to_Bachelors',
                  'Bachelors': 'to_Bachelors',
                  'Masters': 'to_Doctorate',
                  'Prof-school': 'to_Doctorate',
                  'Doctorate': 'to_Doctorate',
                 }

relationship_dict = {'Husband': 'spouse', 
                     'Not-in-family': 'solo', 
                     'Own-child': 'other', 
                     'Unmarried': 'solo', 
                     'Wife': 'spouse', 
                     'Other-relative': 'other'}

marital_bin = {'Married-civ-spouse': 1, 
               'Never-married': 0, 
               'Divorced': 0, 
               'Widowed': 0, 
               'Separated': 0, 
               'Married-spouse-absent': 0, 
               'Married-AF-spouse': 1}

relationship_bin = {'Husband': 1, 
                    'Not-in-family': 0, 
                    'Own-child': 1, 
                    'Unmarried': 0, 
                    'Wife': 1, 
                    'Other-relative': 1} 

In [32]:
all_transformers = []

# просто отбираются + StSc
cont_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
for cont_col in cont_cols:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
#                 ('standard', StandardScaler())
            ])
    all_transformers.append((cont_col, cont_transformer))

# sqrt + StSc: 'age_sqrt', 'fnlwgt_sqrt'  # просто функция
sqrt_cols = ['age', 'fnlwgt']
for sqrt_col in sqrt_cols:
    sqrt_transf = Pipeline([
                ('selector', NumberSelector(key=sqrt_col)),
                ('sqrt_transformer', FunctionTransformer(np.sqrt)), 
#                 ('standard', StandardScaler())
            ])
    all_transformers.append((f'{sqrt_col}_sqrt', sqrt_transf))
    
# для fnlwgt_rnd_med
fnlwgt_rnd_med_tr = Pipeline([
                ('selector', NumberSelector(key='fnlwgt')),
                ('fnlwgt_rnd_med', PrunOutlier(key='fnlwgt', high=0.95, replace='median')), 
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('fnlwgt_rnd_med', fnlwgt_rnd_med_tr))

# 'education.num_med'
ed_num_med_transf = Pipeline([
                ('selector', NumberSelector(key='education.num')),
                ('education.num_med', PrunOutlier(key='education.num', low=0.05, replace='median')), 
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('education.num_med', ed_num_med_transf))

# 'education.num_min'
ed_num_min_transf = Pipeline([
                ('selector', NumberSelector(key='education.num')),
                ('education.num_min', PrunOutlier(key='education.num', low=0.05, replace='min')), 
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('education.num_min', ed_num_min_transf))


# 'capital_gl', 
capital_gl_transf = Pipeline([
                ('sum_transf', NumericDiff(key=['capital.gain', 'capital.loss'])), 
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('capital_gl', capital_gl_transf))

# 'capital_gl_max', 
capital_gl_max_transf = Pipeline([
                ('sum_transf', NumericDiff(key=['capital.gain', 'capital.loss'])),
                ('capital_gl_max', PrunOutlier(key='capital.gain-capital.loss', 
                                                  high=.995, replace='max')), 
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('capital_gl_max', capital_gl_max_transf))

#'capital_gl_log',
capital_gl_log_transf = Pipeline([
                ('sum_transf', NumericDiffLog(key=['capital.gain', 'capital.loss'])),
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('capital_gl_log', capital_gl_log_transf))

# 'hours.per.week_bins'
h_per_week_bins_transf = Pipeline([
                ('selector', NumberSelector(key='hours.per.week')),
                ('sum_transf', NumericBins(key='hours.per.week', q=45)),
#                 ('standard', StandardScaler())
            ])
all_transformers.append(('hours.per.week_bins', h_per_week_bins_transf))

# onehot
onehot_cols = ['workclass', 'marital.status', 'relationship', 'occupation', 'race']
for cat_col in onehot_cols:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    all_transformers.append((cat_col, cat_transformer))

# comb + onehot:
onehot_dicts = [workclass_dict, education_dict, relationship_dict]
onehot_comb_cols = ['workclass', 'education', 'relationship']
for comb_col, comb_dict in zip(onehot_comb_cols, onehot_dicts):
    onehot_comb_transformer = Pipeline([
                ('selector', FeatureSelector(column=comb_col)),
                ('comb_transf', FunctionTransformer(to_repl, kw_args={'repl_dict': comb_dict})),
                ('ohe', OHEEncoder(key=comb_col))
            ])
    all_transformers.append((comb_col.join('_comb'), onehot_comb_transformer))

# 1 столбец - 2 кат:  marital_with_spouse и relationship_with
cols_with = {'marital.status': marital_bin, 'relationship': relationship_bin}
for key, value in cols_with.items():
    with_transf =  Pipeline([
                    ('selector', FeatureSelector(column=key)),
                    ('bin_transf', FunctionTransformer(to_repl, kw_args={'repl_dict': value}))
    ])
    all_transformers.append((f'{key}_with', with_transf))


feats = FeatureUnion(all_transformers)

In [33]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', GradientBoostingClassifier(random_state=42,
                                              learning_rate=0.173,
                                              n_estimators=250,
                                              subsample=0.8,
                                              min_samples_split=5,
                                              max_features='sqrt'))
])

pipeline.fit(X_train, y_train)

  X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high),
  X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high),
  X_new = X_new.where((X_new >= self.q_low), self.q_low)
  X_new = X_new.where((X_new <= self.q_high), self.q_high)


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age'))])),
                                                ('fnlwgt',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='fnlwgt'))])),
                                                ('education.num',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='education.num'))])),
                                                ('capital.gain',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='capita...
             

Сохранение модели

In [34]:
with open('pipeline.dill', 'wb') as f:
    dill.dump(pipeline, f)

### Предсказание

In [35]:
X_test = pd.read_csv(path + 'X_test.csv')
y_test = pd.read_csv(path + 'y_test.csv')

In [36]:
X_test.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,capital.gain,capital.loss,hours.per.week
0,29,Private,280618,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,White,0,0,40
1,19,Private,439779,Some-college,10,Never-married,Sales,Own-child,White,0,0,15
2,28,Private,204734,Some-college,10,Married-civ-spouse,Tech-support,Wife,White,0,0,40


In [37]:
with open('pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [38]:
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age'))])),
                                                ('fnlwgt',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='fnlwgt'))])),
                                                ('education.num',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='education.num'))])),
                                                ('capital.gain',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='capita...
             

In [39]:
predict = pipeline.predict_proba(X_test)[:, 1]

roc_auc, prec, rec, f_score, th = get_metrics(y_test, predict)
print(f'roc_auc: {roc_auc:.4f}, prec: {prec:.4f}, rec: {rec:.4f},'
      f' f-score: {f_score:.4f}, th: {th:.4f}')

roc_auc: 0.9245, prec: 0.6721, rec: 0.7824, f-score: 0.7231, th: 0.3299


  X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high),
  X_new = X_new.where((X_new >= self.q_low) & (X_new <= self.q_high),
  X_new = X_new.where((X_new >= self.q_low), self.q_low)
  X_new = X_new.where((X_new <= self.q_high), self.q_high)
