# Шаг 3: Построение моделей BaseLine

In [334]:
folder = '1. BaseLine'

## Импорт библиотек, константы и загрузка данных

In [335]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option("display.max_columns", 50)
pd.options.display.max_rows = 20

import numpy as np

import seaborn as sns
sns.set_style("darkgrid")  
sns.set_context("talk", font_scale=0.6)

import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams.update(
    {"lines.linewidth": 1, "patch.facecolor": "#ebe3df", "axes.facecolor": "#ebe3df"})

from tqdm import tqdm
from tqdm import tqdm_notebook


import association_metrics as am

In [336]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [337]:
# Базовые модели
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Выбор параметров
from sklearn.model_selection import cross_val_score, StratifiedKFold

import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

from sklearn.metrics import roc_auc_score
from mlxtend.evaluate import bias_variance_decomp

In [338]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import category_encoders as ce

In [339]:
RANDOM_STATE = 42

## Загрузка данных

In [340]:
initial_X_y = pd.read_pickle('../data/initial_X_y.pkl')
X = initial_X_y['X'].copy()
y = initial_X_y['y'].copy()

In [341]:
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,No,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,No,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,No,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,No,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,No,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [342]:
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

## Разделение на тестовую и обучающую выборки

Выделим категориальные и числовые признаки

In [343]:
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
display(X[numeric_cols].head(3))

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1.0,29.85,29.85
1,34.0,56.95,1889.5
2,2.0,53.85,108.15


In [344]:
categoric_cols = list(X.select_dtypes(exclude=[np.number]).columns)
display(X[categoric_cols].head(3))

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,No,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
2,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check


Разделим исходный датасет на обучающую и тестовую выборки

In [345]:
X, y = shuffle(X, y, random_state=RANDOM_STATE)

In [346]:
y = y.map({'Yes':1, 'No':0})

display(y.value_counts())

0    5174
1    1869
Name: Churn, dtype: int64

In [347]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    stratify=y, 
                                                    random_state=RANDOM_STATE)

print(f"Размер обучающей выборки: {X_train.shape}")
print(f"Размер тестовой выборки:  {X_test.shape}")

Размер обучающей выборки: (4930, 19)
Размер тестовой выборки:  (2113, 19)


## Pipeline configurations

#### Сформируем Pipeline для линейной модели класса Логистическая регрессия

Для исключения линейных зависимостей исключим следующие признаки: `TotalCharges`, `PhoneService` по результатам EDA

In [348]:
def get_LogReg_pipe():
    
    '''
    Pipeline модели LogisticRegression
    '''
    
    categoric_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
    numeric_cols = ['tenure', 'MonthlyCharges']


    # Трансформер числовых признаков
    numeric_pipe = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler())])

    # Трансформер категориальных признаков
    categoric_pipe = Pipeline([
            ('OHE', OneHotEncoder(drop='first', sparse=False))])

    transformers = [("numeric", numeric_pipe, numeric_cols), 
                    ("categoric", categoric_pipe, categoric_cols)]

    preprocessor = ColumnTransformer(transformers=transformers)


    log_reg_pipe = Pipeline([
            ('Preprocessor', preprocessor),
            ('Model', LogisticRegression())])
    
    return log_reg_pipe

In [349]:
get_LogReg_pipe()

#### Сформируем Pipeline для нелинейных моделей класса RandomForest и lightgbm

In [350]:
def get_RF_pipe():
    
    '''
    Pipeline модели RandomForestClassifier
    '''
    
    categoric_cols = ['gender',
     'SeniorCitizen',
     'Partner',
     'Dependents',
     'PhoneService',
     'MultipleLines',
     'InternetService',
     'OnlineSecurity',
     'OnlineBackup',
     'DeviceProtection',
     'TechSupport',
     'StreamingTV',
     'StreamingMovies',
     'Contract',
     'PaperlessBilling',
     'PaymentMethod']

    numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']


    # Трансформер числовых признаков
    numeric_pipe = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler())])

    # Трансформер категориальных признаков
    categoric_pipe = Pipeline([
            ('WOE', ce.WOEEncoder())])

    transformers = [("numeric", numeric_pipe, numeric_cols), 
                    ("categoric", categoric_pipe, categoric_cols)]

    preprocessor = ColumnTransformer(transformers=transformers)


    RF_pipe = Pipeline([
            ('Preprocessor', preprocessor),
            ('Model', RandomForestClassifier())])
    
    return RF_pipe

In [351]:
get_RF_pipe()

In [352]:
def get_LGBM_pipe():
    
    '''
    Pipeline модели LGBMClassifier
    '''
    
    categoric_cols = ['gender',
     'SeniorCitizen',
     'Partner',
     'Dependents',
     'PhoneService',
     'MultipleLines',
     'InternetService',
     'OnlineSecurity',
     'OnlineBackup',
     'DeviceProtection',
     'TechSupport',
     'StreamingTV',
     'StreamingMovies',
     'Contract',
     'PaperlessBilling',
     'PaymentMethod']

    numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']


    # Трансформер числовых признаков
    numeric_pipe = Pipeline([
            ('imputer', SimpleImputer()),
            ('scaler', StandardScaler())])

    # Трансформер категориальных признаков
    categoric_pipe = Pipeline([
            ('WOE', ce.WOEEncoder())])

    transformers = [("numeric", numeric_pipe, numeric_cols), 
                    ("categoric", categoric_pipe, categoric_cols)]

    preprocessor = ColumnTransformer(transformers=transformers)


    LGBM_pipe = Pipeline([
            ('Preprocessor', preprocessor),
            ('Model', LGBMClassifier())])
    
    return LGBM_pipe

In [353]:
get_LGBM_pipe()

## Поиск наилучших праметров моделей

На данном шаге будем осуществлять поиск наилучших параметров моделей, при этом:
- Поиск будет осуществляться с помощью библиотеки Optuna;
- Поиск будет осуществляться только на обучающей (train) выборке. Финальное качество будем проверять на тестовой выборке, которая в подборе параметров и при обучении не участвовала.

In [354]:
# Создаем словари, в которые будем записывать результаты поиска оптимальных мараметров

optuna_model_results = dict()   # словарь наилучших параметров
model_scores = dict()           # словарь оценок качества на обучающей и тестовой выборках

Подготовим функцию для проведения оценки качества на тестовой выборке

In [355]:
def test_Model(description: str, model, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series):
    
    
    print('\n')
    print(f'***{description}***')
    
 
    print('Step 1: Обучение')
    results = pd.DataFrame()
    
    model.fit(X_train, y_train)
    
    print('Step 2: Предсказание на train')
    pred = model.predict_proba(X_train)[:,1]
    
    train_score = roc_auc_score(y_train, pred)
    
    print('Step 3: Предсказание на test')
    pred = model.predict_proba(X_test)[:,1]
    
    test_score = roc_auc_score(y_test, pred)
    
    print('Step 4: bias_variance_decomp')
    
#     prepr = model['Preprocessor']
#     X_train_local = prepr.fit_transform(X_train, y_train)
#     X_test_local = prepr.transform(X_test)

#     avg_mse, avg_bias, avg_var = bias_variance_decomp(model['Model'], 
#                                                       X_train_local, y_train.values,
#                                                       X_test_local, y_test.values, 
#                                                       loss = '0-1_loss',
#                                                       random_seed=np.random.seed(RANDOM_STATE),)
    
    results['train_score'] = [train_score]
    results['test_score']  = [test_score]
#     results['avg_bias']    = [avg_bias]
#     results['avg_var']     = [avg_var]
    
    results.index = [description]
    
    print('***Done***')
    return results

#### Поиск параметров для модели LogisticRegression и оценка качества

Посмотрим на выход этапа *Preprocessor*

In [372]:
pd.DataFrame(get_LogReg_pipe()['Preprocessor'].fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,0.719571,-0.825738,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.744836,0.321939,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.663480,0.983678,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.109401,-0.479113,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.029582,0.348475,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4925,1.573809,-1.295092,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4926,-1.029582,-1.470892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4927,-0.948226,-0.308288,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4928,0.109401,-1.462600,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Осуществим поиск наилучших параметров

In [356]:
def objective_LogReg(trial):
    
    
    
    parameters_grid = {
                   'Model__penalty':      trial.suggest_categorical('Model__penalty', ['l2']),
                   'Model__solver':       trial.suggest_categorical('Model__solver', ['lbfgs', 'liblinear']),
                   'Model__C':            trial.suggest_float('Model__C', low=0.1, high=30, step=0.1),
                   'Model__class_weight': trial.suggest_categorical('Model__class_weight', ['balanced', None])      
                    }
    
    list_scores = []
    
    # Препроцессинг
    preprocessor = get_LogReg_pipe()['Preprocessor']
    preprocessor.fit(X_train)
    
    X_train_local = preprocessor.transform(X_train)
    
    
    # Обучение и предсказание
    for rand in [22, 34, 55, 888, 123]:
        
        X_train_temp, X_valid, y_train_temp, y_valid = train_test_split(X_train_local, y_train, 
                                                    test_size=0.3, 
                                                    stratify=y_train, 
                                                    random_state=rand)
        
        model = get_LogReg_pipe().set_params(**parameters_grid)['Model']
        model.fit(X_train_temp, y_train_temp)
        
        pred = model.predict_proba(X_valid)[:,1]

        score = roc_auc_score(y_valid, pred)
        list_scores.append(score)
    
    score = np.mean(list_scores)

    return score

In [357]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_LogReg, 
               n_trials=100, 
               n_jobs=-1, 
               show_progress_bar=True, )

optuna_model_results['LogisticRegression'] = study.best_params
display(study.best_params)

  0%|          | 0/100 [00:00<?, ?it/s]

{'Model__penalty': 'l2',
 'Model__solver': 'liblinear',
 'Model__C': 0.2,
 'Model__class_weight': None}

Теперь оценим качество построенной модели на обучающей и тестовой выборках для:
- модели с базовыми параметрами;
- модели с найденными параметрами

In [358]:
params = {
    'description': 'LogisticRegression()',
    'model': get_LogReg_pipe(),
    'X_train': X_train,
    'y_train': y_train,
    
    'X_test': X_test,
    'y_test': y_test
}

model_scores['LogReg'] = test_Model(**params)
model_scores['LogReg']



***LogisticRegression()***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


Unnamed: 0,train_score,test_score
LogisticRegression(),0.849798,0.836186


In [359]:
params = {
    'description': 'LogisticRegression(opt)',
    'model': get_LogReg_pipe().set_params(**optuna_model_results['LogisticRegression']),
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

model_scores['LogReg_opt'] = test_Model(**params)



***LogisticRegression(opt)***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


#### Поиск параметров для модели RandomForestClassifier и оценка качества

In [360]:
def objective_RandomForest(trial):
    
    parameters_grid = {
        'Model__criterion':    trial.suggest_categorical   ('Model__criterion', ["gini", "entropy", "log_loss"]),
        'Model__n_estimators': trial.suggest_int           ('Model__n_estimators', low=10, high=300, step=1),
        'Model__class_weight': trial.suggest_categorical   ('Model__class_weight', ["balanced_subsample", 'balanced']),
        'Model__max_samples':  trial.suggest_categorical   ('Model__max_samples', [None, 1.0, 0.5]),
        'Model__max_features': trial.suggest_categorical   ('Model__max_features', ["sqrt", "log2", None]),
        'Model__max_depth':    trial.suggest_categorical   ('Model__max_depth', [10, None]),
    }
    
    
    list_scores = []
    
    # Препроцессинг
    preprocessor = get_RF_pipe()['Preprocessor']
    preprocessor.fit(X_train, y_train)
    
    X_train_local = preprocessor.transform(X_train)
    
    
    # Обучение и предсказание
    for rand in [22, 34, 55, 888, 123]:
        
        X_train_temp, X_valid, y_train_temp, y_valid = train_test_split(X_train_local, y_train, 
                                                    test_size=0.3, 
                                                    stratify=y_train, 
                                                    random_state=rand)
        
        model = get_RF_pipe().set_params(**parameters_grid)['Model']
        model.fit(X_train_temp, y_train_temp)
        
        pred = model.predict_proba(X_valid)[:,1]

        score = roc_auc_score(y_valid, pred)
        list_scores.append(score)
    
    score = np.mean(list_scores)

    return score

In [361]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_RandomForest, n_trials=20, n_jobs=-1, show_progress_bar=True)

optuna_model_results['RandomForestClassifier'] = study.best_params
display(study.best_params)

  0%|          | 0/20 [00:00<?, ?it/s]

{'Model__criterion': 'log_loss',
 'Model__n_estimators': 155,
 'Model__class_weight': 'balanced_subsample',
 'Model__max_samples': None,
 'Model__max_features': 'sqrt',
 'Model__max_depth': 10}

Теперь оценим качество построенной модели на обучающей и тестовой выборках для:
- модели с базовыми параметрами;
- модели с найденными параметрами

In [362]:
params = {
    'description': 'RandomForestClassifier()',
    'model': get_RF_pipe(),
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

model_scores['RF'] = test_Model(**params)



***RandomForestClassifier()***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


In [363]:
params = {
    'description': 'RandomForestClassifier(opt)',
    'model': get_RF_pipe().set_params(**optuna_model_results['RandomForestClassifier']),
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

model_scores['RF_opt'] = test_Model(**params)



***RandomForestClassifier(opt)***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


#### Поиск параметров для модели LGBMClassifier и оценка качества

In [364]:
def objective_Lgbm(trial):
    
    parameters_grid = {
        'Model__max_depth':     trial.suggest_int("Model__max_depth", 2, 40),
        'Model__learning_rate': trial.suggest_float("Model__learning_rate", 1e-5, 1, log=True),
        'Model__n_estimators':  trial.suggest_int("Model__n_estimators", 10, 1000),
        'Model__class_weight':  trial.suggest_categorical   ('Model__class_weight', ['balanced', None]),
        'Model__reg_lambda':    trial.suggest_float('Model__reg_lambda', 0, 10),
    }
    
#     model = get_LGBM_pipe()
#     cv = StratifiedKFold(n_splits=10)
#     try: score = cross_val_score(model.set_params(**parameters_grid), 
#                             X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1).mean()
#     except: 
#         score=0;

    
    list_scores = []
    
    # Препроцессинг
    preprocessor = get_LGBM_pipe()['Preprocessor']
    preprocessor.fit(X_train, y_train)
    
    X_train_local = preprocessor.transform(X_train)
    
    
    # Обучение и предсказание
    for rand in [22, 34, 55, 888, 123]:
        
        X_train_temp, X_valid, y_train_temp, y_valid = train_test_split(X_train_local, y_train, 
                                                    test_size=0.3, 
                                                    stratify=y_train, 
                                                    random_state=rand)
        
        model = get_LGBM_pipe().set_params(**parameters_grid)['Model']
        model.fit(X_train_temp, y_train_temp)
        
        pred = model.predict_proba(X_valid)[:,1]

        score = roc_auc_score(y_valid, pred)
        list_scores.append(score)
    
    score = np.mean(list_scores)

    return score

In [365]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_Lgbm, n_trials=30, n_jobs=-1, show_progress_bar=True)


optuna_model_results['LGBMClassifier'] = study.best_params
display(study.best_params)

  0%|          | 0/30 [00:00<?, ?it/s]

{'Model__max_depth': 31,
 'Model__learning_rate': 0.0023696078639296003,
 'Model__n_estimators': 995,
 'Model__class_weight': 'balanced',
 'Model__reg_lambda': 8.507898846076417}

Теперь оценим качество построенной модели на обучающей и тестовой выборках для:
- модели с базовыми параметрами;
- модели с найденными параметрами

In [366]:
params = {
    'description': 'LGBMClassifier()',
    'model': get_LGBM_pipe().set_params(**{'Model__verbosity': -1}),
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

model_scores['LGBM'] = test_Model(**params)



***LGBMClassifier()***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


In [367]:
params = {
    'description': 'LGBMClassifier(opt)',
    'model': get_LGBM_pipe().set_params(**optuna_model_results['LGBMClassifier'], **{'Model__verbosity': -1}),
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test
}

model_scores['LGBM_opt'] = test_Model(**params)



***LGBMClassifier(opt)***
Step 1: Обучение
Step 2: Предсказание на train
Step 3: Предсказание на test
Step 4: bias_variance_decomp
***Done***


#### Посмотрим на результаты

In [368]:
BaseLine_scores = pd.DataFrame()
for i in model_scores.keys():
    BaseLine_scores = pd.concat([BaseLine_scores, model_scores[i]])
    
BaseLine_scores

Unnamed: 0,train_score,test_score
LogisticRegression(),0.849798,0.836186
LogisticRegression(opt),0.849487,0.835931
RandomForestClassifier(),0.999946,0.811993
RandomForestClassifier(opt),0.958205,0.835014
LGBMClassifier(),0.956737,0.830529
LGBMClassifier(opt),0.884642,0.836615


Сохраним полученные результаты:

In [374]:
BaseLine_scores.to_pickle(f'../results/{folder}/scores.pkl')
pd.to_pickle(optuna_model_results, f'../results/{folder}/optuna_model_results.pkl')