In [1]:
import pandas as pd
import numpy as np
from functools import partial
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

import mlflow
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score 

import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42

In [39]:
df_final = pd.read_csv('../data/df_final_sc.csv')
df_final.head()

Unnamed: 0,Attrition,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes,Age_category_Middle,Age_category_Old,Age_category_Young,DailyRate_category_High,DailyRate_category_Middle,DailyRate_category_Small,DistanceFromHome_category_Close,DistanceFromHome_category_Far,DistanceFromHome_category_Middle,Education_category_High,Education_category_Middle,Education_category_Small,EnvironmentSatisfaction_category_Doesnot_Satisfied,EnvironmentSatisfaction_category_Middle,HourlyRate_category_High,HourlyRate_category_Middle,HourlyRate_category_Small,JobInvolvement_category_High,JobInvolvement_category_Middle,JobInvolvement_category_Small,JobLevel_category_High,JobLevel_category_Middle,JobLevel_category_Small,JobSatisfaction_category_Doesnot_Satisfied,JobSatisfaction_category_Middle,MonthlyIncome_category_High,MonthlyIncome_category_Middle,MonthlyIncome_category_Small,MonthlyRate_category_High,MonthlyRate_category_Middle,MonthlyRate_category_Small,NumCompaniesWorked_category_Few,NumCompaniesWorked_category_Many,NumCompaniesWorked_category_Middle,PercentSalaryHike_category_High,PercentSalaryHike_category_Middle,PercentSalaryHike_category_Small,PerformanceRating_category_High,PerformanceRating_category_Small,RelationshipSatisfaction_category_Doesnot_Satisfied,RelationshipSatisfaction_category_Middle,StockOptionLevel_category_High,StockOptionLevel_category_Middle,StockOptionLevel_category_Small,TotalWorkingYears_category_Few,TotalWorkingYears_category_Many,TotalWorkingYears_category_Middle,TrainingTimesLastYear_category_Few,TrainingTimesLastYear_category_Many,TrainingTimesLastYear_category_Middle,WorkLifeBalance_category_High,WorkLifeBalance_category_Middle,WorkLifeBalance_category_Small,YearsAtCompany_category_Few,YearsAtCompany_category_Many,YearsAtCompany_category_Middle,YearsInCurrentRole_category_Few,YearsInCurrentRole_category_Many,YearsInCurrentRole_category_Middle,YearsSinceLastPromotion_category_Few,YearsSinceLastPromotion_category_Many,YearsSinceLastPromotion_category_Middle,YearsWithCurrManager_category_Few,YearsWithCurrManager_category_Many,YearsWithCurrManager_category_Middle,std_Age,std_DailyRate,std_DistanceFromHome,std_Education,std_EnvironmentSatisfaction,std_HourlyRate,std_JobInvolvement,std_JobLevel,std_JobSatisfaction,std_MonthlyIncome,std_MonthlyRate,std_NumCompaniesWorked,std_PercentSalaryHike,std_PerformanceRating,std_RelationshipSatisfaction,std_StockOptionLevel,std_TotalWorkingYears,std_TrainingTimesLastYear,std_WorkLifeBalance,std_YearsAtCompany,std_YearsInCurrentRole,std_YearsSinceLastPromotion,std_YearsWithCurrManager
0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0.44635,0.742527,-1.010909,-0.891688,-0.660531,1.383138,0.379672,-0.057788,1.153254,-0.10835,0.72602,2.125136,-1.150554,-0.42623,-1.584178,-0.932014,-0.421642,-2.171982,-2.49382,-0.164613,-0.063296,-0.679146,0.245834
1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,1.322365,-1.297775,-0.14715,-1.868426,0.254625,-0.240677,-1.026167,-0.057788,-0.660853,-0.291719,1.488876,-0.678049,2.129306,2.346151,1.191438,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0.008343,1.414363,-0.887515,-0.891688,1.169781,1.284725,-1.026167,-0.961486,0.2462,-0.937654,-1.674841,1.324226,-0.057267,-0.42623,-0.658973,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,-0.429664,1.461466,-0.764121,1.061787,1.169781,-0.486709,0.379672,-0.961486,0.2462,-0.763634,1.243211,-0.678049,-1.150554,-0.42623,0.266233,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,-1.086676,-0.524295,-0.887515,-1.868426,-1.575686,-1.274014,0.379672,-0.961486,-0.660853,-0.644858,0.3259,2.525591,-0.877232,-0.42623,1.191438,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227


In [40]:
df_final.shape

(1470, 117)

In [41]:
df_train, df_test = train_test_split(df_final, stratify=df_final['Attrition'], test_size=0.2, random_state=RANDOM_SEED)

X_train = df_train.drop('Attrition', axis = 1)
y_train = df_train['Attrition']

X_test = df_test.drop('Attrition', axis = 1)
y_test = df_test['Attrition']

In [5]:
def objective(params):
    classifier_type = params['type']
    del params['type']
    if classifier_type == 'logreg':
        clf = LogisticRegression(**params, n_jobs=-1, random_state=RANDOM_SEED)
    else:
        return 0
    accuracy = cross_val_score(clf, X_train, y_train).mean()
    
    # Because fmin() tries to minimize the objective, this function must return the negative accuracy. 
    return {'loss': -accuracy, 'status': STATUS_OK}

In [17]:
search_space = {
        'C': hp.quniform('LR_C', 0.0, 1.0, 0.1),
        'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
    }

In [18]:
def objective(space):
    params = {
        'C': float(space['C']),
        'solver': space['solver']
        }
    
    clf = LogisticRegression(**params, n_jobs=-1, random_state=RANDOM_SEED)
   
    accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', n_jobs= -1).mean()
    print('SCORE:', accuracy)
    # Because fmin() tries to minimize the objective, this function must return the negative accuracy. 
    return {'loss': -accuracy, 'status': STATUS_OK}

In [19]:
trials = Trials()

In [20]:
with mlflow.start_run():
    best_hyperparams = fmin(fn = objective,
                       space=search_space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials,
                        show_progressbar=True
                       )

SCORE:                                                
0.8903281644428416                                    
SCORE:                                                
0.8903281644428416                                                               
SCORE:                                                                           
0.8852253876667868                                                               
SCORE:                                                                           
0.8886260367832671                                                               
SCORE:                                                                           
0.8877749729534798                                                               
SCORE:                                                                           
0.8911792282726289                                                               
SCORE:                                                                           
0.8852253876667

In [23]:
import hyperopt

In [24]:
print(hyperopt.space_eval(search_space, best_hyperparams))

{'C': 0.30000000000000004, 'solver': 'lbfgs'}


In [57]:
search_space = {
        'alpha': hp.quniform('alpha', 0.0, 100.0, 0.1),
        'batch_size': hp.quniform('batch_size', 10, 1000, 1),
        'activation': hp.choice('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
        'learning_rate': hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive']),
    }

In [58]:
def objective(space):
    params = {
        'alpha': int(space['alpha']),
        'batch_size' : int(space['batch_size']),
        'activation': space['activation'],
        'solver': space['solver'],
        'learning_rate': space['learning_rate'],
        }
    
    clf = MLPClassifier(**params, shuffle = True, early_stopping = True, random_state = RANDOM_SEED)
   
    accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', n_jobs= -1).mean()
    #print('SCORE:', accuracy)
    # Because fmin() tries to minimize the objective, this function must return the negative accuracy. 
    return {'loss': -accuracy, 'status': STATUS_OK}

In [59]:
trials = Trials()

In [60]:
with mlflow.start_run():
    best_hyperparams = fmin(fn = objective,
                       space=search_space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials,
                        show_progressbar=True
                       )

100%|██████████| 100/100 [00:25<00:00,  3.96trial/s, best loss: -0.8911792282726289]


In [61]:
print(hyperopt.space_eval(search_space, best_hyperparams))

{'activation': 'identity', 'alpha': 8.3, 'batch_size': 448.0, 'learning_rate': 'invscaling', 'solver': 'lbfgs'}


In [62]:
mlp_model = MLPClassifier(activation = 'identity', alpha = 8.3, batch_size = 448, learning_rate = 'invscaling', solver = 'lbfgs', shuffle = True, early_stopping = True, random_state = RANDOM_SEED)

In [63]:
mlp_model.fit(X_train, y_train)
y_pred_train = mlp_model.predict(X_train)
y_pred_test =mlp_model.predict(X_test)
y_pred_prob_test = mlp_model.predict_proba(X_test) 
accuracy_score_test = accuracy_score(y_test, y_pred_test)
accuracy_score_train = accuracy_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
roc_auc_test = roc_auc_score(y_test, y_pred_prob_test[:, 1])
    
print(f'Модель: {type(mlp_model).__name__}\nAccuracy_train: {accuracy_score_train}\nAccuracy_test: {accuracy_score_test}\nf1 score: {f1_test}\nPrecision: {precision_test}\nRecall: {recall_test}\nROC_AUC: {roc_auc_test}\n')

Модель: MLPClassifier
Accuracy_train: 0.9158163265306123
Accuracy_test: 0.8843537414965986
f1 score: 0.5142857142857143
Precision: 0.782608695652174
Recall: 0.3829787234042553
ROC_AUC: 0.8521836506159014



In [None]:
space=search_space,


In [None]:
with mlflow.start_run():
  best_result = fmin(
    fn=objective, 
    space=search_space,
    algo=algo,
    max_evals=32,
    trials=spark_trials)

In [None]:
print(hyperopt.space_eval(search_space, best_result))

In [None]:
def objective(params):
    classifier_type = params['type']
    del params['type']
    if classifier_type == 'mlp':
        clf = MLPClassifier(**params, shuffle = True, early_stopping = True, random_state = RANDOM_SEED)
    elif classifier_type == 'rf':
        clf = RandomForestClassifier(**params, class_weight='balanced', n_jobs=-1, random_state = RANDOM_SEED)
    elif classifier_type == 'logreg':
        clf = LogisticRegression(**params, n_jobs=-1, random_state=RANDOM_SEED)
    elif classifier_type == 'hgbc':
        clf = HistGradientBoostingClassifier(**params, class_weight = 'balanced',  scoring = 'accuracy', random_state=RANDOM_SEED)
    else:
        return 0
    accuracy = cross_val_score(clf, X_train, y_train).mean()
    
    # Because fmin() tries to minimize the objective, this function must return the negative accuracy. 
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
search_space = hp.choice('classifier_type', [
    {
        'type': 'mlp',
        'alpha': hp.quniform('alpha', 0, 100, 1),
        'batch_size': hp.quniform('batch_size', 10, 100, 1),
        'activation': hp.choice('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
        'learning_rate': hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive']),
    },
    {
        'type': 'rf',
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 100),
        'max_depth': hp.quniform('max_depth', 2, 5, 1),
        'ccp_alpha': hp.quniform('ccp_alpha', 0.0, 100.0, 1.0),
        'criterion': hp.choice('criterion', ['gini', 'entropy']),
        'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
    },
    {
        'type': 'logreg',
        'C': hp.lognormal('LR_C', 0, 1.0),
        'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
    },

    {
        'type': 'hgbc',
        'max_depth': hp.quniform('max_depth', 2, 5, 1),
        'learning_rate': hp.quniform('learning_rate', 0, 1, 0.1),
       'l2_regularization': hp.quniform('l2_regularization', 0, 1, 0.1),
    },
])

In [None]:
algo=tpe.suggest

SparkTrials takes 2 optional arguments:

parallelism: Number of models to fit and evaluate concurrently. The default is the number of available Spark task slots.
timeout: Maximum time (in seconds) that fmin() can run. The default is no maximum time limit.

In [None]:
spark_trials = SparkTrials(parallelism = 10)

In [None]:
with mlflow.start_run():
  best_result = fmin(
    fn=objective, 
    space=search_space,
    algo=algo,
    max_evals=32,
    trials=spark_trials)

In [None]:
print(hyperopt.space_eval(search_space, best_result))