In [50]:
import pandas as pd
import numpy as np
import pandas_profiling 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder ,OneHotEncoder 
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 

import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier,CatBoostRegressor

from sklearn.model_selection import train_test_split

%matplotlib inline

In [6]:
path = 'train_LZdllcl.csv'
path_t = 'test_2umaH9m.csv'

df = pd.read_csv(path)
test = pd.read_csv(path_t)

df.head(2)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0


In [7]:
pd.DataFrame(df.dtypes).T

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,int64,object,object,object,object,object,int64,int64,float64,int64,int64,int64,int64,int64


#### Null values

In [8]:
print(df.isnull().sum()[df.isnull().sum() > 0]/df.shape[0]*100)
print(test.isnull().sum()[test.isnull().sum() > 0]/df.shape[0]*100)

education               4.395344
previous_year_rating    7.524449
dtype: float64
education               1.886586
previous_year_rating    3.306087
dtype: float64


#### Handling Null values

- Dropping 6148 Null value rows

In [9]:
# df.dropna(axis = 0,how = 'any',inplace = True)
# test.dropna(axis = 0,how = 'any',inplace = True)

# df.fillna(method = 'bfill',inplace = True)
# df.fillna(method = 'ffill',inplace = True)
# test.fillna(method = 'bfill',inplace = True)
# test.fillna(method = 'ffill',inplace = True)

In [10]:
df.shape

(54808, 14)

In [11]:
df._get_numeric_data().head(2)

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,1,35,5.0,8,1,0,49,0
1,65141,1,30,5.0,4,0,0,60,0


### SUMMARY

#### Number of observations 	54808
#### Missing cells 	6533 (0.9%)

#### Columns 
> Numeric 	6

> Categorical 	5

> Boolean 	3


> #### Nominal - To be fixed! 
  - Department : 9 variables : 2% - 30%
  - Education : 4 values : [70%,27%,1.5%]
  - Gender : Map binary : unbalanced || 1:2
  - is_promoted : Map binary
  - KPIs_met_>80% : Binary || 1:2
  - no_of_ratings : 10 values || might have to bin
  - recruitment_channel : 3 values 
  - region : 34 
  
  
* region - 34 (ordinal)
* recruitment_channel - 3 (dummy) ::: Done
* previous_year_rating - 5 ordinal( categories)
* no_of_trainings -  10 ordinal (4 major ratings)
* length_of_service - 35 ordinal 
* KPIs_met_>80% - binary
* is_promoted - binary 
* gender - binary
* education - 3 nominal (unbalanced) ::: Done
* df_index - 48660 (high cardinality)
* department - 9 categorical
* awards_won? - binary (unnbalanced)
* age - ordinal (41)
  

#### Handling recruitment channel

In [12]:
df = pd.get_dummies(data = df,columns= ['recruitment_channel','education'],drop_first= True)
test = pd.get_dummies(data = test,columns= ['recruitment_channel','education'],drop_first= True)

# Handling Features

#### removing spaces from column names

In [13]:
df.columns = df.columns.str.replace(' ','_')

test.columns = test.columns.str.replace(' ','_')

df.columns

Index(['employee_id', 'department', 'region', 'gender', 'no_of_trainings',
       'age', 'previous_year_rating', 'length_of_service', 'KPIs_met_>80%',
       'awards_won?', 'avg_training_score', 'is_promoted',
       'recruitment_channel_referred', 'recruitment_channel_sourcing',
       'education_Below_Secondary', 'education_Master's_&_above'],
      dtype='object')

#### Mapping Gender

In [14]:
df.gender = df.gender.map({'f':1,'m':0})

test.gender = test.gender.map({'f':1,'m':0})

#### Department column fix

In [15]:
df.department.value_counts()

Sales & Marketing    16840
Operations           11348
Technology            7138
Procurement           7138
Analytics             5352
Finance               2536
HR                    2418
Legal                 1039
R&D                    999
Name: department, dtype: int64

In [16]:
df.head(2)

Unnamed: 0,employee_id,department,region,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met_>80%,awards_won?,avg_training_score,is_promoted,recruitment_channel_referred,recruitment_channel_sourcing,education_Below_Secondary,education_Master's_&_above
0,65438,Sales & Marketing,region_7,1,1,35,5.0,8,1,0,49,0,0,1,0,1
1,65141,Operations,region_22,0,1,30,5.0,4,0,0,60,0,0,0,0,0


In [17]:
le =  LabelEncoder()

df1 = df[['department','region']].apply(le.fit_transform)
df1.head(2)

test1 = test[['department','region']].apply(le.fit_transform)
test1.head(2)

Unnamed: 0,department,region
0,8,18
1,2,28


In [18]:
df2  = df[df.columns.difference(['department','region','employee_id'])]
df2.head(2)

test2  = test[test.columns.difference(['department','region','employee_id'])]
test2.head(2)

Unnamed: 0,KPIs_met_>80%,age,avg_training_score,awards_won?,education_Below_Secondary,education_Master's_&_above,gender,length_of_service,no_of_trainings,previous_year_rating,recruitment_channel_referred,recruitment_channel_sourcing
0,1,24,77,0,0,0,0,1,1,,0,1
1,0,31,51,0,0,0,1,5,1,3.0,0,0


In [19]:
df_f = df1.merge(df2,left_index = True,right_index = True)

test_f = test1.merge(test2,left_index = True,right_index = True)

test_f.head(2)

Unnamed: 0,department,region,KPIs_met_>80%,age,avg_training_score,awards_won?,education_Below_Secondary,education_Master's_&_above,gender,length_of_service,no_of_trainings,previous_year_rating,recruitment_channel_referred,recruitment_channel_sourcing
0,8,18,1,24,77,0,0,0,0,1,1,,0,1
1,2,28,0,31,51,0,0,0,1,5,1,3.0,0,0


In [20]:
X = df_f[df_f.columns.difference(['is_promoted'])]
Y = df_f.is_promoted

# X = X.astype('int')

In [53]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,stratify=Y,random_state=0)

In [21]:
for x in X.columns:
    print(x,':::',X[x].unique(),'\n')

KPIs_met_>80% ::: [1 0] 

age ::: [35 30 34 39 45 31 33 28 32 49 37 38 41 27 29 26 24 57 40 42 23 59 44 50
 56 20 25 47 36 46 60 43 22 54 58 48 53 55 51 52 21] 

avg_training_score ::: [49 60 50 73 85 59 63 83 54 77 80 84 51 46 75 57 70 68 79 44 72 61 48 58
 87 47 52 88 71 65 62 53 78 91 82 69 55 74 86 90 92 67 89 56 76 81 45 64
 39 94 93 66 95 42 96 40 99 43 97 41 98] 

awards_won? ::: [0 1] 

department ::: [7 4 8 0 6 5 1 2 3] 

education_Below_Secondary ::: [0 1] 

education_Master's_&_above ::: [1 0] 

gender ::: [1 0] 

length_of_service ::: [ 8  4  7 10  2  5  6  1  3 16  9 11 26 12 17 14 13 19 15 23 18 20 22 25
 28 24 31 21 29 30 34 27 33 32 37] 

no_of_trainings ::: [ 1  2  3  4  7  5  6  8 10  9] 

previous_year_rating ::: [ 5.  3.  1.  4. nan  2.] 

recruitment_channel_referred ::: [0 1] 

recruitment_channel_sourcing ::: [1 0] 

region ::: [31 14 10 15 18 11 12 27  0 28 21 24  6  5  2 29 20  8  4  7 17  1 19 23
  3 13 32 25 30 26 16 22 33  9] 



#### This is unbalanced class

In [22]:
Y.value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

> # LGBM

In [54]:
def run_lgb(X_train, X_test, y_train, y_test, test_df):
    params = {
        "objective" : "binary",
       "n_estimators":10000,
       "reg_alpha" : 0.1,
       "reg_lambda":0.1,
       "n_jobs":-1,
       "colsample_bytree":.8,
       "min_child_weight":8,
       "subsample":0.8715623,
       "min_data_in_leaf":100,
       "nthread":4,
       "metric" : "f1",
       "num_leaves" : 600,
       "learning_rate" : 0.01,
       "verbosity" : -1,
       "seed": 120,
       "max_bin":60,
       'max_depth':15,
       'min_gain_to_split':.0222415,
       'scale_pos_weight':2
    }
    
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    
    model = lgb.train(params, lgtrain, 10000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=100, 
                      evals_result=evals_result,feval=lgb_f1_score)
    
    pred_test_y = model.predict(test_df, num_iteration= model.best_iteration)
    return pred_test_y, model, evals_result

In [55]:
pred_test, model, evals_result = run_lgb(x_train, x_test, y_train, y_test, test_f)
print("LightGBM Training Completed...")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 100 rounds.


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[100]	training's f1: 0.350118	valid_1's f1: 0.350035
[200]	training's f1: 0.496944	valid_1's f1: 0.486624
[300]	training's f1: 0.551024	valid_1's f1: 0.516867
[400]	training's f1: 0.578643	valid_1's f1: 0.524897
[500]	training's f1: 0.593928	valid_1's f1: 0.525589
[600]	training's f1: 0.609624	valid_1's f1: 0.526496
[700]	training's f1: 0.624378	valid_1's f1: 0.528281
[800]	training's f1: 0.638003	valid_1's f1: 0.527027
Early stopping, best iteration is:
[730]	training's f1: 0.630227	valid_1's f1: 0.529213
LightGBM Training Completed...


In [67]:
a = np.where(pred_test >= 0.5 ,1,0)
a

array([0, 0, 0, ..., 0, 0, 0])

In [72]:
final_ = test_f.merge(pd.DataFrame(a,columns = ['is_promoted']),left_index=True,right_index=True)
final_.head(2)

Unnamed: 0,department,region,KPIs_met_>80%,age,avg_training_score,awards_won?,education_Below_Secondary,education_Master's_&_above,gender,length_of_service,no_of_trainings,previous_year_rating,recruitment_channel_referred,recruitment_channel_sourcing,is_promoted
0,8,18,1,24,77,0,0,0,0,1,1,,0,1,0
1,2,28,0,31,51,0,0,0,1,5,1,3.0,0,0,0


##### -------------------------------------------------------------

#### Manual tuning

In [23]:
# params1 = dict(max_depth= 8,learning_rate = .0941, num_leaves= 17, reg_alpha=3.4492 ,reg_lambda= 0.0422,n_estimators= [197,198]) 
# params1

In [45]:
lgbm = lgb.LGBMClassifier(bagging_freq= 3, colsample_bytree= 0.8402369390200294, learning_rate= 0.23435231487968802, max_depth= 4, 
                          min_data_in_leaf= 263, min_sum_hessian_in_leaf= 7, num_leaves= 34, reg_alpha= 0.6663319478582055, 
                          reg_lambda= 0.700910256287464, scale_pos_weight= 2.123471494318405, subsample= 0.8114357714154377, subsample_for_bin= 4)
lgbm.fit(X,Y)

LGBMClassifier(bagging_freq=3, boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.8402369390200294, importance_type='split',
        learning_rate=0.23435231487968802, max_depth=4,
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=263,
        min_split_gain=0.0, min_sum_hessian_in_leaf=7, n_estimators=100,
        n_jobs=-1, num_leaves=34, objective=None, random_state=None,
        reg_alpha=0.6663319478582055, reg_lambda=0.700910256287464,
        scale_pos_weight=2.123471494318405, silent=True,
        subsample=0.8114357714154377, subsample_for_bin=4,
        subsample_freq=0)

- Keeping Macx_dept the uncontrolled

In [25]:
lgbc1 = lgb.LGBMClassifier(max_depth = -1,subsample = 0.9,min_child_weight=0.0008,reg_alpha=2.8,reg_lambda=0.041,
                           colsample_bytree =0.75,min_child_samples = 17,scale_pos_weight = 3,subsample_for_bin=200000,
                           subsample_freq =1 ,num_leaves = 27,learning_rate = 0.07,
                           objective = 'binary',class_weight = None)

lgbm_cv1 = GridSearchCV(lgbc1,param_grid = dict(n_estimators= np.arange(100,110,1)),cv= 5,n_jobs= -1,verbose = True,scoring='f1')

In [26]:
lgbm_cv1.fit(X,Y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.75,
        importance_type='split', learning_rate=0.07, max_depth=-1,
        min_child_samples=17, min_child_weight=0.0008, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=27, objective='binary',
        random_state=None, reg_alpha=2.8, reg_lambda=0.041,
        scale_pos_weight=3, silent=True, subsample=0.9,
        subsample_for_bin=200000, subsample_freq=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=True)

In [27]:
print(lgbm_cv1.best_params_)
print(lgbm_cv1.best_score_)

{'n_estimators': 104}
0.5256040820072326


#### Taking the optimized model and performing CV to check for early stopping 
#### Also validating if `n_iter` from early stopping is matching from gridsearchCV  

In [28]:
lgbm_cv1.best_estimator_.get_params()

dict_ = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.75,
 'importance_type': 'split',
 'learning_rate': 0.07,
 'max_depth': -1,
 'min_child_samples': 17,
 'min_child_weight': 0.0008,
 'min_split_gain': 0.0,
 'n_jobs': -1,
 'num_leaves': 27,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 2.8,
 'reg_lambda': 0.041,
 #'silent': True,
 'subsample': 0.9,
 'subsample_for_bin': 200000,
 'subsample_freq': 1,
 'scale_pos_weight': 3,
 'metric' : 'lgb_f1_score'
        }


In [29]:
?lgb.cv
# cv_results

In [30]:
from sklearn.metrics import f1_score

# this is a UDF being put in f_eval
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

d_train = lgb.Dataset(X, label=Y)
cv_results = lgb.cv(params = dict_,train_set = d_train,num_boost_round = 150,nfold = 5,verbose_eval=10, early_stopping_rounds=30,feval=lgb_f1_score)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

[10]	cv_agg's f1: 0.171154 + 0.0102025
[20]	cv_agg's f1: 0.339194 + 0.0137706
[30]	cv_agg's f1: 0.413441 + 0.0223953
[40]	cv_agg's f1: 0.494107 + 0.0206744
[50]	cv_agg's f1: 0.512751 + 0.0175024
[60]	cv_agg's f1: 0.519095 + 0.0161167
[70]	cv_agg's f1: 0.522491 + 0.0149733
[80]	cv_agg's f1: 0.526031 + 0.0160124
[90]	cv_agg's f1: 0.528588 + 0.0141994
[100]	cv_agg's f1: 0.528632 + 0.013872
[110]	cv_agg's f1: 0.529218 + 0.0143752
[120]	cv_agg's f1: 0.528355 + 0.0146016
[130]	cv_agg's f1: 0.528198 + 0.0138394
[140]	cv_agg's f1: 0.528515 + 0.0138166


In [31]:
print('Current parameters:\n', dict_)
print('\nBest num_boost_round:', len(cv_results['f1-mean']))
print('Best CV score:', cv_results['f1-mean'][-1])

Current parameters:
 {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.75, 'importance_type': 'split', 'learning_rate': 0.07, 'max_depth': -1, 'min_child_samples': 17, 'min_child_weight': 0.0008, 'min_split_gain': 0.0, 'n_jobs': -1, 'num_leaves': 27, 'objective': 'binary', 'random_state': None, 'reg_alpha': 2.8, 'reg_lambda': 0.041, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 1, 'scale_pos_weight': 3, 'metric': 'lgb_f1_score'}

Best num_boost_round: 111
Best CV score: 0.5300998379779701


#### handling missing in test

In [32]:
# test_f.previous_year_rating.fillna(method = 'bfill',inplace = True)

In [44]:
pred = lgbm_cv1.best_estimator_.predict(test_f)  # lgbm_cv1.best_estimator_
pd.Series(pred).value_counts()

0    23120
1      370
dtype: int64

In [73]:
df = pd.DataFrame(a,columns=['is_promoted'],index=test_f.index)
df.head(2),df.shape

(   is_promoted
 0            0
 1            0, (23490, 1))

In [74]:
final = df[['is_promoted']].merge(test[['employee_id']],left_index=True,right_index=True)
final.head(2);final.shape;final.count()

is_promoted    23490
employee_id    23490
dtype: int64

In [78]:
final.to_csv('final.csv')

# ---------------------------------OPTIMIZATION TECH --------------------------------

#### Bayesian optimizer - last line not working

* - https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm

# Method - Hyperopt

#### Mahir

In [40]:
# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold
import hyperopt
import lightgbm as lgb
from hyperopt import STATUS_OK
from hyperopt import hp


param_dict={}

space = {'max_depth': hp.choice('max_depth', np.arange(-1, 15,dtype=int)),
         'min_data_in_leaf': hp.choice('min_data_in_leaf', np.arange(10, 400,dtype=int)), #min_child_weight
         'min_sum_hessian_in_leaf': hp.choice('min_sum_hessian_in_leaf', np.arange(0, 15,dtype=int)),
         'num_leaves': hp.choice('num_leaves', np.arange(2, 50, dtype=int)),
         'bagging_freq': hp.choice('bagging_freq', np.arange(1, 20, dtype=int)),
         'subsample': hp.uniform('subsample', 0, 1),
         'colsample_bytree': hp.uniform('colsample_bytree', 0, 1),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
         

        'reg_lambda': hp.uniform('reg_lambda', 0, 1),
        'reg_alpha': hp.uniform('reg_alpha', 1e-9, 5.0),
        'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
        'subsample_for_bin':  hp.choice('subsample_for_bin', np.arange(200000, 500000,50000,dtype=int))
        }


def objective(space):
    
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
        return 'f1', f1_score(y_true, y_hat), True

    params = {
        'min_sum_hessian_in_leaf': space['min_sum_hessian_in_leaf'],
        'min_data_in_leaf':space['min_data_in_leaf'],
            'num_leaves':space['num_leaves'],
             'subsample': space['subsample'],
            'colsample_bytree': space['colsample_bytree'],
            'learning_rate':space['learning_rate'],
            #'silent': 1,
            'verbose_eval': True,
            "objective":"binary",
        'device':'cpu',
        "boosting":"gbdt",
        'max_depth':space['max_depth'],
        'bagging_freq':space['bagging_freq'],
         # 'metric':"auc",
        'boost_from_average':False,
        
        
        'lambda_l2' : space['reg_lambda'],
        'lambda_l1': space['reg_alpha'],
        'scale_pos_weight': space['scale_pos_weight'],
        'bin_construct_sample_cnt': space['subsample_for_bin']
    }
    
    skf = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=42)
    
    lgtrain = lgb.Dataset(X, label=Y)
    
    cv = lgb.cv(params,
                lgtrain,
                nfold=5,
                #metrics='auc',
                num_boost_round=300,
                feval = lgb_f1_score,
                early_stopping_rounds=60,stratified=True,verbose_eval=50,folds=skf.split(X,Y))
    
    au = (cv['f1-mean'][-1])
    
#     params['n_estimators']=len(cv['f1-mean'])
#     param_dict[au]=params
    
#     pickle.dump(param_dict,open('params','wb'))
#     print(params, file=open("output_lgb.txt", "a"))
#     print(params)
#     print('max='+str(max(param_dict.keys())))
#     print ('auc = ', au)
#     print(au, file=open("output_lgb.txt", "a"))
    return{'loss': -au, 'status': STATUS_OK}

from hyperopt import Trials
from hyperopt import tpe
trials = Trials()

best = hyperopt.fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials
            )

print('\n\n',best)

# fName = open('trials.pkl', 'w')
# pickle.dump(trials, fName)
# fName.close()

  0%|                                                                             | 0/10 [00:00<?, ?it/s, best loss: ?]

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)



[50]	cv_agg's binary_logloss: 0.394891 + 0.010226	cv_agg's f1: 0.347639 + 0.00711476                                   
[100]	cv_agg's binary_logloss: 0.363241 + 0.00490647	cv_agg's f1: 0.372621 + 0.00751347                                
[150]	cv_agg's binary_logloss: 0.347845 + 0.00468424	cv_agg's f1: 0.383484 + 0.0107467                                 
[200]	cv_agg's binary_logloss: 0.337426 + 0.00693028	cv_agg's f1: 0.397212 + 0.0119205                                 
[250]	cv_agg's binary_logloss: 0.334392 + 0.00597797	cv_agg's f1: 0.395266 + 0.00891619                                
[300]	cv_agg's binary_logloss: 0.327789 + 0.00412723	cv_agg's f1: 0.405952 + 0.0107151                                 
[50]	cv_agg's binary_logloss: 0.372755 + 0.00359656	cv_agg's f1: 0.388369 + 0.0086196                                  
[50]	cv_agg's binary_logloss: 0.388489 + 0.00385152	cv_agg's f1: 0.315029 + 0.00726087                                 
[100]	cv_agg's binary_logloss: 0.378946 

  'precision', 'predicted', average, warn_for)



[50]	cv_agg's binary_logloss: 0.290891 + 0.00464723	cv_agg's f1: 0.3758 + 0.012028                                     
[100]	cv_agg's binary_logloss: 0.260648 + 0.00309569	cv_agg's f1: 0.424827 + 0.0134322                                 
[150]	cv_agg's binary_logloss: 0.244389 + 0.00335545	cv_agg's f1: 0.474534 + 0.0177908                                 
[200]	cv_agg's binary_logloss: 0.236454 + 0.00215288	cv_agg's f1: 0.48819 + 0.00796182                                 
[250]	cv_agg's binary_logloss: 0.233922 + 0.00347921	cv_agg's f1: 0.495855 + 0.00656412                                
[300]	cv_agg's binary_logloss: 0.228461 + 0.0053838	cv_agg's f1: 0.502126 + 0.00960733                                 
[50]	cv_agg's binary_logloss: 0.248226 + 0.00437311	cv_agg's f1: 0.433881 + 0.0210249                                  
[100]	cv_agg's binary_logloss: 0.230015 + 0.00401281	cv_agg's f1: 0.47213 + 0.0118795                                  
[150]	cv_agg's binary_logloss: 0.225611 

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)



[50]	cv_agg's binary_logloss: 0.197348 + 0.00241361	cv_agg's f1: 0.42949 + 0.0124872                                   
[100]	cv_agg's binary_logloss: 0.190003 + 0.00218463	cv_agg's f1: 0.465994 + 0.00677534                                
[150]	cv_agg's binary_logloss: 0.186536 + 0.00313666	cv_agg's f1: 0.480749 + 0.00748171                                
[200]	cv_agg's binary_logloss: 0.185346 + 0.00285465	cv_agg's f1: 0.48755 + 0.00521149                                 
[250]	cv_agg's binary_logloss: 0.186076 + 0.00325634	cv_agg's f1: 0.491151 + 0.0098517                                 
[50]	cv_agg's binary_logloss: 0.23337 + 0.00279859	cv_agg's f1: 0.477728 + 0.00836769                                  
[100]	cv_agg's binary_logloss: 0.226965 + 0.00305175	cv_agg's f1: 0.473933 + 0.00822294                                
[50]	cv_agg's binary_logloss: 0.400028 + 0.00153996	cv_agg's f1: 0.246291 + 0.0104913                                  
[50]	cv_agg's binary_logloss: 0.241486 +

In [None]:
trials.trials

In [None]:
trials.results

In [None]:
trials.vals

In [None]:
break1