In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

## Download the data

In [1]:
!kaggle competitions download -c playground-series-s3e18

Downloading playground-series-s3e18.zip to f:\kaggle_competitions\Multi-label classification of enzyme




  0%|          | 0.00/2.41M [00:00<?, ?B/s]
 41%|████▏     | 1.00M/2.41M [00:03<00:05, 267kB/s]
 83%|████████▎ | 2.00M/2.41M [00:06<00:01, 361kB/s]
100%|██████████| 2.41M/2.41M [00:07<00:00, 375kB/s]
100%|██████████| 2.41M/2.41M [00:07<00:00, 354kB/s]


## Loading the data

In [2]:
train=pd.read_csv('train.csv')

In [3]:
original=pd.read_csv('mixed_desc.csv')

In [4]:
original['EC1_EC2_EC3_EC4_EC5_EC6']

0       1_1_1_1_0_1
1       1_1_1_1_0_1
2       1_1_1_1_0_1
3       0_1_1_0_0_0
4       1_1_1_1_0_1
           ...     
1034    0_1_0_0_0_1
1035    1_1_0_0_0_0
1036    0_1_1_0_0_0
1037    0_1_1_0_0_0
1038    0_1_0_1_0_0
Name: EC1_EC2_EC3_EC4_EC5_EC6, Length: 1039, dtype: object

In [5]:
col='EC1_EC2_EC3_EC4_EC5_EC6'

for x in col.split('_'):
    original[x]=original[col].apply(lambda x: int(x.split('_')[0]))

In [6]:
train.columns

Index(['id', 'BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
       'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6'],
      dtype='object')

In [7]:
len(train.columns)

38

In [8]:
train.drop(['id'],axis=1,inplace=True)

In [9]:
original=original[train.columns.tolist()]

In [10]:
len(original),len(train)

(1039, 14838)

In [11]:
train_merged=pd.concat([train,original],axis=0)

In [12]:
len(train)

14838

In [13]:
# # look at it later some of the columns are highly skewed
# from ydata_profiling import ProfileReport

# profile = ProfileReport(train, title="Pandas Profiling Report")

# profile.to_notebook_iframe()

In [14]:
train.isnull().sum().sum()

0

In [15]:
train.duplicated().sum()

0

In [16]:
train_merged.duplicated().sum()

80

## Splitting labels and features

In [17]:
train_merged.drop_duplicates(inplace=True)

In [18]:
train.drop(['EC3','EC4','EC5','EC6'],axis=1,inplace=True)

In [19]:
train_merged.drop(['EC3','EC4','EC5','EC6'],axis=1,inplace=True)

In [20]:
# train.drop(['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n'],axis=1,inplace=True)

In [21]:
X2=train.drop(['EC1','EC2'],axis=1)
X=train_merged.drop(['EC1','EC2'],axis=1)
y1=train_merged[['EC1']]
y2=train[['EC2']]


## Preprocessing

In [135]:
from scipy.stats import skew,yeojohnson
from scipy import stats
def preprocess(data,labels):
    ## remove the outliers with IQR and remove the labels of the outliers
    # Q1 = data.quantile(0.10)
    # Q3 = data.quantile(0.90)
    # IQR = Q3 - Q1
    # condition=~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)
    # processed_data=data[condition].reset_index(drop=True)
    # processed_labels=labels[condition].reset_index(drop=True)
    ## remove the skewness of the data
    # skewness = data.apply(lambda x: skew(x))
    # skewness = skewness[abs(skewness) > 0.5]
    # skewed_features = skewness.index
    # # yeo 
    # for feat in skewed_features:
    #     data[feat]=yeojohnson(data[feat])[0]
    
    return data,labels

In [136]:
len(X.columns)

31

In [137]:
len(X),len(y1),len(y2)

(15797, 15797, 14838)

In [138]:
X.columns

Index(['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
       'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2'],
      dtype='object')

In [139]:
X[['ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6']].head(5)

Unnamed: 0,ExactMolWt,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,HallKierAlpha,HeavyAtomMolWt,Kappa3,MaxAbsEStateIndex,MinEStateIndex,NumHeteroatoms,PEOE_VSA10,PEOE_VSA14,PEOE_VSA6
0,222.06808,1.181818,1.727273,2.363636,-0.24,212.163,8.17,11.922504,0.171585,4,0.0,91.536492,0.0
1,260.029719,1.346154,2.076923,2.769231,-0.09,247.031,3.201491,10.932338,-4.83045,10,24.415866,7.822697,0.0
2,382.131027,1.085714,1.742857,2.4,-0.78,354.106,15.03389,11.238048,-5.066255,9,0.0,15.645394,0.0
3,530.070277,1.162791,1.57377,2.27027,-1.3,506.124,6.724301,11.17117,-5.276575,19,42.727765,21.335138,0.0
4,118.062994,1.444444,2.111111,2.555556,-1.1,108.056,3.931272,9.855741,-1.676296,4,6.041841,11.938611,6.923737


In [140]:
X.head(5)

Unnamed: 0,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,EState_VSA2,...,PEOE_VSA14,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2
0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.0,11.938294,...,91.536492,0.0,0.0,0.0,17.744066,0.0,4.794537,35.527357,0,0
1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,0.0,...,7.822697,0.0,0.0,0.0,7.822697,30.705892,13.825658,44.70731,0,0
2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,6.606882,...,15.645394,0.0,53.378235,0.0,15.645394,73.143616,17.964475,45.66012,0,0
3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,0.0,...,21.335138,0.0,0.0,6.420822,15.645394,62.107304,31.961948,87.509997,0,0
4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.03645,0.727664,17.980451,12.841643,...,11.938611,6.923737,19.3864,0.0,11.938611,18.883484,9.589074,33.333333,2,2


In [141]:
features=X.columns

In [142]:
# categorical columns 

cat_cols=['NumHeteroatoms', 'fr_COO', 'fr_COO2']

In [143]:
numerical_cols_X=X.select_dtypes(include=['int64','float64']).columns.tolist()
numerical_cols_X2=X2.select_dtypes(include=['int64','float64']).columns.tolist()

In [144]:
numerical_cols_X

['BertzCT',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3v',
 'Chi4n',
 'EState_VSA1',
 'EState_VSA2',
 'ExactMolWt',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'HallKierAlpha',
 'HeavyAtomMolWt',
 'Kappa3',
 'MaxAbsEStateIndex',
 'MinEStateIndex',
 'NumHeteroatoms',
 'PEOE_VSA10',
 'PEOE_VSA14',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'SMR_VSA10',
 'SMR_VSA5',
 'SlogP_VSA3',
 'VSA_EState9',
 'fr_COO',
 'fr_COO2']

In [145]:
len(numerical_cols_X),len(numerical_cols_X2)

(31, 31)

In [146]:
for col in cat_cols:
    numerical_cols_X.remove(col)
    numerical_cols_X2.remove(col)

In [147]:
len(numerical_cols_X),len(cat_cols),len(numerical_cols_X2)

(28, 3, 28)

## Feature Importance

In [148]:
## Feature Importances
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.ensemble import RandomForestClassifier
def feature_importances(Xdata,ydata):
    mi=mutual_info_classif(Xdata,ydata)
    rfc=RandomForestClassifier()
    rfc.fit(Xdata,ydata)
    return pd.DataFrame({'features':Xdata.columns,'mi':mi,'rfc':rfc.feature_importances_}).sort_values(by=['mi','rfc'],ascending=False)


In [149]:
feature_importances(X,y1.values.ravel())

Unnamed: 0,features,mi,rfc
19,NumHeteroatoms,0.05828,0.039064
18,MinEStateIndex,0.056605,0.052705
8,EState_VSA1,0.052662,0.0418
27,SlogP_VSA3,0.047201,0.031848
21,PEOE_VSA14,0.046395,0.023424
15,HeavyAtomMolWt,0.044479,0.036525
25,SMR_VSA10,0.04401,0.026908
28,VSA_EState9,0.043804,0.041079
10,ExactMolWt,0.043124,0.03629
26,SMR_VSA5,0.038005,0.029832


## Creating a pipeline

In [150]:
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler,MaxAbsScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
def get_column_transformer(num_cols):
    return ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols),
        ],remainder='passthrough')

In [151]:
def get_pipeline(model,num_cols):
    return Pipeline(steps=[('preprocessor', get_column_transformer(num_cols)),
                      ('classifier', model)])

# Model Building

In [152]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold

def skf_model(model_obj,X,y,num_cols):
    skf = KFold(n_splits=5,shuffle=True,random_state=42)
    scores=[]
    i=0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index,:], y.iloc[test_index,:]
        # preprocess
        X_train,y_train=preprocess(X_train,y_train)
        y_train=y_train.values.ravel()
        y_test=y_test.values.ravel()
        # get pipeline
        model=get_pipeline(model_obj,num_cols)
        # fit model
        model.fit(X_train,y_train)
        # predict
        y_pred=model.predict_proba(X_test)
        val_preds = np.array(y_pred)[:,1]
        score=roc_auc_score(y_test,val_preds)
        scores.append(score)
        print(f'Fold {i+1} : {score}')
        i+=1
    return np.mean(scores)

## Naive Bayes

In [153]:
from sklearn.naive_bayes import GaussianNB

skf_model(GaussianNB(),X2,y2,numerical_cols_X2)

Fold 1 : 0.5698826291079813
Fold 2 : 0.5466386554621849
Fold 3 : 0.5626685309098172
Fold 4 : 0.5583614516802793
Fold 5 : 0.5609541357557266


0.5597010805831978

## Random Forest

In [174]:
rf_params_l1={'n_estimators': 1173,
              'max_depth': 91,
              'criterion': 'entropy',
              'max_features': 'sqrt',
              'min_samples_split': 10,
              'min_samples_leaf': 18}
rf_params_l2={'n_estimators': 1371,
              'max_depth': 7,
              'criterion': 'gini',
              'max_features': 'log2',
              'min_samples_split': 2,
              'min_samples_leaf': 20}

In [175]:
from sklearn.ensemble import RandomForestClassifier

scores1=skf_model(RandomForestClassifier(random_state=42,**rf_params_l1,n_jobs=-1),X,y1,numerical_cols_X)

Fold 1 : 0.7204854141387502
Fold 2 : 0.7164103026065116
Fold 3 : 0.709592223889241
Fold 4 : 0.7093813717003118
Fold 5 : 0.7198656551288131


In [176]:
scores2=skf_model(RandomForestClassifier(random_state=42,**rf_params_l2,n_jobs=-1),X2,y2,numerical_cols_X2)

Fold 1 : 0.5955061619718309
Fold 2 : 0.5760847196021265
Fold 3 : 0.5981363116837001
Fold 4 : 0.5887067623276507
Fold 5 : 0.5733632717813976


In [177]:
# 0.6213000594420688 before removing the outliers
scores1,scores2, np.mean([scores1,scores2])

(0.7151469934927255, 0.5863594454733412, 0.6507532194830333)

## Catboost

In [168]:

cat_params_l1 = {
    'iterations': 150,
    'depth': 8,
    'learning_rate': 0.0583328291135858,
    'l2_leaf_reg': 1.718355353927,
    'random_strength': 0.0364936343793558,
    'od_type': "Iter", 
    'od_wait': 38,
    'bootstrap_type': "Bayesian",
    'grow_policy': 'Lossguide',
    'bagging_temperature': 4.81608188901775,
    'eval_metric': 'AUC', # AUC
    'loss_function': 'Logloss',
    #'random_seed': 42,
    #'auto_class_weights': 'Balanced',
    'verbose': False
}
# tuned somewhat
cat_params_l2 = {'iterations': 2052,
                 'depth': 3,
                 'learning_rate': 0.005860280381371149,
                 'l2_leaf_reg': 3.915641769216947,
                 'random_strength': 4.589849045106614,
                 'od_type': 'IncToDec',
                 'od_wait': 24,
                 'bootstrap_type': 'Bernoulli',
                 'verbose': False, # False
                 'grow_policy': 'Depthwise',
                 'loss_function':'Logloss'}

In [169]:
from catboost import CatBoostClassifier

scores1=skf_model(CatBoostClassifier(**cat_params_l1),X,y1,numerical_cols_X)

Fold 1 : 0.7186869902489976
Fold 2 : 0.7105908265488873
Fold 3 : 0.709094246352953
Fold 4 : 0.7060278569511769
Fold 5 : 0.7165366117997698


In [170]:
from catboost import CatBoostClassifier
scores2=skf_model(CatBoostClassifier(**cat_params_l2,random_state=42),X2,y2,numerical_cols_X2)

Fold 1 : 0.5929738849765258
Fold 2 : 0.5819420625393015
Fold 3 : 0.5969676920003995
Fold 4 : 0.5842326750428632
Fold 5 : 0.577475424677901


In [171]:
scores2

0.5867183478473982

In [173]:
scores1,scores2, np.mean([scores1,scores2])

(0.712187306380357, 0.5867183478473982, 0.6494528271138775)

## Light GBM

In [154]:
lgbm_params_l1 = {'n_estimators': 786,
                'learning_rate': 0.008022202426311022,
                'num_leaves': 212,
                'max_depth': 9,
                'min_child_samples': 105,
                'min_child_weight': 15,
                'subsample': 0.6241910706861232,
                'colsample_bytree': 0.10092252740280591,
                'reg_alpha': 0.0898095398969116,
                'reg_lambda': 0.04729441023604163,#'n_jobs':-1}
}

lgbm_params_l2={'n_estimators': 186,
                'learning_rate': 0.00458267933206545,
                'num_leaves': 231,
                'max_depth': 53, 
                'min_child_samples': 48,
                'min_child_weight': 40,
                'subsample': 0.872467055832323,
                'colsample_bytree': 0.19672292565509458,
                'reg_alpha': 0.013519705605919736, 
                'reg_lambda': 0.05393252179886955
                #'n_jobs':-1}
}


# # #new 
#lgbm_params_l1={'n_estimators': 854, 'learning_rate': 0.004534684299532389, 'num_leaves': 237, 'max_depth': 177, 'min_child_samples': 86, 'min_child_weight': 39, 'subsample': 0.6297745985460238, 'colsample_bytree': 0.16624671663999285, 'reg_alpha': 0.4147866848224776, 'reg_lambda': 0.18650235966893608}
#lgbm_params_l2={'n_estimators': 100, 'learning_rate': 0.044606202448458206, 'num_leaves': 41, 'max_depth': 207, 'min_child_samples': 149, 'min_child_weight': 190, 'subsample': 0.5158023863040262, 'colsample_bytree': 0.2641464920307482, 'reg_alpha': 0.12078683707184015, 'reg_lambda': 0.3067449049533006}
    

In [155]:
from lightgbm import LGBMClassifier

scores1=skf_model(LGBMClassifier(n_jobs=-1,random_state=42,**lgbm_params_l1),X,y1,numerical_cols_X)

Fold 1 : 0.7219381522814583
Fold 2 : 0.7151101238681687
Fold 3 : 0.7128226935477252
Fold 4 : 0.711887703838689
Fold 5 : 0.7217735780893675


In [156]:
scores1

0.7167064503250817

In [157]:
scores2=skf_model(LGBMClassifier(n_jobs=-1,random_state=42,**lgbm_params_l2),X2,y2,numerical_cols_X2)

Fold 1 : 0.5982174295774648
Fold 2 : 0.5793478105527925
Fold 3 : 0.5931400290733158
Fold 4 : 0.5858142881053414
Fold 5 : 0.5705330086617542


In [159]:
# 0.6310383278268565 before removing the outliers
scores1,scores2, np.mean([scores1,scores2])

(0.7167064503250817, 0.5854105131941337, 0.6510584817596077)

In [109]:
# creating model

lgbm_l1=LGBMClassifier(random_state=42,**lgbm_params_l1)

# preprocess
x_train,y_train=preprocess(X,y1)
y_train=y_train.values.ravel()
# get pipeline
lgbm_l1=get_pipeline(lgbm_l1,numerical_cols_X)
# fit model
lgbm_l1.fit(x_train,y_train)

In [110]:
# creating model

lgbm_l2=LGBMClassifier(random_state=42,**lgbm_params_l2)
# preprocess
x_train,y_train=preprocess(X2,y2)
y_train=y_train.values.ravel()
# get pipeline
lgbm_l2=get_pipeline(lgbm_l2,numerical_cols_X2)
# fit model
lgbm_l2.fit(x_train,y_train)

In [111]:
len(X2.columns),len(X.columns)

(31, 31)

## XGBoost

In [160]:
xgb_params_l1 = {'n_estimators': 988,
                 'learning_rate': 0.010098413045397634,
                 'lambda': 0.002140241524002415,
                 'alpha': 0.001208993581304336,
                 'subsample': 0.769433783780473,
                 'colsample_bytree': 0.14579274709142026,
                 'max_depth': 85,
                 'min_child_weight': 46,
                 'eta': 0.0010503096218264492,
                 'gamma': 0.13763171873234623, 
                 'scale_pos_weight': 71,
                 'grow_policy': 'lossguide',
                 #'n_jobs':-1}
}

#old
# xgb_params_l2={'n_estimators': 625,
#                'learning_rate': 0.003762652655389914,
#                'lambda': 0.016968894970899047,
#                'alpha': 0.0016184051167205714,
#                'subsample': 0.6500284901379405,
#                'colsample_bytree': 0.23032639126444657,
#                'max_depth': 45,
#                'min_child_weight': 97,
#                'eta': 0.03498419914709866,
#                'gamma': 0.8231225832500532,
#                'scale_pos_weight': 42,
#                'grow_policy': 'lossguide',
#                #'n_jobs':-1}
# }
xgb_params_l2={'n_estimators': 240,
               'learning_rate': 0.006496592390654572,
               'lambda': 0.017511028649561642,
               'alpha': 0.058015038023273506,
               'subsample': 0.6034996424008736,
               'colsample_bytree': 0.2742590004249605,
               'max_depth': 25,
               'min_child_weight': 118,
               'eta': 0.028692162557239075,
               'gamma': 0.009652756679188952,
               'scale_pos_weight': 44,
               'grow_policy': 'depthwise'}

In [161]:
from xgboost import XGBClassifier

scores1=skf_model(XGBClassifier(n_jobs=-1,**xgb_params_l1),X,y1,numerical_cols_X)

Fold 1 : 0.7225604317843028
Fold 2 : 0.7137142880077632
Fold 3 : 0.7142172130586857
Fold 4 : 0.7122115544920515
Fold 5 : 0.724352773826458


In [162]:
from xgboost import XGBClassifier
scores2=skf_model(XGBClassifier(n_jobs=-1,random_state=42,**xgb_params_l2),X2,y2,numerical_cols_X2)

Fold 1 : 0.5939711707746479
Fold 2 : 0.5764730892356943
Fold 3 : 0.5944515185813998
Fold 4 : 0.5889486809107953
Fold 5 : 0.5706794015817358


In [163]:
# 0.6089709505596688 before removing the outliers
scores1,scores2, np.mean([scores1,scores2])

(0.7174112522338523, 0.5849047722168546, 0.6511580122253534)

In [212]:
# creating model

xgb_l1=XGBClassifier(**xgb_params_l1)

# preprocess
x_train,y_train=preprocess(X,y1)
y_train=y_train.values.ravel()
# get pipeline
xgb_l1=get_pipeline(xgb_l1,numerical_cols_X)
# fit model
xgb_l1.fit(x_train,y_train)

In [215]:
# creating model

xgb_l2=XGBClassifier(random_state=42,**xgb_params_l2)

# preprocess
x_train,y_train=preprocess(X2,y2)
y_train=y_train.values.ravel()
# get pipeline
xgb_l2=get_pipeline(xgb_l2,numerical_cols_X2)
# fit model
xgb_l2.fit(x_train,y_train)

## Hist Gradient Boosting Classifier

In [178]:
hist_params_l1 = {'learning_rate': 0.1755720838510706,
                  'max_iter': 2461,
                  'max_depth': 189,
                  'max_bins': 175,
                  'l2_regularization': 0.7371155470044533,
                  'min_samples_leaf': 19,
                  'interaction_cst': 'pairwise',
                  'max_leaf_nodes': 7,
                  'scoring': 'roc_auc'}
# Tuned
hist_params_l2={'learning_rate': 0.0672877951220294,
                'max_iter': 2354,
                'max_depth': 28,
                'max_bins': 14,
                'l2_regularization': 0.28953194462072923,
                'min_samples_leaf': 48,
                'interaction_cst': 'no_interactions',
                'max_leaf_nodes': 3}


# old

# hist_params_l2={'learning_rate': 0.13574592393722923,
#                 'max_iter': 2939,
#                 'max_depth': 23,
#                 'max_bins': 27,
#                 'l2_regularization': 0.21218320471327906,
#                 'min_samples_leaf': 63,
#                 'interaction_cst': 'no_interactions',
#                 'max_leaf_nodes': 4,
#                 'scoring': 'loss'}

In [179]:
from sklearn.ensemble import HistGradientBoostingClassifier

scores1=skf_model(HistGradientBoostingClassifier(random_state=42,**hist_params_l1),X,y1,numerical_cols_X)

Fold 1 : 0.7115009715779029
Fold 2 : 0.7121458679690901
Fold 3 : 0.698497402946823
Fold 4 : 0.7022903128312614
Fold 5 : 0.7138391022601549


In [180]:
from sklearn.ensemble import HistGradientBoostingClassifier
scores2=skf_model(HistGradientBoostingClassifier(random_state=42,**hist_params_l2),X2,y2,numerical_cols_X2)

Fold 1 : 0.5900421801643192
Fold 2 : 0.574705596524324
Fold 3 : 0.5869782477224053
Fold 4 : 0.583563179429044
Fold 5 : 0.5688427414721797


In [182]:
# 0.6344600057018648 before removing the outliers
scores1,scores2, np.mean([scores1,scores2])

(0.7076547315170465, 0.5808263890624544, 0.6442405602897505)

In [109]:
# creating model

hist_l2=HistGradientBoostingClassifier(random_state=42,**hist_params_l2)

# preprocess
x_train,y_train=preprocess(X2,y2)
y_train=y_train.values.ravel()
# get pipeline
hist_l2=get_pipeline(hist_l2,numerical_cols_X2)
# fit model
hist_l2.fit(x_train,y_train)

## Gradient Boosting Classifier

In [84]:
gb_params_l1 = {'learning_rate': 0.003755536444677597,
                'n_estimators': 1650,
                'max_depth': 72,
                'min_samples_split': 74,
                'min_samples_leaf': 78,
                'max_features': 'log2',
                'subsample': 0.20428215357261675}

gb_params_l2={'learning_rate': 0.001689570321780246,
              'n_estimators': 769,
              'max_depth': 76,
              'min_samples_split': 86,
              'min_samples_leaf': 74,
              'max_features': 'log2',
              'subsample': 0.19812972424101377}

In [58]:
from sklearn.ensemble import GradientBoostingClassifier

scores1=skf_model(GradientBoostingClassifier(random_state=42,**gb_params_l1),X,y1,numerical_cols_X)

Fold 1 : 0.7183115001135727
Fold 2 : 0.7121755239066992
Fold 3 : 0.7077307364321642
Fold 4 : 0.7106154811756515
Fold 5 : 0.7128282417756102


In [60]:
scores2=skf_model(GradientBoostingClassifier(random_state=42,**gb_params_l2),X2,y2,numerical_cols_X2)

Fold 1 : 0.5912052523474178
Fold 2 : 0.5839335734293717
Fold 3 : 0.5904681552870157
Fold 4 : 0.5872214947474131
Fold 5 : 0.5736979336102916


In [61]:
scores1,scores2, np.mean([scores1,scores2])

(0.7123322966807396, 0.585305281884302, 0.6488187892825208)

## Voting Classifier

In [183]:
from sklearn.ensemble import HistGradientBoostingClassifier,GradientBoostingClassifier,RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
estimators_l1=[
            ('rf',RandomForestClassifier(random_state=42,**rf_params_l1)),
            ('lgbm',LGBMClassifier(random_state=42,**lgbm_params_l1)),
            ('xgb',XGBClassifier(**xgb_params_l1))]

estimators_l2=[('rf',RandomForestClassifier(random_state=42,**rf_params_l2)),
            ('cat',CatBoostClassifier(random_state=42,**cat_params_l2)),
            ('lgbm',LGBMClassifier(random_state=42,**lgbm_params_l2)),
            ('xgb',XGBClassifier(random_state=42,**xgb_params_l2))]

In [184]:

#vc_weights_l1={0.113851271566289, 0.47922230232379165, 0.2056789714580716,  0.913688205488072,  0.6028434701660855, 0.93217568007266}
#tuned
vc_weights_l1={
               0.36959886757484034,
               0.573982270131082,
               0.58028298278592476}
# tuned
vc_weights_l2=[ 
                0.14665372361915316,
                0.4606877513912636,
                0.573160391973144, 
                0.8839181476361347]

In [185]:
# import voting classifier
from sklearn.ensemble import VotingClassifier

# create the ensemble model
ensemble = VotingClassifier(estimators_l1, 
                            voting='soft',
                            weights=vc_weights_l1,
                            n_jobs=-1)

scores=skf_model(ensemble,X,y1,numerical_cols_X)

Fold 1 : 0.7225550128045974
Fold 2 : 0.7169289470492118
Fold 3 : 0.7128765043071409
Fold 4 : 0.7126198782339374
Fold 5 : 0.7221271647587437


In [186]:
scores

0.7174215014307264

In [187]:
from sklearn.ensemble import VotingClassifier
# create the ensemble model
ensemble = VotingClassifier(estimators_l2,
                            voting='soft',
                            weights=vc_weights_l2,
                            n_jobs=-1)

scores2=skf_model(ensemble,X2,y2,numerical_cols_X2)

Fold 1 : 0.5965933098591549
Fold 2 : 0.58151832161436
Fold 3 : 0.597112642453699
Fold 4 : 0.5870836574151561
Fold 5 : 0.5758633721453381


In [188]:
scores,scores2, np.mean([scores,scores2])

(0.7174215014307264, 0.5876342606975415, 0.652527881064134)

In [189]:
# creating model

vc_l1=VotingClassifier(estimators_l1, 
                       voting='soft',
                       weights=vc_weights_l1,
                       n_jobs=-1)

# preprocess
x_train,y_train=preprocess(X,y1)
y_train=y_train.values.ravel()
# get pipeline
vc_l1=get_pipeline(vc_l1,numerical_cols_X)
# fit model
vc_l1.fit(x_train,y_train)

In [190]:
# creating model

vc_l2=VotingClassifier(estimators_l2, voting='soft',
                       weights=vc_weights_l2,
                        n_jobs=-1)

# preprocess
x_train,y_train=preprocess(X2,y2)
y_train=y_train.values.ravel()
# get pipeline
vc_l2=get_pipeline(vc_l2,numerical_cols_X2)
# fit model
vc_l2.fit(x_train,y_train)

## Stacking

In [80]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
stacker=StackingClassifier(estimators_l1,final_estimator=LogisticRegression(),n_jobs=-1)

scores=skf_model(stacker,X,y1,numerical_cols_X)

Fold 1 : 0.7199060348919071
Fold 2 : 0.7174912949789154
Fold 3 : 0.7110410101830023
Fold 4 : 0.71150345158377
Fold 5 : 0.7182648061595429


In [79]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
stacker=StackingClassifier(estimators_l2,final_estimator=LogisticRegression(),n_jobs=-1)

scores2=skf_model(stacker,X2,y2,numerical_cols_X2)

Fold 1 : 0.5908898180751173
Fold 2 : 0.5846710112616476
Fold 3 : 0.5932860198408736
Fold 4 : 0.5919803989687518
Fold 5 : 0.575217582101164


In [84]:
scores,scores2,np.mean([scores,scores2])

(0.7156413195594276, 0.5872089660495108, 0.6514251428044692)

## Hyper Parameter Tuning

In [None]:
    # #params for catboost
    # params = {
    #     'loss_function': 'Logloss',
    #     'eval_metric': 'AUC',
    #     'verbose': False,
    #     'random_seed': 42,
    #     'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
    #     'iterations': trial.suggest_int('iterations', 100, 4000),
    #     'depth': trial.suggest_int('depth', 3, 10),
    #     'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    # }

In [48]:
## Hyper parameter tuning using optuna for catboost
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier,GradientBoostingClassifier,RandomForestClassifier,VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from optuna import Trial
def objective(trial: Trial):
    w1=trial.suggest_float("w1",0,1.0)
    w2=trial.suggest_float("w2",0,1.0)
    w3=trial.suggest_float("w3",0,1.0)
    w4=trial.suggest_float("w4",0,1.0)
    w5=trial.suggest_float("w5",0,1.0)
    w6=trial.suggest_float("w6",0,1.0)
    weights=[w1,w2,w3,w4,w5,w6]
    # create the ensemble model
    model_obj = VotingClassifier(estimators_l1, voting='soft',
                       weights=weights,
                        n_jobs=-1)
    skf = KFold(n_splits=5,shuffle=True,random_state=42)
    scores=[]
    i=0
    for train_index, test_index in skf.split(X, y1):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y1.iloc[train_index,:], y1.iloc[test_index,:]
        # preprocess
        X_train,y_train=preprocess(X_train,y_train)
        y_train=y_train.values.ravel()
        y_test=y_test.values.ravel()
        # get pipeline
        model=get_pipeline(model_obj,numerical_cols_X2)
        # fit model
        model.fit(X_train,y_train)
        # predict
        y_pred=model.predict_proba(X_test)
        val_preds = np.array(y_pred)[:,1]
        score=roc_auc_score(y_test,val_preds)
        scores.append(score)
        trial.report(score, i)
        if trial.should_prune():
            raise optuna.TrialPruned()
        i+=1
    return np.mean(scores)


In [49]:
sampler = optuna.samplers.TPESampler(seed=42)
pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)


[I 2023-07-02 22:30:22,735] A new study created in memory with name: no-name-25a7727d-3047-4cb6-894d-d789ebe6e9ba


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-07-02 22:34:34,698] Trial 4 pruned. 
[I 2023-07-02 22:34:51,857] Trial 5 pruned. 
[I 2023-07-02 22:35:27,280] Trial 0 pruned. 
[I 2023-07-02 22:37:28,220] Trial 2 pruned. 
[I 2023-07-02 22:41:31,570] Trial 3 finished with value: 0.7149350322984679 and parameters: {'w1': 0.8121305060751821, 'w2': 0.368931669092014, 'w3': 0.7803238405132261, 'w4': 0.8587092568422987, 'w5': 0.8092576934642723, 'w6': 0.6393101936281794}. Best is trial 3 with value: 0.7149350322984679.
[I 2023-07-02 22:41:52,594] Trial 7 finished with value: 0.7148680342579115 and parameters: {'w1': 0.264550888662334, 'w2': 0.782040355693804, 'w3': 0.05322669877126618, 'w4': 0.5343927189924328, 'w5': 0.8216300398937468, 'w6': 0.6862951033166491}. Best is trial 3 with value: 0.7149350322984679.
[I 2023-07-02 22:42:09,276] Trial 8 pruned. 
[I 2023-07-02 22:42:32,505] Trial 9 pruned. 
[I 2023-07-02 22:42:54,755] Trial 11 pruned. 
[I 2023-07-02 22:43:17,375] Trial 10 pruned. 
[I 2023-07-02 22:43:37,820] Trial 1 pruned. 

[I 2023-07-03 00:14:51,028] Trial 79 finished with value: 0.7151340343857795 and parameters: {'w1': 0.5022071347443534, 'w2': 0.3759217126408302, 'w3': 0.32914775896450077, 'w4': 0.8836623374260081, 'w5': 0.8234182730263342, 'w6': 0.24085606690621703}. Best is trial 69 with value: 0.715174990264108.


In [73]:
study.optimize(objective, n_trials=200,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-07-02 04:38:00,850] Trial 100 pruned. 
[I 2023-07-02 04:38:08,061] Trial 106 pruned. 
[I 2023-07-02 04:38:09,229] Trial 101 pruned. 
[I 2023-07-02 04:38:11,691] Trial 107 pruned. 
[I 2023-07-02 04:38:15,301] Trial 103 pruned. 
[I 2023-07-02 04:38:19,809] Trial 105 pruned. 
[I 2023-07-02 04:38:22,367] Trial 104 pruned. 
[I 2023-07-02 04:38:36,718] Trial 102 finished with value: 0.5853678041685173 and parameters: {'n_estimators': 518, 'learning_rate': 0.01628925247457682, 'lambda': 0.026769843999572506, 'alpha': 0.0894977092439457, 'subsample': 0.46765780750256974, 'colsample_bytree': 0.4327735480894985, 'max_depth': 8, 'min_child_weight': 95, 'eta': 0.02023749129398292, 'gamma': 0.002521154936835588, 'scale_pos_weight': 76, 'grow_policy': 'depthwise'}. Best is trial 97 with value: 0.5864109260560468.
[I 2023-07-02 04:38:44,811] Trial 108 pruned. 
[I 2023-07-02 04:39:25,357] Trial 114 pruned. 
[I 2023-07-02 04:39:29,203] Trial 116 pruned. 
[I 2023-07-02 04:39:38,474] Trial 109 fi

KeyboardInterrupt: 

In [64]:
study.optimize(objective, n_trials=200,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-07-02 03:30:50,004] Trial 407 pruned. 
[I 2023-07-02 03:30:50,437] Trial 406 pruned. 
[I 2023-07-02 03:30:50,686] Trial 403 pruned. 
[I 2023-07-02 03:30:50,697] Trial 400 pruned. 
[I 2023-07-02 03:30:51,567] Trial 405 pruned. 
[I 2023-07-02 03:30:52,000] Trial 404 pruned. 
[I 2023-07-02 03:30:52,919] Trial 401 pruned. 
[I 2023-07-02 03:30:54,090] Trial 402 pruned. 
[I 2023-07-02 03:30:55,434] Trial 409 pruned. 
[I 2023-07-02 03:30:55,586] Trial 410 pruned. 
[I 2023-07-02 03:30:55,715] Trial 413 pruned. 
[I 2023-07-02 03:30:55,823] Trial 414 pruned. 
[I 2023-07-02 03:30:57,228] Trial 408 pruned. 
[I 2023-07-02 03:30:57,608] Trial 415 pruned. 
[I 2023-07-02 03:30:58,953] Trial 412 pruned. 
[I 2023-07-02 03:31:00,786] Trial 411 pruned. 
[I 2023-07-02 03:31:01,635] Trial 417 pruned. 
[I 2023-07-02 03:31:01,959] Trial 416 pruned. 
[I 2023-07-02 03:31:05,312] Trial 424 pruned. 
[I 2023-07-02 03:31:05,447] Trial 419 pruned. 
[I 2023-07-02 03:31:06,141] Trial 418 finished with value: 0

In [65]:
study.optimize(objective, n_trials=200,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-07-02 03:33:46,693] Trial 606 pruned. 
[I 2023-07-02 03:33:47,371] Trial 607 pruned. 
[I 2023-07-02 03:33:48,997] Trial 604 finished with value: 0.5850008369683112 and parameters: {'n_estimators': 178, 'learning_rate': 0.030165505879538184, 'num_leaves': 21, 'max_depth': 160, 'min_child_samples': 146, 'min_child_weight': 167, 'subsample': 0.8115293558151238, 'colsample_bytree': 0.20674850112515608, 'reg_alpha': 0.19803303233069153, 'reg_lambda': 0.3299965774581455}. Best is trial 574 with value: 0.5859887520781273.
[I 2023-07-02 03:33:49,162] Trial 601 pruned. 
[I 2023-07-02 03:33:49,747] Trial 602 pruned. 
[I 2023-07-02 03:33:49,987] Trial 603 finished with value: 0.5850541888412838 and parameters: {'n_estimators': 186, 'learning_rate': 0.029265592575978088, 'num_leaves': 24, 'max_depth': 162, 'min_child_samples': 138, 'min_child_weight': 165, 'subsample': 0.7930587422510882, 'colsample_bytree': 0.22565142990120426, 'reg_alpha': 0.19389117884739276, 'reg_lambda': 0.31870708867

In [66]:
study.optimize(objective, n_trials=200,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-07-02 03:36:45,685] Trial 805 pruned. 
[I 2023-07-02 03:36:45,933] Trial 802 pruned. 
[I 2023-07-02 03:36:47,942] Trial 807 pruned. 
[I 2023-07-02 03:36:48,644] Trial 801 pruned. 
[I 2023-07-02 03:36:50,709] Trial 800 pruned. 
[I 2023-07-02 03:36:50,738] Trial 809 pruned. 
[I 2023-07-02 03:36:51,634] Trial 811 pruned. 
[I 2023-07-02 03:36:53,225] Trial 810 pruned. 
[I 2023-07-02 03:36:53,523] Trial 803 finished with value: 0.5852656239222036 and parameters: {'n_estimators': 206, 'learning_rate': 0.028080283077030185, 'num_leaves': 23, 'max_depth': 152, 'min_child_samples': 155, 'min_child_weight': 168, 'subsample': 0.6480534852074774, 'colsample_bytree': 0.1993697854715269, 'reg_alpha': 0.80495864777286, 'reg_lambda': 0.07512886087805769}. Best is trial 635 with value: 0.5864747718200681.
[I 2023-07-02 03:36:53,944] Trial 804 finished with value: 0.5853074760230382 and parameters: {'n_estimators': 211, 'learning_rate': 0.027447568321228303, 'num_leaves': 178, 'max_depth': 122, 

In [67]:
study.optimize(objective, n_trials=200,n_jobs=-1,show_progress_bar=True)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-07-02 03:40:03,215] Trial 1004 pruned. 
[I 2023-07-02 03:40:03,411] Trial 1006 pruned. 
[I 2023-07-02 03:40:04,986] Trial 1005 pruned. 
[I 2023-07-02 03:40:05,796] Trial 1007 pruned. 
[I 2023-07-02 03:40:07,014] Trial 1000 pruned. 
[I 2023-07-02 03:40:08,107] Trial 1002 pruned. 
[I 2023-07-02 03:40:08,620] Trial 1009 pruned. 
[I 2023-07-02 03:40:09,869] Trial 1001 pruned. 
[I 2023-07-02 03:40:10,592] Trial 1011 pruned. 
[I 2023-07-02 03:40:10,998] Trial 1012 pruned. 
[I 2023-07-02 03:40:11,949] Trial 1010 pruned. 
[I 2023-07-02 03:40:12,174] Trial 1003 pruned. 
[I 2023-07-02 03:40:13,869] Trial 1014 pruned. 
[I 2023-07-02 03:40:16,160] Trial 1018 pruned. 
[I 2023-07-02 03:40:17,353] Trial 1019 pruned. 
[I 2023-07-02 03:40:17,767] Trial 1016 finished with value: 0.584716539240983 and parameters: {'n_estimators': 147, 'learning_rate': 0.027555979962876162, 'num_leaves': 154, 'max_depth': 147, 'min_child_samples': 127, 'min_child_weight': 165, 'subsample': 0.7969346276001489, 'col

In [31]:
# ## Hyper parameter tuning using optuna for lgbm

# import optuna
# from optuna.samplers import TPESampler
# from optuna.pruners import SuccessiveHalvingPruner
# from lightgbm import LGBMClassifier
# from optuna import Trial

# def objective(trial: Trial):
#     #params for lgbm
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5,log=True),
#         'num_leaves': trial.suggest_int('num_leaves', 10, 256),
#         'max_depth': trial.suggest_int('max_depth', 10, 100),
#         'min_child_samples': trial.suggest_int('min_child_samples', 2, 256),
#         'min_child_weight': trial.suggest_int('min_child_weight', 2, 256),
#         'subsample': trial.suggest_float('subsample', 0.1, 1.0,log=True),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0,log=True),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 0.99,log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 0.99,log=True),
#         'random_state': 42,
#         'n_jobs': -1,
#         'objective': 'binary',
#         'metric': 'auc',
#     }
#     model_obj=MultiOutputClassifier(LGBMClassifier(**params))
#     skf = KFold(n_splits=5,shuffle=True,random_state=42)
#     scores=[]
#     i=0
#     for train_index, test_index in skf.split(X, y):
#         X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
#         y_train, y_test = y.iloc[train_index,:], y.iloc[test_index,:]
#         # preprocess
#         X_train,y_train=preprocess(X_train,y_train)
#         # get pipeline
#         model=get_pipeline(model_obj)
#         # fit model
#         model.fit(X_train,y_train)
#         # predict
#         y_pred=model.predict_proba(X_test)
#         val_preds = np.array(y_pred)[:,:,1].T
#         score=roc_auc_score(y_test,val_preds)
#         scores.append(score)
#         trial.report(score, i)
#         if trial.should_prune():
#             raise optuna.TrialPruned()
#         i+=1
#     return np.mean(scores)


In [32]:
# sampler = optuna.samplers.TPESampler(seed=42)
# pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
# study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
# study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)


[I 2023-06-28 00:40:15,124] A new study created in memory with name: no-name-e48fa94d-890f-4e49-bea5-fdbde71053dd


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-06-28 00:40:34,094] Trial 3 pruned. 
[I 2023-06-28 00:40:35,493] Trial 5 pruned. 
[I 2023-06-28 00:40:39,133] Trial 4 pruned. 
[I 2023-06-28 00:40:50,586] Trial 2 pruned. 
[I 2023-06-28 00:41:02,869] Trial 0 finished with value: 0.6410358661189715 and parameters: {'n_estimators': 405, 'learning_rate': 0.009643984562230094, 'num_leaves': 158, 'max_depth': 55, 'min_child_samples': 7, 'min_child_weight': 124, 'subsample': 0.17567177463120887, 'colsample_bytree': 0.8079955728992779, 'reg_alpha': 0.0029323371798063184, 'reg_lambda': 0.3095981889431533}. Best is trial 0 with value: 0.6410358661189715.
[I 2023-06-28 00:41:05,977] Trial 9 pruned. 
[I 2023-06-28 00:41:12,111] Trial 6 pruned. 
[I 2023-06-28 00:41:16,092] Trial 1 pruned. 
[I 2023-06-28 00:41:17,576] Trial 8 pruned. 
[I 2023-06-28 00:41:23,208] Trial 13 pruned. 
[I 2023-06-28 00:41:31,970] Trial 7 pruned. 
[I 2023-06-28 00:41:40,765] Trial 11 pruned. 
[I 2023-06-28 00:41:43,937] Trial 14 pruned. 
[I 2023-06-28 00:41:46,358

In [33]:
study.best_params

{'n_estimators': 998,
 'learning_rate': 0.004565896907186115,
 'num_leaves': 119,
 'max_depth': 42,
 'min_child_samples': 139,
 'min_child_weight': 200,
 'subsample': 0.11498053779651443,
 'colsample_bytree': 0.24049548351554417,
 'reg_alpha': 0.0013920533392223111,
 'reg_lambda': 0.6212494114464403}

## Hyper parameter tuning using optuna

In [139]:
# # define the hyperparameter space for lgbm
# from optuna.samplers import TPESampler
# import optuna
# from lightgbm import LGBMClassifier
# from optuna import Trial, visualization
# from optuna.pruners import SuccessiveHalvingPruner
# from sklearn.model_selection import StratifiedKFold

# def objective(trial: Trial):
#     params={
#         'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
#         'max_leaves': trial.suggest_int('max_leaves', 10, 200),
#         'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e3),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1e3),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e3),
#         'num_leaves': trial.suggest_int('num_leaves', 10, 200),
#         'max_depth': trial.suggest_int('max_depth', 0, 20),
#         'random_state': 42,
#         'n_jobs': -1,
#     }
#     skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
#     scores=[]
#     i=0
#     for train_index, test_index in skf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#         model=LGBMClassifier(**params)
#         model.fit(X_train,y_train)
#         y_pred=model.predict_proba(X_test)[:,1]
#         trial.report(roc_auc_score(y_test,y_pred), i)
#         if trial.should_prune():
#             raise optuna.TrialPruned()
#         scores.append(roc_auc_score(y_test,y_pred))
#         i+=1
#     return np.mean(scores)

# sampler = optuna.samplers.TPESampler(seed=42)
# pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
# study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
# study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

In [140]:
# study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

In [141]:
    # # Define the hyperparameter space
    # params = {
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    #     'max_leaves': trial.suggest_int('max_leaves', 10, 200),
    #     'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e3),
    #     'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e0),
    #     'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    #     'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
    #     'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    #     'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1e3),
    #     'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e3),
    # }
    
    # model = XGBClassifier(**params,loss_function=auc_loss_func)
    
    # skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    # scores=[]
    # i=0
    # for train_index, test_index in skf.split(X, y):
    #     X_train, X_test = X[train_index], X[test_index]
    #     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #     model.fit(X_train,y_train)
    #     y_pred=model.predict_proba(X_test)[:,1]
    #     trial.report(roc_auc_score(y_test,y_pred), step=i)
    #     scores.append(roc_auc_score(y_test,y_pred))
    #     if trial.should_prune():
    #         raise optuna.TrialPruned()
    #     i+=1
    # return np.mean(scores)

In [142]:
# import optuna
# from optuna.pruners import SuccessiveHalvingPruner
# # tune the weights of the voting classifier
# def objective(trial: optuna.Trial):
#     w1 = trial.suggest_float("w1", 0.0, 1.0)
#     w2 = trial.suggest_float("w2", 0.0, 1.0)
#     w3 = trial.suggest_float("w3", 0.0, 1.0)
#     w4 = trial.suggest_float("w4", 0.0, 1.0)
#     w5 = trial.suggest_float("w5", 0.0, 1.0)
#     weights=[w1,w2,w3,w4,w5]
#     skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
#     scores=[]
#     i=0
#     for train_index, test_index in skf.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#         vc=VotingClassifier(estimators=estimators,n_jobs=-1,voting='soft',weights=weights)
#         vc.fit(X_train,y_train)
#         y_pred=vc.predict_proba(X_test)[:,1]
#         scores.append(roc_auc_score(y_test,y_pred))
#         i+=1
        
#     return np.mean(scores)
        

        
        
# sampler = optuna.samplers.TPESampler(seed=42)
# pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
# study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
# study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

## Submission

In [191]:
test=pd.read_csv('test.csv')

In [192]:
test.isnull().sum().sum()

0

In [193]:
test_id=test['id']
test_data=test.drop(['id'],axis=1)
#test_data=test.drop(['id','Product ID'],axis=1)

In [194]:
test_data=test_data[features]

In [195]:
# preprocess
# test_data,_=preprocess(test_data,None)

In [196]:
preds1=vc_l1.predict_proba(test_data)[:,1]
preds2=vc_l2.predict_proba(test_data)[:,1]

In [197]:
sub=pd.DataFrame({'id':test_id,'EC1':preds1,'EC2':preds2})

In [198]:
sub.to_csv('sub.csv',index=False)

In [199]:
sub['EC1'].max()

0.9649703118426022

In [200]:
!kaggle competitions submit -c playground-series-s3e18 -f sub.csv -m "lgbm"

Successfully submitted to Explore Multi-Label Classification with an Enzyme Substrate Dataset



  0%|          | 0.00/432k [00:00<?, ?B/s]
  2%|▏         | 8.00k/432k [00:00<00:38, 11.3kB/s]
 22%|██▏       | 96.0k/432k [00:00<00:02, 151kB/s] 
 33%|███▎      | 144k/432k [00:00<00:01, 200kB/s] 
 50%|████▉     | 216k/432k [00:01<00:00, 298kB/s]
 65%|██████▍   | 280k/432k [00:01<00:00, 362kB/s]
 80%|███████▉  | 344k/432k [00:01<00:00, 427kB/s]
100%|██████████| 432k/432k [00:02<00:00, 178kB/s]
