In [1]:
# Timing Libraries :
import time

# Warning Libraries :
import warnings
warnings.filterwarnings("ignore")

# Scientific and Data Manipulation Libraries :
import pandas as pd
import numpy as np
import math
import gc
import os

# NLP Libraries :
import nltk
from nltk.tokenize                    import word_tokenize
from nltk                             import pos_tag
from nltk.corpus                      import wordnet as wn
from nltk.corpus                      import stopwords
from nltk.stem                        import WordNetLemmatizer
from collections                      import defaultdict
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

# ML Libraries :
from sklearn.externals                import joblib
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection          import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model             import MultiTaskElasticNet, ElasticNet, Lasso, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, LogisticRegression, LinearRegression
from sklearn.neighbors                import KNeighborsClassifier
from sklearn.svm                      import SVC
from sklearn.tree                     import DecisionTreeClassifier
from sklearn.ensemble                 import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes              import MultinomialNB, GaussianNB, ComplementNB
from sklearn.discriminant_analysis    import LinearDiscriminantAnalysis
from sklearn.metrics                  import f1_score, accuracy_score, precision_score , recall_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.decomposition            import IncrementalPCA
from statsmodels.stats.proportion     import proportion_confint

# Boosting Algorithms :
from xgboost                          import XGBClassifier
from catboost                         import CatBoostClassifier
from lightgbm                         import LGBMClassifier

# Data Visualization Packages :
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Set Location path :
import os
os.chdir("C:/Users/LD196YS/Desktop/0.                          Winning_Kaggle/0. HackerEarth Hackathon/")

In [3]:
#read data

train = pd.read_csv('1. Data/Train.csv')
train.drop(["INCIDENT_ID"],axis = 1,inplace=True)

test = pd.read_csv('1. Data/Test.csv')
test_ID = test["INCIDENT_ID"]
test.drop(["INCIDENT_ID"],axis = 1,inplace=True)

sub = pd.read_csv('1. Data/submission.csv')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23856 entries, 0 to 23855
Data columns (total 17 columns):
DATE                23856 non-null object
X_1                 23856 non-null int64
X_2                 23856 non-null int64
X_3                 23856 non-null int64
X_4                 23856 non-null int64
X_5                 23856 non-null int64
X_6                 23856 non-null int64
X_7                 23856 non-null int64
X_8                 23856 non-null int64
X_9                 23856 non-null int64
X_10                23856 non-null int64
X_11                23856 non-null int64
X_12                23674 non-null float64
X_13                23856 non-null int64
X_14                23856 non-null int64
X_15                23856 non-null int64
MULTIPLE_OFFENSE    23856 non-null int64
dtypes: float64(1), int64(15), object(1)
memory usage: 3.1+ MB


In [5]:
d1=train.isnull().sum().to_frame().rename(columns={0: "Train_Null_Values"})
d2=test.isnull().sum().to_frame().rename(columns={0: "Test_Null_Values"})
d3=train.dtypes.to_frame().rename(columns={0: "Data_Type"})
table = pd.concat([d1, d2,d3], axis=1)
display(table)

Unnamed: 0,Train_Null_Values,Test_Null_Values,Data_Type
DATE,0,0.0,object
MULTIPLE_OFFENSE,0,,int64
X_1,0,0.0,int64
X_10,0,0.0,int64
X_11,0,0.0,int64
X_12,182,127.0,float64
X_13,0,0.0,int64
X_14,0,0.0,int64
X_15,0,0.0,int64
X_2,0,0.0,int64


In [6]:
print(train.shape)

(23856, 17)


In [7]:
# dropping duplicate values 
train.drop_duplicates(keep=False,inplace=True) 
print(train.shape)

(23846, 17)


In [8]:
train = train.ffill(axis = 0)
# train = train.bfill(axis = 0)

test = test.ffill(axis = 0)
# test = test.bfill(axis = 0)

In [9]:
d1=train.isnull().sum().to_frame().rename(columns={0: "Train_Null_Values"})
d2=test.isnull().sum().to_frame().rename(columns={0: "Test_Null_Values"})
d3=train.dtypes.to_frame().rename(columns={0: "Data_Type"})
table = pd.concat([d1, d2,d3], axis=1)
display(table)

Unnamed: 0,Train_Null_Values,Test_Null_Values,Data_Type
DATE,0,0.0,object
MULTIPLE_OFFENSE,0,,int64
X_1,0,0.0,int64
X_10,0,0.0,int64
X_11,0,0.0,int64
X_12,0,1.0,float64
X_13,0,0.0,int64
X_14,0,0.0,int64
X_15,0,0.0,int64
X_2,0,0.0,int64


In [10]:
train.columns

Index(['DATE', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9',
       'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15', 'MULTIPLE_OFFENSE'],
      dtype='object')

In [11]:
X_train = train[[ 'DATE', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7',
       'X_8', 'X_9', 'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15']]
y_train = train['MULTIPLE_OFFENSE']
y_train = y_train.to_frame()

X_test = test[[ 'DATE', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7',
       'X_8', 'X_9', 'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15' ]]

# # Binning :
# X_train['age'] = pd.cut( x=X_train['age'], bins=[20, 29, 39, 49], labels=['20', '30', '40'] )
# X_test['age']  = pd.cut(x=X_test['age'], bins=[20, 29, 39, 49], labels=['20', '30', '40'] )

In [12]:
X_test.shape

(15903, 16)

In [13]:
# plt.figure(figsize=(24, 6))
# plt.subplot(121)
# sns.distplot(train["avg_training_score"])
# plt.subplot(122)
# train["avg_training_score"] = np.log1p(train["avg_training_score"])
# sns.distplot(train["avg_training_score"])
# plt.show()

In [14]:
def data_encoding( encoding_strategy , encoding_data , encoding_columns ):
    
    if encoding_strategy == "LabelEncoding":
        print("IF LabelEncoding")
        Encoder = LabelEncoder()
        for column in encoding_columns :
            print("column",column )
            encoding_data[ column ] = Encoder.fit_transform(tuple(encoding_data[ column ]))
        
    elif encoding_strategy == "OneHotEncoding":
        print("ELIF OneHotEncoding")
        encoding_data = pd.get_dummies(encoding_data)
        
    elif encoding_strategy == "TargetEncoding":
        print("ELIF TargetEncoding")
        ## Code Coming soon
        print("TargetEncoding")

    else :
        print("ELSE OneHotEncoding")
        encoding_data = pd.get_dummies(encoding_data)
        
    dtypes_list =['float64','float32','int64','int32']
    encoding_data.astype( dtypes_list[0] ).dtypes
    
    return encoding_data

In [15]:
encoding_columns  = [ "DATE" ]
encoding_strategy = [ "LabelEncoding", "OneHotEncoding", "TargetEncoding", "ELSE" ]

X_train_encode = data_encoding( encoding_strategy[0] , X_train , encoding_columns )
X_test_encode =  data_encoding( encoding_strategy[0] , X_test  , encoding_columns )

display(X_train_encode.shape)
display(X_test_encode.shape)

display(X_train_encode.head())
display(X_test_encode.head())

IF LabelEncoding
column DATE
IF LabelEncoding
column DATE


(23846, 16)

(15903, 16)

Unnamed: 0,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,1032,0,36,34,2,1,5,6,1,6,1,174,1.0,92,29,36
1,5267,1,37,37,0,0,11,17,1,6,1,236,1.0,103,142,34
2,4426,0,3,2,3,5,1,0,2,3,1,174,1.0,110,93,34
3,3703,0,33,32,2,1,7,1,1,6,1,249,1.0,72,29,34
4,3622,0,33,32,2,1,8,3,0,5,1,174,0.0,112,29,43


Unnamed: 0,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,83,0,30,35,7,3,6,4,0,5,1,174,,72,119,23
1,1220,0,44,44,1,3,7,1,4,6,1,316,0.0,12,29,34
2,6891,0,34,33,3,5,2,7,3,0,1,316,1.0,72,0,34
3,4491,7,3,2,3,5,9,8,0,5,1,174,1.0,112,87,34
4,7926,0,7,8,7,3,2,7,1,5,1,174,0.0,112,93,43


In [16]:
def data_scaling( scaling_strategy , scaling_data , scaling_columns ):
    
    if    scaling_strategy =="RobustScaler" :
        scaling_data[scaling_columns] = RobustScaler().fit_transform(scaling_data[scaling_columns])
        
    elif  scaling_strategy =="StandardScaler" :
        scaling_data[scaling_columns] = StandardScaler().fit_transform(scaling_data[scaling_columns])
        
    elif  scaling_strategy =="MinMaxScaler" :
        scaling_data[scaling_columns] = MinMaxScaler().fit_transform(scaling_data[scaling_columns])
        
    elif  scaling_strategy =="MaxAbsScaler" :
        scaling_data[scaling_columns] = MaxAbsScaler().fit_transform(scaling_data[scaling_columns])
        
    else :  # If any other scaling send by mistake still perform Robust Scalar
        scaling_data[scaling_columns] = RobustScaler().fit_transform(scaling_data[scaling_columns])
    
    return scaling_data

In [26]:
scaling_strategy = ["RobustScaler", "StandardScaler","MinMaxScaler","MaxAbsScaler"]
X_train_scale = data_scaling( scaling_strategy[1] , X_train_encode , X_train_encode.columns )
X_test_scale = data_scaling( scaling_strategy [1] , X_test_encode , X_test_encode.columns )

display(X_train_scale.shape)
display(X_test_scale.shape)

display(X_train_scale.head())
display(X_test_scale.head())

(23846, 16)

(15903, 16)

Unnamed: 0,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,-1.330017,-0.335925,0.735687,0.618785,-0.773555,-0.741818,-0.257957,0.289528,0.018727,0.789433,-0.218763,-0.354138,0.022551,0.245213,-1.008681,0.302579
1,0.277142,0.359126,0.801297,0.816979,-1.452748,-1.251202,1.083838,3.123078,0.018727,0.789433,-0.218763,0.312219,0.022551,0.643755,1.600982,0.064116
2,-0.042013,-0.335925,-1.429435,-1.495276,-0.433959,1.295719,-1.152487,-1.256046,0.706812,-1.411786,-0.218763,-0.354138,0.022551,0.897372,0.469358,0.064116
3,-0.316387,-0.335925,0.538858,0.486657,-0.773555,-0.741818,0.189308,-0.99845,0.018727,0.789433,-0.218763,0.451939,0.022551,-0.479407,-1.008681,0.064116
4,-0.347126,-0.335925,0.538858,0.486657,-0.773555,-0.741818,0.412941,-0.483259,-0.669358,0.055694,-0.218763,-0.354138,-0.835101,0.969834,-1.008681,1.137199


Unnamed: 0,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,-1.675947,-0.331491,0.346549,0.689163,0.913022,0.280919,-0.019072,-0.223796,-0.670686,0.066374,-0.286551,-0.353946,,-0.47981,1.076998,-1.25362
1,-1.180836,-0.331491,1.265886,1.284127,-1.10413,0.280919,0.205598,-1.002121,2.048132,0.793841,-0.286551,1.171216,-1.109829,-2.662237,-0.994975,0.069446
2,1.288621,-0.331491,0.609216,0.556949,-0.431746,1.299606,-0.917751,0.55453,1.368427,-3.570957,-0.286551,1.171216,0.031582,-0.47981,-1.66261,0.069446
3,0.243532,4.625778,-1.426459,-1.49237,-0.431746,1.299606,0.654937,0.813972,-0.670686,0.066374,-0.286551,-0.353946,0.031582,0.975141,0.340297,0.069446
4,1.739315,-0.331491,-1.163791,-1.095728,0.913022,0.280919,-0.917751,0.55453,0.009018,0.066374,-0.286551,-0.353946,-1.109829,0.975141,0.478428,1.151954


In [27]:
Classifiers = {
                 '1.1_XGBoost_Version_33'     : XGBClassifier(learning_rate =0.1, n_estimators=494, max_depth=5,subsample = 0.70, 
                                                              scale_pos_weight = 2.5,updater ="grow_histmaker",base_score  = 0.2,
                                                              silent=True),
    
   '0.1_XGBoost_Version_33'     :  XGBClassifier(base_score=0.5, gamma=0.3, learning_rate=0.1, max_delta_step=0, max_depth=5,
        missing=None, n_estimators=494, nthread=15,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2.5,  silent=True, subsample=1),
                 
#                  '1.2_XGBoost_Version_34'     : XGBClassifier(learning_rate=0.1, n_estimators=494, max_depth=4, subsample=0.8,
#                                                               scale_pos_weight=2.5,min_child_weight=7, gamma=0.4, nthread=4,  
#                                                               colsample_bytree=0.8),
    
                 '2.1_CatBoost_Version_10': CatBoostClassifier(silent=True, learning_rate=0.15, n_estimators=494, subsample=0.085, max_depth=5, scale_pos_weight=2.5,
                                                                 random_strength= None, 
                                                                                              min_data_in_leaf=None,max_bin =None,l2_leaf_reg=None,bagging_temperature=None,depth=None,model_size_reg=None,rsm=None,loss_function=None,border_count=None,feature_border_type=None,per_float_feature_quantization=None,                         
                                                                                              input_borders=None,output_borders=None,fold_permutation_block=None,od_pval=None,od_wait=None,nan_mode=None,counter_calc_method=None,leaf_estimation_iterations=None,
                                                                                              leaf_estimation_method=None,thread_count=None,verbose=None,logging_level=None,metric_period=None,ctr_leaf_count_limit=None,store_all_simple_ctr=None,
                                                                                              max_ctr_complexity=None,has_time=None,allow_const_label=None,classes_count=None,class_weights=None,one_hot_max_size=None,name=None,ignored_features=None,train_dir=None,
                                                                                              custom_loss=None,custom_metric=None,eval_metric=None,save_snapshot=None,snapshot_file=None,snapshot_interval=None,fold_len_multiplier=None,
                                                                                              used_ram_limit=None,gpu_ram_part=None,allow_writing_files=None,final_ctr_computation_mode=None,approx_on_full_history=None,boosting_type=None,simple_ctr=None,combinations_ctr=None,per_feature_ctr=None,
                                                                                              task_type=None,device_config=None,devices=None,bootstrap_type=None,sampling_unit=None,dev_score_calc_obj_block_size=None,num_boost_round=None,
                                                                                              num_trees=None,colsample_bylevel=None,random_state=None,reg_lambda=None,objective=None,eta=None,gpu_cat_features_storage=None,data_partition=None,
                                                                                              metadata=None,cat_features=None,min_child_samples=None,max_leaves=None,num_leaves=None,score_function=None,
                                                                                              leaf_estimation_backtracking=None,ctr_history_unit=None,monotone_constraints=None,feature_weights=None,penalties_coefficient=None,first_feature_use_penalties=None,model_shrink_rate=None,
                                                                                              model_shrink_mode=None,langevin=None,diffusion_temperature=None,boost_from_average=None,text_features=None,tokenizers=None,dictionaries=None,feature_calcers=None,text_processing=None),
               
                '2.2_CatBoost_Version_10': CatBoostClassifier(silent=True, learning_rate=0.15, n_estimators=494, subsample=0.085, max_depth=5, scale_pos_weight=2.5,
                                                                 random_strength= 0.157, 
                                                                                              min_data_in_leaf=None,max_bin =None,l2_leaf_reg=None,bagging_temperature=None,depth=None,model_size_reg=None,rsm=None,loss_function=None,border_count=None,feature_border_type=None,per_float_feature_quantization=None,                         
                                                                                              input_borders=None,output_borders=None,fold_permutation_block=None,od_pval=None,od_wait=None,nan_mode=None,counter_calc_method=None,leaf_estimation_iterations=None,
                                                                                              leaf_estimation_method=None,thread_count=None,verbose=None,logging_level=None,metric_period=None,ctr_leaf_count_limit=None,store_all_simple_ctr=None,
                                                                                              max_ctr_complexity=None,has_time=None,allow_const_label=None,classes_count=None,class_weights=None,one_hot_max_size=None,name=None,ignored_features=None,train_dir=None,
                                                                                              custom_loss=None,custom_metric=None,eval_metric=None,save_snapshot=None,snapshot_file=None,snapshot_interval=None,fold_len_multiplier=None,
                                                                                              used_ram_limit=None,gpu_ram_part=None,allow_writing_files=None,final_ctr_computation_mode=None,approx_on_full_history=None,boosting_type=None,simple_ctr=None,combinations_ctr=None,per_feature_ctr=None,
                                                                                              task_type=None,device_config=None,devices=None,bootstrap_type=None,sampling_unit=None,dev_score_calc_obj_block_size=None,num_boost_round=None,
                                                                                              num_trees=None,colsample_bylevel=None,random_state=None,reg_lambda=None,objective=None,eta=None,gpu_cat_features_storage=None,data_partition=None,
                                                                                              metadata=None,cat_features=None,min_child_samples=None,max_leaves=None,num_leaves=None,score_function=None,
                                                                                              leaf_estimation_backtracking=None,ctr_history_unit=None,monotone_constraints=None,feature_weights=None,penalties_coefficient=None,first_feature_use_penalties=None,model_shrink_rate=None,
                                                                                              model_shrink_mode=None,langevin=None,diffusion_temperature=None,boost_from_average=None,text_features=None,tokenizers=None,dictionaries=None,feature_calcers=None,text_processing=None),
#                 '3.1_LightGBM_Verion_WINNING': LGBMClassifier(boosting_type='dart',
#                        max_depth=5, scale_pos_weight=2.5,
#                        learning_rate=0.05,
#                        n_estimators=5000,
#                        min_child_weight=0.01,
#                        colsample_bytree=0.5,
#                        random_state=1994),
               '3.1_LightGBM_Version_1.1' : LGBMClassifier(  subsample_freq = 2, objective ="binary",importance_type = "gain",verbosity = -1, max_bin = 60,num_leaves = 300,boosting_type = 'dart',learning_rate=0.15, n_estimators=494, max_depth=5, scale_pos_weight=2.5),
               '3.2_LightGBM_Version_2.1' : LGBMClassifier(  bagging_fraction=0.9, feature_fraction=0.9, subsample_freq = 2, objective ="binary",importance_type = "gain",verbosity = -1, max_bin = 60,num_leaves = 300,boosting_type = 'dart',learning_rate=0.15, n_estimators=494, max_depth=5, scale_pos_weight=2.5),

#  'feature_fraction': 0.5004666960515116,
#  'lambda_l2': 0.022577930769472343,
#  'min_data_in_leaf': 99,
#  'num_leaves': 13
    
               '4.1_GradBoost'            : GradientBoostingClassifier(min_samples_split= 4,max_depth=5, n_estimators=1000, subsample=0.70)
              }

print( list(Classifiers.keys()) )
print( list(Classifiers.values()) )

['1.1_XGBoost_Version_33', '0.1_XGBoost_Version_33', '2.1_CatBoost_Version_10', '2.2_CatBoost_Version_10', '3.1_LightGBM_Version_1.1', '3.2_LightGBM_Version_2.1', '4.1_GradBoost']
[XGBClassifier(base_score=0.2, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=494, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=2.5, silent=True, subsample=0.7,
              tree_method=None, updater='grow_histmaker',
              validate_parameters=False, verbosity=None), XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsa

In [28]:
# Public Leaderboard F1 Score : 0.533762057877814
# -------------------------------------------------
ensemble_model = VotingClassifier(estimators=[
#                                               ('XGBoost_Best_1', list(Classifiers.values())[0]), 
#                                               ('XGBoost_Best_2', list(Classifiers.values())[1]),
                                              ('CatBoost_Best_1', list(Classifiers.values())[2]),
                                              ('CatBoost_Best_2', list(Classifiers.values())[3]), 
                                              ('LightGBM_1', list(Classifiers.values())[4]),
                                              ('LightGBM_2', list(Classifiers.values())[5]),
                                             ], 
                                              voting='hard'
#                                   ,weights=[5,5,5]
                                 )

# Public Leaderboard F1 Score : 0.513816280806572
# -----------------------------------------------
# ensemble_model = XGBClassifier    (   learning_rate =0.1, n_estimators=494, max_depth=5,subsample = 0.70, 
#                                       scale_pos_weight = 2.5,updater ="grow_histmaker",base_score  = 0.2,
#                                       silent=True
#                                   )
# Public Leaderboard F1 Score : 0.5273311897106109
# ------------------------------------------------
# ensemble_model = CatBoostClassifier(  silent=True, learning_rate=0.15, n_estimators=494, subsample=0.085, 
#                                         max_depth=5, scale_pos_weight=2.5 
#                                    )
# Public Leaderboard F1 Score : 0.5196324143692564
# -----------------------------------------------
# ensemble_model = LGBMClassifier   (   subsample_freq = 2, objective ="binary",importance_type = "gain",verbosity = -1, 
#                                         max_bin = 60,num_leaves = 300,boosting_type = 'dart',
#                                         learning_rate=0.15, n_estimators=494, max_depth=5, scale_pos_weight=2.5
#                                      )

In [29]:
# ensemble_model = list(Classifiers.values())[4]

file_name = "2_3_4_5"
model = ensemble_model.fit(X_train_scale,y_train)
# Predicted_predictions_model = model.predict_proba( X_test_scale )[::,1]

Predicted_predictions_model = model.predict( X_test_scale )

predictions = [int(round(value)) for value in Predicted_predictions_model]
Result_Promoted = pd.DataFrame({'INCIDENT_ID': test_ID, 'MULTIPLE_OFFENSE' : predictions})
pd.DataFrame(Result_Promoted).to_csv(r"1. Final Data/2. Output/Predicted_"+file_name+".csv",index=False)

In [21]:
STOP !!!!

SyntaxError: invalid syntax (<ipython-input-21-47ffef4c871a>, line 1)

In [None]:
# Set up folds
K = 10
kf = KFold(n_splits = K, random_state = 294, shuffle = True)
skf = StratifiedKFold(n_splits = K, random_state = 294, shuffle = True)

In [None]:
X = X_train_scale
y = y_train

X_test = X_test_scale
y_valid_pred = 0*y
y_test_pred = 0
recall = 0
result={}
#specifying categorical variables indexes
# cat_columns = ["department", "region", "education", "gender", "recruitment_channel"]
#fitting catboost classifier model
j=1

model = ensemble_model

# for i, (train_index, test_index) in enumerate(kf.split(train)):

for train_index, test_index in kf.split(X, y):  
    
    if j in [1,2,3,4,5,6,7,8,9,10]:
        # Create data for this fold
        y_train_cv, y_valid_cv = y.iloc[train_index], y.iloc[test_index]
        X_train_cv, X_valid_cv = X.iloc[train_index,:], X.iloc[test_index,:]
        print( "\nFold ", j)
        #print( "\nFold ", i)

        # Run model for this fold
    #     if OPTIMIZE_ROUNDS:
    #         fit_model = model.fit( X_train_cv, y_train_cv, 
    #                                eval_set=[X_valid_cv, y_valid_cv]
    # #                                use_best_model=True
    #                              )
    #         print( "  N trees = ", model.tree_count_ )
    #     else:

        fit_model = model.fit( X_train_cv, y_train_cv )

        # Generate validation predictions for this fold
        pred = fit_model.predict(X_valid_cv)
        y_valid_pred.iloc[test_index] = pred.reshape(-1,1)
        print(recall_score(y_valid_cv,pred))
        recall+=recall_score(y_valid_cv,pred)
        # Accumulate test set predictions
        y_test_pred += fit_model.predict(X_test)
        result[j]=fit_model.predict(X_test)
    j+=1
results = y_test_pred / K  # Average test set predictions


In [None]:
# print(result)
d = pd.DataFrame()
for i in [1,2,3,4,5,6,7,8,9,10]:
    d = pd.concat([d,pd.DataFrame(result[i])],axis=1)
    
# d.columns=['2','4','6','9','10']
d.columns=['1','2','3','4','5','6','7','8','9','10']

predictions = d.mode(axis=1)[0]
file_name = "10_CV.csv"
Result_Promoted = pd.DataFrame({'INCIDENT_ID': test_ID, 'MULTIPLE_OFFENSE' : predictions})
pd.DataFrame(Result_Promoted).to_csv(r"1. Final Data/2. Output/Predicted_"+file_name+".csv",index=False)

In [None]:
X_CV_train, X_CV_val, y_CV_train, y_val = train_test_split(X_train_scale, y_train, test_size=0.9, 
                                                    random_state=294,shuffle=True)

In [None]:
def xgb_f1(y, t):
    t = t.get_label()
    y_bin = [1. if y_cont < 0.5 else 0. for y_cont in y]
    return 'f1', f1_score(t, y_bin)

In [None]:
ts = time.time()

ensemble_model = XGBClassifier    (   learning_rate =0.1, n_estimators=10000, max_depth=5,subsample = 0.70, 
                                      scale_pos_weight = 2.5,updater ="grow_histmaker",base_score  = 0.2,
                                      silent=True
                                  )

ensemble_model = ensemble_model.fit(
                                    X_CV_train,  y_CV_train,
                                    eval_metric=xgb_f1, 
                                    eval_set=[(X_CV_train,  y_CV_train), ( X_CV_val, y_val )], 
                                    verbose=True, 
                                    early_stopping_rounds = 100)

best_estimators=ensemble_model.best_iteration

print(best_estimators)

ensemble_model = XGBClassifier(
    
    n_estimators = best_estimators,
    
    learning_rate =0.1, max_depth=5,subsample = 0.70, 
    scale_pos_weight = 2.5,updater ="grow_histmaker",base_score  = 0.2,
    silent=True
    )

time.time() - ts

In [None]:
ensemble_model = XGBClassifier    (   
                                      learning_rate =0.1, n_estimators=306, max_depth=5, subsample = 0.70, 
                                      scale_pos_weight = 2.5, updater ="grow_histmaker", base_score  = 0.2,
                                      silent=True
                                  )

In [None]:
OR - STOP !!!

In [None]:
Voting_ensemble_model = VotingClassifier(estimators=[
                                              ('XGBoost_Best', list(Classifiers.values())[0]), 
                                              ('CatBoost_Best', list(Classifiers.values())[2]),
                                              ('LightGBM_1', list(Classifiers.values())[4]),
                                             ], 
                                              voting='soft',weights=[5,5,5.2])

In [None]:
MAX_ROUNDS = 1000
OPTIMIZE_ROUNDS = True
#LEARNING_RATE = 0.1

In [None]:
# X = X_train_scale
# y = y_train
# X_test = X_test_scale
# y_valid_pred = 0*y
# y_test_pred = 0
# f1score = 0
# result={}
# #specifying categorical variables indexes
# cat_columns = ["department", "region", "education", "gender", "recruitment_channel"]
# #fitting catboost classifier model
# j=1

# # model = CatBoostClassifier(n_estimators=MAX_ROUNDS,verbose=False)
# model = CatBoostClassifier(learning_rate=0.15, n_estimators=494, subsample=0.085, max_depth=5, scale_pos_weight=2.5,verbose=False)

# for i, (train_index, test_index) in enumerate(kf.split(X_train_scale)):

# #for train_index, test_index in skf.split(X, y):  
#     # Create data for this fold
#     y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
#     X_train, X_valid = X.iloc[train_index,:], X.iloc[test_index,:]
#     print( "\nFold ", j)
#     #print( "\nFold ", i)
    
#     # Run model for this fold
#     if OPTIMIZE_ROUNDS:
#         fit_model = model.fit( X_train, y_train, 
#                                eval_set=[X_valid, y_valid],cat_features=cat_columns,
#                                use_best_model=True
#                              )
#         print( "  N trees = ", model.tree_count_ )
#     else:
#         fit_model = model.fit( X_train, y_train,cat_features=cat_columns )
        
#     # Generate validation predictions for this fold
#     pred = fit_model.predict(X_valid)
#     y_valid_pred.iloc[test_index] = pred.reshape(-1)
#     print(f1_score(y_valid,pred))
#     f1score+=f1_score(y_valid,pred)
#     # Accumulate test set predictions
#     y_test_pred += fit_model.predict(X_test)
#     result[j]=fit_model.predict(X_test)
#     j+=1
# results = y_test_pred / K  # Average test set predictions
# print(f1score/5)