# XGBoost Classifier
## Hyper-parameter Tuning

In [1]:
import warnings
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
warnings.filterwarnings('ignore')

# Load the training set and split it into features and target.
df_train = pd.read_csv('Final_Train_dataset.csv')

# Select the features by dropping the target by name rather than by position,
# so the target can never leak into X_train if it is not the last column.
# NOTE(review): this is identical to the previous `iloc[:, :-1]` only when
# 'VirusDetected' is the last column of the CSV - confirm against the file.
X_train  = df_train.drop(columns='VirusDetected')
y_train  = df_train['VirusDetected']

In [2]:
# Hyper-parameter tuning setup: CV scheme, candidate values, and the grid search object.

# Cross-validation scheme: 3 stratified folds, repeated 2 times, with a fixed seed.
# Note: despite its name, this variable holds a RepeatedStratifiedKFold instance.
StratifiedKFold = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)

# Candidate values for every tuned hyper-parameter.
# n_estimators is currently left out of the grid (a 50-value range between
# 50 and 250 was sketched earlier), so the classifier's own default is used.
learning_rate    = [0.05, 0.2, 0.25, 0.3, 0.45, 0.5]
max_depth        = np.linspace(1, 10, num=5).astype(int).tolist()  # truncated to ints
min_child_weight = [1, 3, 5]
gamma            = [0.0, 0.1, 0.3]
reg_lambda       = [0, 1, 5]
colsample_bytree = [0.3, 0.5, 0.7]

# Parameter grid: every combination of the lists above is evaluated.
XGB_params = dict(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    gamma=gamma,
    reg_lambda=reg_lambda,
    colsample_bytree=colsample_bytree,
)

# Base estimator whose hyper-parameters are tuned.
XGB_model = XGBClassifier()

# Exhaustive grid search scored by accuracy, using all available cores.
XGB_search = GridSearchCV(
    estimator=XGB_model,
    param_grid=XGB_params,
    scoring='accuracy',
    cv=StratifiedKFold,
    n_jobs=-1,
    verbose=True,
)

In [3]:
from datetime import datetime

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since ``start_time``.

    Args:
        start_time: A ``datetime`` previously returned by this function, or
            ``None`` (any falsy value) to start timing.

    Returns:
        The current ``datetime`` when called without a start mark; ``None``
        when called with one (the elapsed time is printed instead).
    """
    if start_time:
        # Break the elapsed seconds down into hours, minutes and seconds.
        elapsed_sec = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_sec, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None

    # No start mark supplied: hand back "now" so the caller can pass it in later.
    return datetime.now()
        
# Time the grid search: take a start mark, run the fit, then print the elapsed time.
start_time = timer()
XGB_search.fit(X_train, y_train)
timer(start_time)

Fitting 6 folds for each of 2430 candidates, totalling 14580 fits

 Time taken: 0 hours 2 minutes and 32.71 seconds.


In [4]:
# Display the parameter combination that the grid search selected as best
# (the search is scored with 'accuracy').
XGB_search.best_params_

{'colsample_bytree': 0.3,
 'gamma': 0.0,
 'learning_rate': 0.05,
 'max_depth': 1,
 'min_child_weight': 1,
 'reg_lambda': 0}

- Specify the optimal model using the best hyper-parameters found by the grid search.

In [5]:
# Refit the classifier on the training data with the hyper-parameters selected
# by the grid search. All six tuned values are passed explicitly: XGBoost's
# default for reg_lambda is 1, so leaving it out would train a different model
# from the one the search selected (reg_lambda = 0).
optimal_model = XGBClassifier(colsample_bytree  = 0.3,
                              gamma             = 0.0,
                              learning_rate     = 0.05,
                              max_depth         = 1,
                              min_child_weight  = 1,
                              reg_lambda        = 0).fit(X_train, y_train)