# Random Forest
## Hyper-parameter Tunning

In [7]:
import warnings
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
warnings.filterwarnings('ignore')

df_train = pd.read_csv('Final_Train_dataset.csv')
X_train  = df_train.iloc[:,:-1]
y_train  = df_train.VirusDetected

In [2]:
# Specify different values for the tunning process
kfold             = KFold(n_splits = 5, random_state = None, shuffle = False)

n_estimators      = [int(x) for x in np.linspace(start = 50, stop = 150, num = 5)] 
max_features      = ['auto', 'sqrt'] 
criterion         = ['gini','entropy']
max_depth         = [int(x) for x in np.linspace(5, 30, 6)] 
min_samples_split = [int(x) for x in np.linspace(2, 20, 6)] 
min_samples_leaf  = [int(x) for x in np.linspace(1, 20, 6)] 

#Create parameter grid
random_grid ={'n_estimators'     :n_estimators,
              'max_features'     :max_features,
              'criterion'        :criterion,
              'max_depth'        :max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf' :min_samples_leaf}

#Create Random Forest object
rf = RandomForestClassifier()

#Grid Search CV
rf_search = GridSearchCV(rf,
                         random_grid, 
                         scoring    = 'neg_mean_squared_error', 
                         cv         = kfold, 
                         n_jobs     = -1)

In [3]:
from datetime import datetime

# This function returns the time 
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
start_time = timer(None) # timing starts from this point for "start_time" variable
rf_search.fit(X_train,y_train)
timer(start_time) # timing ends here for "start_time" variable


 Time taken: 0 hours 13 minutes and 7.81 seconds.


In [4]:
rf_search.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

* Specify optimal model 

In [5]:
optimal_model = RandomForestClassifier(n_estimators     = 50,
                                      min_samples_split = 2,
                                      min_samples_leaf  = 1,
                                      max_features      = 'auto',
                                      max_depth         = 5,
                                      criterion         = 'gini').fit(X_train, y_train)