## Implementing hyperparameter tuning 

In [4]:
# Core data-handling libraries.
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
import warnings
# Install a blanket "ignore" filter for the `warnings` module.
# NOTE(review): this is set before the scikit-learn imports below, so it also
# hides any warnings those libraries would otherwise emit.
warnings.filterwarnings('ignore')

# scikit-learn pieces used for encoding, splitting and modelling.
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Presumably the directory that holds the processed data.
# NOTE(review): `path` is assigned but the reads below use bare file names,
# so the CSV files are resolved relative to the current working directory.
path = "../../Data Processing/Data/"

# Load the three input tables in one pass (same order as before).
numerical, categorical, targets = (
    pd.read_csv(file_name)
    for file_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [6]:
# SMOTE works on numerical data only, so the categorical variables are one-hot encoded first.

In [7]:
# One-hot encode the categorical features. `drop='first'` omits the first
# level of every variable, so each variable contributes (n_levels - 1) columns.
# `OneHotEncoder` is already imported in the imports cell at the top.
encoder = OneHotEncoder(drop='first').fit(categorical)

# Build the encoded frame with string column names taken from the encoder and
# with the same row index as the input. With the default integer labels
# (0, 1, 2, ...) the combined `data` frame would mix int and str column names,
# which recent scikit-learn versions reject as feature names and which makes
# the feature-importance table at the end of the notebook unreadable.
encoded_categorical = pd.DataFrame(
    encoder.transform(categorical).toarray(),
    columns=encoder.get_feature_names_out(categorical.columns),
    index=categorical.index,
)

In [8]:
# Stitch the numerical features, the encoded categorical features and the
# targets together column-wise into a single modelling frame.
frames = [numerical, encoded_categorical, targets]
data = pd.concat(frames, axis='columns')

In [9]:
# Keep a handle on the regression target. Only TARGET_B is removed from the
# feature matrix here, so TARGET_D is still a column of X at this point.
regression_target = data['TARGET_D']

# Classification label and feature matrix.
y = data['TARGET_B']
X = data.drop(columns=['TARGET_B'])

In [10]:
from imblearn.over_sampling import SMOTE

# Fix the seed: SMOTE generates its synthetic minority samples at random, so
# without `random_state` every run produces a different resampled data set
# (and therefore different splits and scores further down).
smote = SMOTE(random_state=0)

# X (all columns except TARGET_B, i.e. still including TARGET_D) and
# y (TARGET_B) are defined in the previous cell.
# NOTE(review): oversampling happens before the train/test split below, so
# synthetic training rows may be interpolated from rows that end up in the
# test set. Consider resampling the training split only.
X_sm, y_sm = smote.fit_resample(X, y)

# Class counts after resampling.
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for testing; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.20,
    random_state=0,
)

In [12]:
# Wrap both feature splits in pandas DataFrames so that columns can be
# selected by name in the following cells.
X_train, X_test = (pd.DataFrame(split) for split in (X_train, X_test))

In [13]:
# Set the TARGET_D column of each split aside as the regression target.
y_train_regression, y_test_regression = X_train['TARGET_D'], X_test['TARGET_D']

In [14]:
# TARGET_D must not be used as a predictor, so take it out of both feature
# matrices.
X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

## Grid Search

In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; the search evaluates every combination
# (3 x 2 x 2 x 1 = 12 settings).
# Further candidates that are currently disabled:
#   max_samples: [None, 0.5], max_depth: [3, 5, 10], bootstrap: [True, False]
param_grid = dict(
    n_estimators=[50, 100, 500],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
)

# Base estimator whose hyperparameters are tuned.
clf = RandomForestClassifier(random_state=100)

In [16]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, run on all
# available cores; training-fold scores are recorded as well.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split. Every parameter combination is
# fitted once per CV fold, so this cell can take a long time.
grid_search.fit(X_train,y_train)



In [None]:
# Inspect the best parameter combination returned by the grid search.
grid_search.best_params_

In [None]:
# Full cross-validation report as a table (one row per parameter combination).
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

<b>Please also check `RandomizedSearchCV` (random search), an alternative to `GridSearchCV` that samples parameter combinations instead of trying all of them.</b>

## Using the above results

In [None]:
from sklearn.model_selection import cross_val_score

# Hyperparameters chosen for the final model, written out explicitly.
chosen_params = {
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100,
}
clf = RandomForestClassifier(random_state=0, **chosen_params)

# Mean of the 10-fold cross-validation scores on the training split.
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(cross_val_scores.mean())

## Feature Importance

<b>The higher the score, the more important the feature is.</b>

In [None]:
# Fit the chosen forest on the whole training split so that its feature
# importances become available.
clf.fit(X_train, y_train)

In [None]:
# Number of feature columns the model was trained on.
X_train.shape[1]

In [None]:
# Column labels of the training matrix as a plain Python list.
feature_names = X_train.columns.tolist()

In [None]:
# Pair every feature name with its importance score and list the features
# from most to least important.
df = pd.DataFrame({
    'columns_name': feature_names,
    'score_feature_importance': clf.feature_importances_,
})
df.sort_values(by='score_feature_importance', ascending=False)

In [None]:
# Raw importance scores of the fitted forest, one value per column of X_train
# (same order as `feature_names`).
clf.feature_importances_