# Import Modules

In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib
import warnings
import sys
import os
# Sanity check: list the files in the preprocessed-dataset folder and the saved models.
print(os.listdir('../dataset/preprocessing'))
print(os.listdir('../model'))
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Put ../code on the import path so the project's custom modules imported below can be found.
sys.path.append('../code')

from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier

# Custom Module
from dataset.load import Load_Data
from preprocessing.preprocessing import DeriveFeature, Preprocess
from modeling.Validation import KFoldValidation
from modeling.HyperParameterTuning import RandomForestEvaluation, XGBEvaluation

['Preprocess_test.csv', 'Preprocess_train.csv', 'test.csv', 'train.csv']
['XGB_0.731', 'XGB_0.732', 'XGB_0.734', 'XGB_0.734_10', 'XGB_0.735', 'XGB_0.809_15']


# Load Model & Dataset

In [5]:
# Root folder handed to Load_Data when reading the train/test data.
DATA_PATH = '../dataset'
# Folder that holds the serialized models loaded with joblib.
MODEL_PATH = '../model'

## Data

In [7]:
# Load the raw train/test frames from DATA_PATH.
# NOTE(review): `minute = 10` and `split_size = 0.25` are interpreted inside Load_Data (not visible here) —
# presumably a 10-minute snapshot and a 25% test split; confirm against dataset/load.py.
train, test = Load_Data(DATA_PATH, minute = 10, return_test = True, split_size = 0.25)
# Preprocess with scaling disabled; this call returns the four feature/label splits.
X_train, X_test, y_train, y_test = Preprocess(train, test, scaling = False, just_colnames = False)
# Second call with just_colnames = True, kept separately as the column-name result.
col_names = Preprocess(train, test, scaling = False, just_colnames = True)

Train Dataset Shape:  (19806, 50)
Test Dataset Shape:  (6603, 50)
Preprocessed Train Dataset Shape: (19693, 58)
Preprocessed Test Dataset Shape: (6571, 58)
Just allocating Columns Name.


## Model

In [8]:
# Load the previously saved classifier stored as 'XGB_0.734' under MODEL_PATH.
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source.
pre_train_xgb = joblib.load(os.path.join(MODEL_PATH, 'XGB_0.734'))
# Bare expression so the notebook displays the loaded estimator and its parameters.
pre_train_xgb

XGBClassifier(bagging_fraction=0.5504217949108876, base_score=0.5,
              booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.8510042690715668, eval_metric='error',
              feature_fraction=0.6859541749154623, gamma=0.6148753896468939,
              learning_rate=0.1078166067590229, max_delta_step=0, max_depth=4,
              min_child_samples=10, min_child_weight=1, missing=nan,
              n_estimators=100, n_jobs=1, nthread=None, num_leaves=40,
              objective='binary:logistic', random_state=42,
              reg_alpha=0.3835375604344825, reg_lambda=0.6052040861088117,
              scale_pos_weight=1, seed=None, silent=None, subsample=0.5,
              verbosity=1)

# Feature Selection

## Recursive Feature Elimination

In [16]:
# Recursive feature elimination down to 20 features, removing 10 features per round.
# n_features_to_select is passed by keyword: newer scikit-learn releases no longer
# accept it as a positional argument.
rfe = RFE(pre_train_xgb, n_features_to_select = 20, step = 10, verbose = 1)

# RFE.fit returns the fitted selector itself, so `fit` and `rfe` refer to the same object.
fit = rfe.fit(X_train, y_train)
print("Num Features: %d" % fit.n_features_)
# Boolean mask over the input columns: True marks a selected feature.
print("Selected Features: %s" % fit.support_)
# Rank 1 marks a selected feature; larger ranks were eliminated in earlier rounds.
print("Feature Ranking: %s" % fit.ranking_)

Fitting estimator with 58 features.
Fitting estimator with 48 features.
Fitting estimator with 38 features.
Fitting estimator with 28 features.
Num Features: 20
Selected Features: [ True False  True  True False False False False False False False False
 False  True  True  True False False False False  True False  True False
 False False False False  True False False False  True  True  True False
 False  True False False  True  True False  True  True  True False False
  True  True False False False False False False False False]
Feature Ranking: [1 3 1 1 2 2 5 3 3 4 3 4 5 1 1 1 3 3 4 4 1 4 1 3 2 2 5 5 1 2 3 4 1 1 1 2 2
 1 4 3 1 1 3 1 1 1 4 4 1 1 5 5 5 5 2 5 4 5]


In [131]:
# Scratch check: pull the largest entry out of an array through an index lookup.
tmp = np.array([0.83, 0.82, 0.77, 0.65, 0.5])
# Tuple of index arrays pointing at every position that holds the maximum.
tmp_index = np.nonzero(tmp == tmp.max())
tmp[tmp_index]

array([0.83])

In [107]:
# Same lookup as a single expression: select every entry equal to the array's maximum.
tmp[np.nonzero(tmp == tmp.max())]

array([0.83])

In [150]:
class FeatureSelection:
    """Recursive-feature-elimination experiments for one estimator and one train/test split.

    Parameters
    ----------
    model : estimator
        Estimator handed to scikit-learn's ``RFE`` / ``RFECV``.
    train_features, train_label
        Data the selectors are fitted on.
    test_features, test_label
        Hold-out data used to score each RFE candidate.
    """

    def __init__(self, model, train_features, train_label, test_features, test_label):
        self.model = model
        self.X_train = train_features
        self.y_train = train_label
        self.X_test = test_features
        self.y_test = test_label

    def RecursiveFeatureElimination(self, n_features, step):
        """Fit one RFE selector per candidate feature count and report the most accurate one.

        Parameters
        ----------
        n_features : int or iterable of int
            Candidate values for RFE's ``n_features_to_select``. A single
            integer is treated as a one-element list.
        step
            Forwarded unchanged to ``RFE(step=...)``.

        Raises
        ------
        ValueError
            If ``n_features`` contains no candidates.

        Notes
        -----
        Progress and results are printed. After the call, ``accuracy_scores_``
        holds the accuracy of every candidate (in input order) and
        ``best_n_features_`` the candidate with the highest accuracy; ties are
        resolved in favour of the candidate that appears first.

        NOTE(review): candidates are compared on the test split passed to the
        constructor, so the winning accuracy is an optimistic estimate.
        """
        # Accept a bare integer as well as any iterable (list, tuple, ndarray, range).
        if isinstance(n_features, (int, np.integer)):
            candidates = [n_features]
        else:
            candidates = list(n_features)
        if not candidates:
            raise ValueError("n_features must contain at least one candidate feature count")

        accuracy_list = []
        for i in candidates:
            print("Feature Selection Algorithm Start... \nNumber of Features: {} | Elimination Step: {}".format(i, step))
            # Keyword argument: newer scikit-learn releases reject a positional n_features_to_select.
            rfe_selector = RFE(self.model, n_features_to_select = i, step = step, verbose = 1)
            rfe_selector.fit(self.X_train, self.y_train)
            pred = rfe_selector.predict(self.X_test)

            acc_score = accuracy_score(self.y_test, pred)
            accuracy_list.append(acc_score)
            print("Accuracy Score: ", acc_score)

        accuracy_array = np.array(accuracy_list)
        # argmax yields a single index even when several candidates share the top score.
        max_accuracy_index = int(np.argmax(accuracy_array))

        self.accuracy_scores_ = accuracy_list
        self.best_n_features_ = candidates[max_accuracy_index]
        print("\nMax Accuracy Score: {} | Number of Features: {}".format(accuracy_array[max_accuracy_index], candidates[max_accuracy_index]))

    def RecursiveFeatureEliminationCV(self, step, cv):
        """Fit an ``RFECV`` selector on the training data and print what it selected.

        Parameters
        ----------
        step
            Forwarded unchanged to ``RFECV(step=...)``; also stored as ``self.step``.
        cv
            Forwarded unchanged to ``RFECV(cv=...)``; also stored as ``self.cv``.

        Notes
        -----
        The fitted selector is kept on ``self.rfecv_selector_`` so its mask and
        ranking can be reused without refitting.
        """
        self.step = step
        self.cv = cv

        rfecv_selector = RFECV(self.model, step = self.step, cv = self.cv)
        rfecv_model = rfecv_selector.fit(self.X_train, self.y_train)
        self.rfecv_selector_ = rfecv_model

        print("Num Features: %d" % rfecv_model.n_features_)
        print("Selected Features: %s" % rfecv_model.support_)
        print("Feature Ranking: %s" % rfecv_model.ranking_)

In [None]:
# Bundle the pre-trained model with the train/test split for the selection experiments.
fs = FeatureSelection(pre_train_xgb, X_train, y_train, X_test, y_test)
# Try RFE at several target feature counts (58 = every preprocessed column) with an elimination step of 10.
fs.RecursiveFeatureElimination(n_features = [10, 20, 30, 40, 50, 58], step = 10)

Feature Selection Algorithm Start... 
Number of Features: 10 | Elimination Step: 10
Fitting estimator with 58 features.
Fitting estimator with 48 features.
Fitting estimator with 38 features.
Fitting estimator with 28 features.
Fitting estimator with 18 features.
Accuracy Score:  0.7297215035763202
Feature Selection Algorithm Start... 
Number of Features: 20 | Elimination Step: 10
Fitting estimator with 58 features.
Fitting estimator with 48 features.
Fitting estimator with 38 features.
Fitting estimator with 28 features.
Accuracy Score:  0.7338304672043829
Feature Selection Algorithm Start... 
Number of Features: 30 | Elimination Step: 10
Fitting estimator with 58 features.


In [155]:
# Cross-validated RFE on the training data with step = 5 and cv = 5; the method prints the selection it ends up with.
fs.RecursiveFeatureEliminationCV(step = 5, cv = 5)

Num Features: 58
Selected Features: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
Feature Ranking: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
