# 0.0 Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn import ensemble
from boruta import BorutaPy

## 0.1 Load data

In [2]:
# Load the prepared training and validation frames, in that order.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the Kedro Jupyter session - confirm before running standalone.
train_data_preparation, validation_data_preparation = (
    catalog.load(dataset_name)
    for dataset_name in ("train_data_preparation", "validation_data_preparation")
)

## 0.2 Helper Functions

In [3]:
def notebook_settings(max_rows=None):
    """Configure the pandas display options used throughout this notebook.

    Sets ``display.max_columns`` to unlimited, ``display.max_rows`` to
    ``max_rows`` and formats floats with three decimals.

    Parameters
    ----------
    max_rows : int, optional
        Value for ``display.max_rows``. When omitted, the number of columns of
        the module-level ``train_data_preparation`` frame is used (the
        original behaviour), so that frame must already be loaded.

    Returns
    -------
    None
    """
    if max_rows is None:
        # Original default: one displayed row per column of the training frame.
        # NOTE(review): presumably chosen so per-column summaries print in
        # full - confirm that the column count (not the row count) is intended.
        max_rows = train_data_preparation.shape[1]

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', max_rows)
    pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Apply the pandas display options; the row limit is taken from
# train_data_preparation, so that frame must already be loaded.
notebook_settings()

# 5.0 Feature Selection

In [4]:
# Columns excluded from the feature matrices: 'rating' is the target;
# 'show_id' is presumably a row identifier (confirm upstream).
cols_drop = ['show_id', 'rating']

# Training split: feature matrix and target
x_train = train_data_preparation.drop(columns=cols_drop)
y_train = train_data_preparation['rating']

# Validation split: feature matrix and target
x_val = validation_data_preparation.drop(columns=cols_drop)
y_val = validation_data_preparation['rating']

# Preview the training features
x_train.head()

Unnamed: 0,title,director,cast,country,release_year,listed_in,description,minutes,seasons
0,2268,1212,3029,0.346,0.167,169,447,-2.552,0.0
1,1762,77,1719,0.002,0.5,67,3100,0.1,0.0
2,2233,833,2721,0.166,-1.0,211,1088,0.566,0.0
3,1900,2419,593,0.346,0.167,156,678,-1.62,0.0
4,678,1053,1874,0.346,-0.5,179,2231,0.637,0.0


## 5.1 Boruta as feature selector

In [6]:
# Training features and target as plain NumPy arrays for Boruta
# (only the training split is used here; the target is flattened to 1-D).
X_train_boruta = x_train.values
y_train_boruta = y_train.values.ravel()

# Random forest regressor used as Boruta's base estimator;
# n_jobs=-1 lets scikit-learn use every available core.
rf = ensemble.RandomForestRegressor(n_jobs=-1)

# Build the Boruta selector around the forest and fit it on the training data.
# verbose=2 prints the per-iteration Confirmed/Tentative/Rejected counts.
boruta = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=42,
    max_iter=70,
).fit(X_train_boruta, y_train_boruta)

Iteration: 	1 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	2 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	3 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	4 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	5 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	6 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	7 / 70
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	8 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	9 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	10 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	11 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	12 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	13 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	14 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	15 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	16 / 70
Confirmed: 	3
Tentative: 	3
Rejected: 	3
Iteration: 	17 / 70
Confirmed: 	3

In [10]:
# Boolean mask from the fitted selector, one entry per column of x_train
cols_selected = boruta.support_.tolist()

# Names of the features Boruta kept
X_train_fs = x_train
cols_selected_boruta = X_train_fs.columns[cols_selected]

# Names of the features Boruta did not keep (sorted, unique)
rejected_columns = np.setdiff1d(X_train_fs.columns, cols_selected_boruta)
cols_not_selected_boruta = list(rejected_columns)

In [11]:
# Persist both splits restricted to the Boruta-selected features,
# keeping 'show_id' and 'rating' alongside them.
cols_to_save = list(cols_selected_boruta) + ['show_id', 'rating']

catalog.save("train_feature_selection", train_data_preparation[cols_to_save])
catalog.save("validation_feature_selection", validation_data_preparation[cols_to_save])