In [1]:
# -----------------------------------------------------------------
# Implement Recursive Feature Elimination.
# Predict product purchase for the Bank Telemarketing dataset
# -----------------------------------------------------------------

# Import libraries
import pandas as pd

In [2]:
# Read the file
f = pd.read_csv('bank.csv')
f = f.drop("duration", axis = 1)

In [3]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]

In [4]:
# Create dummy variables
x = pd.get_dummies(x, drop_first=True)
y = pd.get_dummies(y, drop_first=True)

In [5]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
train_test_split(x, y, test_size = 0.3, random_state = 1234, stratify=y)

In [6]:
# Import Randon Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

In [7]:
# Default Random Forest Object
rfc1 = RandomForestClassifier(random_state=1234)
rfc1.fit(X_train, Y_train)
Y_predict1 = rfc1.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Score and Evaluate the model 
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(Y_test, Y_predict1)
score1 = rfc1.score(X_test, Y_test)

In [9]:
# Apply Recursive Feature Elimination
from sklearn.feature_selection import RFE
rfc2 = RandomForestClassifier(random_state=1234)

In [10]:
# Create an RFE selector object using RFC as an estimator
rfe = RFE(estimator=rfc2, n_features_to_select=30, step=1)

In [14]:
# Fit the data to the rfe selector
rfe.fit(x, y)

  y = column_or_1d(y, warn=True)


RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features='auto', max_leaf_nodes=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=10, n_jobs=None,
                                     oob_score=False, random_state=1234,
                                     verbose=0, warm_start=False),
    n_features_to_select=30, step=1, verbose=0)

In [15]:
# Create new Train and Test datasets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [16]:
# Fit the Random Forest classifier to the new train and test with 80 features
rfc2.fit(X_train_rfe, Y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1234,
                       verbose=0, warm_start=False)

In [17]:
# Test the model with new Test dataset
Y_predict = rfc2.predict(X_test_rfe)

In [18]:
# Score and Evaluate the new model 
from sklearn.metrics import confusion_matrix
cm_rfe = confusion_matrix(Y_test, Y_predict)
score_rfe = rfc2.score(X_test_rfe, Y_test)

In [19]:
# Get column names
columns = list(x.columns)

In [20]:
# Get the ranking of the features. Ranking 1 for selected features
ranking = rfe.ranking_

In [21]:
# Get the feature importance scores
feature_importance = rfc1.feature_importances_

In [22]:
# Create the dataframe of the Features selected, Ranking and their importance
rfe_selected = pd.DataFrame()
rfe_selected = pd.concat([pd.DataFrame(columns), 
                          pd.DataFrame(ranking),
                          pd.DataFrame(feature_importance)], axis=1)

rfe_selected.columns = ["Feature Name", "Ranking", "Feature Importance"]