In [10]:
%matplotlib inline

import pandas as pd
import numpy as np

# Load the HR dataset; the first row of the CSV holds the column names.
hr_raw = pd.read_csv('data/hr.csv', header=0)
# Drop every row that contains a missing value. The untouched frame stays
# available as hr_raw so the cleaning step can be inspected or redone.
hr_data = hr_raw.dropna()
# One-hot encode the 'salary' and 'sales' columns into indicator columns
# named '<column>_<value>'.
data_trnsf = pd.get_dummies(hr_data, columns=['salary', 'sales'])
# Last expression of the cell: display the column index after encoding.
data_trnsf.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'salary_high', 'salary_low', 'salary_medium',
       'sales_IT', 'sales_RandD', 'sales_accounting', 'sales_hr',
       'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical'],
      dtype='object')

In [11]:
# Feature extraction: split the encoded frame into predictors and target.
# X holds every column except 'left'; Y is the 'left' column itself.
X = data_trnsf.drop(columns='left')
Y = data_trnsf['left']

In [21]:
# Variance Threshold

from sklearn.feature_selection import VarianceThreshold

# Drop low-variance features, using a variance cutoff of 0.2
select_features = VarianceThreshold(threshold=0.2)

# Fit the selector and subset the features in a single pass
# (no need to transform X a second time after fit_transform).
X_subset = select_features.fit_transform(X)

print('Number of features:', X.shape[1])
print('Reduced number of features:', X_subset.shape[1])


Number of features: 20
Reduced number of features: 5


In [14]:
# Chi-squared selector

from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared statistic
# and keep the 4 highest-scoring ones.
chi2_model = SelectKBest(score_func=chi2, k=4)
chi2_model.fit(X, Y)
X_best_feat = chi2_model.transform(X)

# Report the feature count before and after selection
for label, matrix in (('Number of features:', X),
                      ('Reduced number of features:', X_best_feat)):
    print(label, matrix.shape[1])

Number of features: 20
Reduced number of features: 4


In [27]:
# Recursive Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base classifier used by RFE to evaluate subsets of attributes
logistic_model = LogisticRegression()
# Create the RFE model and select 4 attributes. n_features_to_select is
# passed by keyword because recent scikit-learn releases no longer accept
# it as a positional argument.
rfe = RFE(logistic_model, n_features_to_select=4)
rfe = rfe.fit(X, Y)
# Raw ranking in the column order of X (rank 1 marks a selected feature)
print(rfe.ranking_)
# Ranking paired with the column names, best-ranked first. tolist() yields
# plain Python ints so the printed tuples read as (1, 'name').
print(sorted(zip(rfe.ranking_.tolist(), X.columns)))

[ 1  7  9 17  6  1  1  1  3  8 15  2 12  5  4 13 16 14 11 10]
[(1, 'Work_accident'), (1, 'promotion_last_5years'), (1, 'salary_high'), (1, 'satisfaction_level'), (2, 'sales_RandD'), (3, 'salary_low'), (4, 'sales_management'), (5, 'sales_hr'), (6, 'time_spend_company'), (7, 'last_evaluation'), (8, 'salary_medium'), (9, 'number_project'), (10, 'sales_technical'), (11, 'sales_support'), (12, 'sales_accounting'), (13, 'sales_marketing'), (14, 'sales_sales'), (15, 'sales_IT'), (16, 'sales_product_mng'), (17, 'average_montly_hours')]


In [36]:
# Feature Importance
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest to the data. The seed is fixed so that the reported
# importances are the same on every run of the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X, Y)
# Display the relative importance of each attribute, least important first.
# Scores are converted to plain floats and rounded to 4 decimals so the
# printed tuples read as (0.0012, 'name').
print(sorted(zip(
    (round(float(score), 4) for score in model.feature_importances_),
    X.columns,
)))

[(0.001, 'sales_product_mng'), (0.0012, 'sales_marketing'), (0.0014, 'promotion_last_5years'), (0.0015, 'sales_RandD'), (0.0015, 'sales_accounting'), (0.0017, 'sales_management'), (0.0019, 'sales_hr'), (0.002, 'sales_IT'), (0.0025, 'sales_support'), (0.0037, 'sales_technical'), (0.0038, 'salary_medium'), (0.0039, 'sales_sales'), (0.0071, 'salary_low'), (0.0073, 'salary_high'), (0.0121, 'Work_accident'), (0.1179, 'last_evaluation'), (0.1187, 'average_montly_hours'), (0.1543, 'number_project'), (0.2152, 'time_spend_company'), (0.3413, 'satisfaction_level')]
