In [1]:
# Author banner: name and SAP ID, one per line.
print("RAMESH BHUTKA", "SAP ID:-53004190003", sep="\n")

RAMESH BHUTKA
SAP ID:-53004190003


In [2]:
import datetime
# Record when the notebook was executed (local wall-clock time).
run_timestamp = datetime.datetime.now()
print(run_timestamp)

2021-01-27 18:23:17.343379


### Feature Selection

Feature selection is a process where you automatically select those features in your data that contribute most to the prediction variable or output in which you are interested.

Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression.

Three benefits of performing feature selection before modeling your data are:

    - Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
    - Improves Accuracy: Less misleading data means modeling accuracy improves.
    - Reduces Training Time: Less data means that algorithms train faster.


In [3]:
import pandas as pd
# Location of the employee dataset (CSV hosted on GitHub).
EMPLOYEE_CSV_URL = "https://raw.githubusercontent.com/Ramesh-Bhutka/Feature-Selection-PCA/main/employee.csv"

# Load the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(EMPLOYEE_CSV_URL)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.8,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1


In [4]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
promotion_last_5years    False
department               False
salary                   False
left                     False
dtype: bool

In [5]:
# Distinct values present in the 'department' column.
df['department'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [6]:
# Looking at the columns, 'salary' and 'department' are stored as strings,
# so we convert them to numerical values using Label Encoding.

In [7]:
# Import label encoder
from sklearn import preprocessing 
# Replace the string labels in 'salary' with integer codes.
# LabelEncoder numbers the labels in sorted order (in the displayed rows
# 'low' -> 1 and 'medium' -> 2), so the codes are identifiers, not a ranking.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['salary'])
df['salary'] = label_encoder.transform(df['salary'])
# Show the distinct codes now stored in the column.
df['salary'].unique()

array([1, 2, 0])

In [8]:
# Import label encoder
from sklearn import preprocessing 
# Replace the string labels in 'department' with integer codes, using a
# fresh encoder so this cell does not depend on the one fitted for 'salary'.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['department'])
df['department'] = label_encoder.transform(df['department'])
# Show the distinct codes now stored in the column.
df['department'].unique()

array([7, 2, 3, 9, 8, 4, 0, 6, 5, 1])

In [9]:
# Preview the first five rows after both categorical columns were encoded.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,7,1,1
1,0.8,0.86,5,262,6,0,0,7,2,1
2,0.11,0.88,7,272,4,0,0,7,2,1
3,0.72,0.87,5,223,5,0,0,7,1,1
4,0.37,0.52,2,159,3,0,0,7,1,1


## Recursive Feature Elimination

In [10]:
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Split the encoded frame into features and target: every column except the
# last is a feature; the last column ('left') is the label to predict.
array = df.values
X = array[:,0:-1]
Y = array[:,-1]
# feature extraction
# NOTE(review): the recorded run emits an lbfgs "iterations reached limit"
# warning; raising max_iter or scaling X first would likely resolve it, but
# may also change which features are selected -- confirm before relying on
# the ranking.
model = LogisticRegression(solver='lbfgs')
# n_features_to_select is passed by keyword: newer scikit-learn releases
# reject it as a positional argument, and older releases accept the keyword.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# n_features_: how many features were kept; support_: boolean mask of the
# kept columns; ranking_: 1 for kept features, larger = eliminated earlier.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Num Features: 3
Selected Features: [ True False False False False  True  True False False]
Feature Ranking: [1 2 3 7 4 1 1 5 6]


In [11]:
# Preview the first five rows of the frame that feeds the PCA step.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,7,1,1
1,0.8,0.86,5,262,6,0,0,7,2,1
2,0.11,0.88,7,272,4,0,0,7,2,1
3,0.72,0.87,5,223,5,0,0,7,1,1
4,0.37,0.52,2,159,3,0,0,7,1,1


## Principal Component Analysis

In [12]:
# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# Split the encoded frame into features and target: every column except the
# last is a feature; the last column ('left') is the label (unused by PCA).
array = df.values
X = array[:,0:-1]
Y = array[:,-1]
# feature extraction
# NOTE(review): X is not standardized here. PCA is sensitive to feature
# scale, so a column with a much larger numeric range can dominate the
# leading component -- consider scaling X before fitting.
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_,"\n")
print(fit.components_)
# n_features_in_ replaces the deprecated/removed PCA.n_features_ attribute
# and reports the same value: the number of input columns seen during fit.
print("Num Features: %d" % fit.n_features_in_)


Explained Variance: [9.95131084e-01 3.28339623e-03 8.65960462e-04] 

[[-1.00022995e-04  1.16454651e-03  1.03016986e-02  9.99939230e-01
   3.73902926e-03 -7.14276287e-05 -1.02221195e-05  2.25501578e-04
   8.87666666e-05]
 [-3.19949973e-04 -3.70121529e-04 -3.40540543e-03  2.14232477e-04
   1.25008899e-02 -4.28237002e-04  1.39931120e-03 -9.99914836e-01
  -1.58623964e-04]
 [-2.15588264e-02  1.54140126e-02  2.72953052e-01 -6.42994159e-03
   9.61555497e-01  7.64937488e-04  6.22459074e-03  1.11001142e-02
  -1.18173925e-03]]
Num Features: 9
