In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the training data; the trailing expression shows (n_rows, n_columns).
df = pd.read_csv('/kaggle/input/santander-customer-satisfaction/train.csv')
df.shape

In [None]:
# Preview the first five rows (5 is DataFrame.head's default, spelled out here).
df.head(n=5)

In [None]:
# Separate the label column from the predictors.
y = df['TARGET']
X = df.drop(columns=['TARGET'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# shuffle (and therefore the split) identical from run to run.
# X and y are the predictor frame and TARGET column built from df above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

X_train.shape, X_test.shape

**Let's apply the variance threshold**

Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the desired outputs (y).

In [None]:
# Fit on the training split only. fit() returns the estimator itself, so the
# construction and fit can be chained; the bare name then displays it.
var_thres = VarianceThreshold(threshold=0).fit(X_train)
var_thres

In [None]:
# Boolean mask over X_train's columns: True where the feature was retained.
var_thres.get_support(indices=False)

In [None]:
### Count the non-constant features (True entries of the support mask)
var_thres.get_support().sum()

In [None]:
# Number of columns that survive the variance filter
kept_columns = X_train.columns[var_thres.get_support()]
len(kept_columns)

In [None]:
# Columns rejected by the variance filter are exactly those where the support
# mask is False. Inverting the mask once selects them directly, instead of
# rebuilding the retained-column index for every column inside a comprehension.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

In [None]:
# One constant column name per line; the guard keeps an empty list silent.
if constant_columns:
    print(*constant_columns, sep='\n')

In [None]:
# DataFrame.drop returns a new frame and leaves X_train untouched, so the
# result has to be bound to a name to be usable afterwards. The same columns
# are dropped from the test split so both splits keep identical feature sets.
X_train_nonconstant = X_train.drop(columns=constant_columns)
X_test_nonconstant = X_test.drop(columns=constant_columns)
X_train_nonconstant

In [None]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

**Feature Selection- With Correlation**

Remove features that are highly correlated with one another, since they carry largely redundant information.



In [None]:
#import essential libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the training features (DataFrame.corr defaults to Pearson)
corrmat = X_train.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, cmap="CMRmap_r", ax=ax)

In [None]:
# With the following function we can select highly correlated features.
# For every correlated pair it flags the LATER column (in column order),
# so the earlier column of each pair is the one that is kept.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The correlation matrix is computed with ``dataset.corr()`` using its
    default settings. A column is flagged when the absolute value of its
    correlation with at least one column positioned before it in that matrix
    is strictly greater than ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns. NaN correlations (such as those produced
        by constant columns) never exceed the threshold, so they never cause a
        column to be flagged.
    """
    corr_matrix = dataset.corr()
    abs_corr = np.abs(corr_matrix.values)  # we are interested in absolute coeff value
    # Comparisons against NaN evaluate to False; silence the floating-point
    # "invalid value" warning some NumPy versions emit for them.
    with np.errstate(invalid="ignore"):
        above = abs_corr > threshold
    # k=-1 keeps only the strictly-lower triangle, i.e. pairs (i, j) with j < i,
    # which is the same set of pairs the nested index loops would visit.
    flagged_rows = np.tril(above, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

In [None]:
# Flag features whose absolute correlation with an earlier feature exceeds 0.9.
# correlation() already returns a set, so its length is the number of distinct names.
corr_features = correlation(X_train, 0.9)
len(corr_features)

In [None]:
# Display the set of column names flagged as highly correlated.
corr_features

In [None]:
# DataFrame.drop returns a new frame and leaves X_train untouched, so the
# result is bound to a name to make the selection usable afterwards. The test
# split drops the same columns so both splits share one feature set.
corr_columns = list(corr_features)
X_train_uncorrelated = X_train.drop(columns=corr_columns)
X_test_uncorrelated = X_test.drop(columns=corr_columns)
X_train_uncorrelated

**Univariate Selection**

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.

We can use the chi-squared (chi²) statistical test for non-negative features to select 10 of the best features. 

**Feature Importance**

You can get the feature importance of each feature of your dataset by using the feature importance property of the model.

Feature importance gives you a score for each feature of your data; the higher the score, the more important or relevant the feature is to your output variable.

Feature importance is exposed through the built-in `feature_importances_` attribute of fitted tree-based classifiers; we will use an Extra Trees classifier to extract the top 10 features of the dataset.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Extra-trees builds randomized trees, so without a seed the importances (and
# the resulting top-10 ranking) change on every run. random_state=0 matches
# the seed used for the train/test split above.
model = ExtraTreesClassifier(random_state=0)
model.fit(X,y)

In [None]:
# feature_importances_ is an attribute of the fitted tree-based classifier
importances = model.feature_importances_
print(importances)

In [None]:
# Horizontal bar chart of the ten largest importance scores, labelled by feature name
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_ten = feat_importances.nlargest(10)
top_ten.plot.barh()
plt.show()

**Correlation Matrix with Heatmap**

Correlation states how the features are related to each other or to the target variable.

Correlation can be positive (an increase in the feature's value goes with an increase in the target variable) or negative (an increase in the feature's value goes with a decrease in the target variable).

A heatmap makes it easy to identify which features are most related to the target variable; we will plot a heatmap of the correlated features using the seaborn library.

In [None]:
import seaborn as sns
#get correlations of each features in dataset
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
# corrmat already holds the pairwise correlations of exactly the columns in
# top_corr_features, so it is plotted directly rather than recomputing
# df[top_corr_features].corr() a second time.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

**As always, I hope you find this kernel useful and your UPVOTES would be highly appreciated.**