## 1. Dropping constants using Variance Threshold method

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy frame for the demo: columns B and D repeat a single value on every row,
# while A and C change from row to row.
data = pd.DataFrame(
    {
        'A': [8, 5, 7, 2, 6],
        'B': [0] * 5,
        'C': [8, 5, 3, 7, 4],
        'D': [1] * 5,
    }
)
data

Unnamed: 0,A,B,C,D
0,8,0,8,1
1,5,0,5,1
2,7,0,3,1
3,2,0,7,1
4,6,0,4,1


 **Identifying the features with low variance** 

In [3]:
import sklearn
from sklearn.feature_selection import VarianceThreshold

In [4]:
# threshold=0.0: only columns whose variance is exactly zero (constants) are
# rejected; every column that varies at all is kept.
var_td = VarianceThreshold(threshold=0.0)

In [5]:
# Fit the selector on the toy frame; get_support() is only available after this.
var_td.fit(data)

VarianceThreshold()

In [6]:
var_td.get_support()

array([ True, False,  True, False])

In [7]:
data.columns[var_td.get_support()]

Index(['A', 'C'], dtype='object')

In [8]:
# Constant columns are the ones the selector did NOT keep: invert the boolean
# keep-mask and pull the matching column labels out as a plain list.
con_col = data.columns[~var_td.get_support()].tolist()


In [9]:
con_col

['B', 'D']

**Dropping the features with low variance**

In [10]:
# Show the frame without the constant columns (the original `data` is not modified).
data.drop(columns=con_col)

Unnamed: 0,A,C
0,8,8
1,5,5
2,7,3
3,2,7
4,6,4


## Working with a bigger dataset


In [11]:
import pandas as pd

In [12]:
# Load the CSV from the working directory; nrows=10000 limits the read to the
# first 10,000 data rows.
df = pd.read_csv("santander-customer-satisfaction.csv",nrows=10000)

In [13]:
df.shape

(10000, 371)

In [14]:
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [15]:
# Feature matrix: every column except the label. Name the column to drop
# directly instead of passing a DataFrame slice as the labels argument.
# NOTE(review): the ID column is still part of X — confirm that is intended.
X = df.drop(columns=['TARGET'])


In [16]:
# Label vector: the TARGET column that was left out of X.
Y = df['TARGET']


In [17]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 30% of the rows as the test split; random_state=0 makes the shuffle
# repeatable across runs.
x_train ,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=0)

In [None]:
# Fresh selector for the larger dataset (rebinds `var_td` from the toy example);
# threshold=0 again targets zero-variance columns.
var_td = VarianceThreshold(threshold=0)

In [None]:
# Fit on the training split only, so the test rows play no part in deciding
# which columns are treated as constant.
var_td.fit(x_train)

In [None]:
# Number of columns the selector keeps (True entries in the support mask).
var_td.get_support().sum()

In [None]:
# Columns rejected by the selector: invert the keep-mask and read off the
# corresponding training-column labels as a list.
con_col = x_train.columns[~var_td.get_support()].tolist()

In [None]:
len(con_col)

In [None]:
# The selector was fitted on x_train, so remove the constant columns from the
# train and test splits (not from `df`, which still contains TARGET). Results
# go to new names so re-running the cell does not fail on already-dropped columns.
x_train_reduced = x_train.drop(columns=con_col)
x_test_reduced = x_test.drop(columns=con_col)
x_train_reduced.shape, x_test_reduced.shape

## 2. Dropping correlated features using Correlation method

In [None]:
import pandas as pd
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import raises ImportError on current releases — pin
# scikit-learn<1.2 or switch to another dataset. TODO confirm target environment.
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# NOTE: this rebinds `data`, which held the toy DataFrame in section 1, to the
# object returned by load_boston(); the type() call below shows what that is.
data = load_boston()
type(data)

In [None]:
# Labelled feature frame with the regression target appended as the last column.
# This rebinds `df`, replacing the Santander frame loaded earlier.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [None]:
data.feature_names


In [None]:
df.head()

In [None]:
# Feature matrix: every column except the label. Name the column to drop
# directly instead of passing a DataFrame slice as the labels argument.
X = df.drop(columns=['target'])

In [None]:
X

In [None]:
# Label vector: the 'target' column that was left out of X.
Y = df['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. The seed is fixed (same value as the
# earlier split in this notebook) so the correlation matrix computed from
# x_train — and therefore the set of dropped features — is identical on every run.
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
# Pairwise correlation between feature columns, computed on the training split only.
cor = x_train.corr()

In [None]:
type(cor)

In [None]:
cor

In [None]:
import seaborn as sns

# Render the training-set correlation matrix on an explicit 12x10 inch Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, cmap=plt.cm.afmhot_r, ax=ax)
plt.show()

**Correlated pairs are hard to pick out by eye from the heatmap or the correlation matrix, so we search for the correlated features programmatically with a for loop.**

In [None]:
def find_correlated_features(corr_matrix, threshold=0.7):
    """Return the column labels that are highly correlated with an earlier column.

    Only the lower triangle of the matrix is scanned (j < i), so for each
    correlated pair the later column is flagged and the earlier one is kept.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.
    threshold : float, default 0.7
        A column is flagged when the absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    set
        Labels of the flagged columns (empty if nothing exceeds the threshold).
    """
    flagged = set()
    labels = corr_matrix.columns
    for i in range(len(labels)):
        for j in range(i):
            # abs(): a strong negative correlation carries the same redundancy
            # as a strong positive one, so both must be caught.
            if abs(corr_matrix.iloc[i, j]) > threshold:
                flagged.add(labels[i])
    return flagged


corr_matrix = x_train.corr()
cor_features = find_correlated_features(corr_matrix, threshold=0.7)


In [None]:
cor_features

In [None]:
# Training split without the correlated columns (x_train itself is left unchanged).
x_train.drop(columns=list(cor_features))

In [None]:
# Test split without the same correlated columns (x_test itself is left unchanged).
x_test.drop(columns=list(cor_features))

In [None]:
# A bare `x_test.drop()` raises ValueError because no labels are given. Pass the
# correlated columns explicitly and show the shape of the reduced test split.
x_test.drop(columns=list(cor_features)).shape