<a href="https://colab.research.google.com/github/VighneshAlevoor/ML-Feature-Engineering/blob/master/Santander_customer_satisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import variance_threshold



In [2]:
# Load only the first 30,000 rows of the training CSV into `df`.
df = pd.read_csv(filepath_or_buffer='train.csv', nrows=30000)
# Preview of the first five rows. It is evaluated mid-cell, so the notebook
# does not render it; only the final expression below is shown.
df.head(n=5)
# Final expression: the (rows, columns) tuple displayed as the cell output.
df.shape

(30000, 371)

In [3]:
# Split `df` into train/test feature frames and label series. 'TARGET' is the
# label column; every other column is passed through as a feature.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['TARGET']),
    df['TARGET'],
    test_size=0.333333,  # roughly one third of the rows goes to the test split
    random_state=0,      # fixed seed so the split is the same on every run
)
# Print the (rows, columns) shape of each feature split.
for split in (x_train, x_test):
    print(split.shape)

(20000, 370)
(10000, 370)


In [4]:
#remove constant columns
# A feature is treated as constant when its standard deviation over the
# training split is exactly zero. Only x_train is inspected here.
const_feat = []
for name, values in x_train.items():
    if values.std() == 0:
        const_feat.append(name)
# Number of constant features found.
len(const_feat)

64

In [5]:
# Remove the constant features from both splits. The list was computed from
# the training split and the same list is applied to the test split so both
# frames keep identical columns.
#
# The frames are rebound to the result of drop() instead of using
# inplace=True: mutating a frame returned by train_test_split in place may
# raise SettingWithCopyWarning on some pandas versions, and an in-place drop
# makes the cell fail with KeyError when it is executed a second time.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
x_train = x_train.drop(columns=const_feat, errors='ignore')
x_test = x_test.drop(columns=const_feat, errors='ignore')
print(x_train.shape)
print(x_test.shape)

(20000, 306)
(10000, 306)


In [6]:
#remove Quasi or almost constant columns
# A column is flagged when its single most frequent value accounts for more
# than 99% of the training rows. Only x_train is inspected.
n_rows = len(x_train)
quasi=[]
for i in x_train.columns:
  # value_counts() ignores NaN, while the denominator is the full row count,
  # so the share is relative to all rows (same ratio as before). Plain `/` is
  # true division in Python 3, so the removed `np.float` alias is not needed.
  # .max() picks the top count directly and yields NaN (never > 0.99) for an
  # all-NaN column instead of raising IndexError.
  predomin= x_train[i].value_counts().max() / n_rows
  if predomin>0.99:
    quasi.append(i)

# Number of quasi-constant columns found.
len(quasi)



154

In [7]:
# Show the container types of the quasi-constant name list and of the
# training frame's column labels.
for container in (quasi, x_train.columns):
    print(type(container))

<class 'list'>
<class 'pandas.core.indexes.base.Index'>


In [8]:
# Names of the training columns that were NOT flagged as quasi-constant,
# kept in their original column order.
lst = [name for name in x_train.columns if name not in quasi]
# Display the surviving column names.
lst


['ID',
 'var3',
 'var15',
 'imp_ent_var16_ult1',
 'imp_op_var39_comer_ult1',
 'imp_op_var39_comer_ult3',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_efect_ult3',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_efect_ult3',
 'imp_op_var39_ult1',
 'ind_var1_0',
 'ind_var5_0',
 'ind_var5',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13_largo_0',
 'ind_var13_largo',
 'ind_var13',
 'ind_var14_0',
 'ind_var24_0',
 'ind_var24',
 'ind_var25_cte',
 'ind_var26_0',
 'ind_var26_cte',
 'ind_var26',
 'ind_var25_0',
 'ind_var25',
 'ind_var30',
 'ind_var37_cte',
 'ind_var37_0',
 'ind_var37',
 'ind_var39_0',
 'ind_var40_0',
 'ind_var41_0',
 'num_var1_0',
 'num_var4',
 'num_var5_0',
 'num_var5',
 'num_var8_0',
 'num_var8',
 'num_var12_0',
 'num_var12',
 'num_var13_0',
 'num_var13_corto_0',
 'num_var13_corto',
 'num_var13_largo_0',
 'num_var13_largo',
 '

In [9]:
# Names of the test columns that are NOT in the quasi-constant list, kept in
# their original column order. A boolean mask over the column index replaces
# the explicit loop; .tolist() yields a plain Python list as before.
lst2 = x_test.columns[~x_test.columns.isin(quasi)].tolist()
# Display the surviving column names.
lst2

['ID',
 'var3',
 'var15',
 'imp_ent_var16_ult1',
 'imp_op_var39_comer_ult1',
 'imp_op_var39_comer_ult3',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_efect_ult3',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_efect_ult3',
 'imp_op_var39_ult1',
 'ind_var1_0',
 'ind_var5_0',
 'ind_var5',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13_largo_0',
 'ind_var13_largo',
 'ind_var13',
 'ind_var14_0',
 'ind_var24_0',
 'ind_var24',
 'ind_var25_cte',
 'ind_var26_0',
 'ind_var26_cte',
 'ind_var26',
 'ind_var25_0',
 'ind_var25',
 'ind_var30',
 'ind_var37_cte',
 'ind_var37_0',
 'ind_var37',
 'ind_var39_0',
 'ind_var40_0',
 'ind_var41_0',
 'num_var1_0',
 'num_var4',
 'num_var5_0',
 'num_var5',
 'num_var8_0',
 'num_var8',
 'num_var12_0',
 'num_var12',
 'num_var13_0',
 'num_var13_corto_0',
 'num_var13_corto',
 'num_var13_largo_0',
 'num_var13_largo',
 '

In [10]:
# Restrict the training features to the columns named in `lst` (the columns
# not flagged as quasi-constant) and rebind x_train to that selection.
x_train=x_train[lst]
# Final expression: (rows, columns) of the reduced training frame.
x_train.shape

(20000, 152)

In [11]:
# Restrict the test features to the columns named in `lst2` (the test
# columns not in the quasi-constant list) and rebind x_test to that selection.
x_test=x_test[lst2]
# Final expression: (rows, columns) of the reduced test frame.
x_test.shape

(10000, 152)

In [12]:
#check for duplicate columns
# Two columns are duplicates when Series.equals() reports them identical on
# the training split. Within each group of identical columns the first one
# (in column order) is kept and every later one is collected in dupli_col.
dupli_col=[]
flagged=set()  # names already placed in dupli_col, for fast membership checks
for i, col1 in enumerate(x_train.columns):
  if col1 in flagged:
    # col1 duplicates an earlier column; anything equal to col1 was already
    # flagged while that earlier column was processed, so skip it. This also
    # prevents the same name from being appended more than once when three
    # or more columns are identical.
    continue
  reference = x_train[col1]  # hoisted: looked up once per outer iteration
  for col2 in x_train.columns[i+1:]:
    if col2 not in flagged and reference.equals(x_train[col2]):
      flagged.add(col2)
      dupli_col.append(col2)

# Display the names of the redundant columns.
dupli_col

['ind_var40_0',
 'ind_var26',
 'ind_var25',
 'ind_var37',
 'num_var40_0',
 'num_var26',
 'num_var25',
 'num_var37']

In [16]:
# Remove the duplicated columns from both splits. The list was computed from
# the training split and the same list is applied to the test split.
#
# The frames are rebound to the result of drop() instead of using
# inplace=True: x_train and x_test were produced by column selection
# (x_train[lst] / x_test[lst2]), and mutating such a selection in place may
# raise SettingWithCopyWarning on some pandas versions. An in-place drop also
# fails with KeyError if the cell is executed a second time; errors='ignore'
# keeps the cell re-runnable once the columns are gone.
x_train = x_train.drop(columns=dupli_col, errors='ignore')
x_test = x_test.drop(columns=dupli_col, errors='ignore')
# Final expression: the shapes of both reduced frames.
x_train.shape,x_test.shape

((20000, 144), (10000, 144))