# FILTER METHOD

The filter method is used for cleaning out constant, Quasi-constant and duplicated features

# Constant Features
Constant features are those that show the same value, just one value, for all the observations of the dataset. These feature ofthen rovides no information that allows a machine learning model to discriminate or predict a target

In [56]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [110]:
#import the data set
df = pd.read_csv("https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/data?select=sample_submission.csv.zip")

BadZipFile: File is not a zip file

In [36]:
df.shape #371 columns

(50000, 371)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 371 entries, ID to TARGET
dtypes: float64(108), int64(263)
memory usage: 141.5 MB


In [38]:
df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

In [39]:
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### USING THE VARIANCE THRESHOLD FROM SKLEARN
variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features which variance doesn't meet some threshold. By default it removes all zero-features, i.e features that have the same value in all samples

In [40]:
x = df.drop(labels=['TARGET'],axis=1)

In [41]:
y = df['TARGET']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)


X_train.shape, X_test.shape

((35000, 370), (15000, 370))

In [43]:
model_thres = VarianceThreshold(threshold=0)
model_thres.fit(X_train)
sum(model_thres.get_support()) # indicates which features are retained after feature trimming i.e 312 features retained

312

In [46]:
X_train = model_thres.transform(X_train)
X_test = model_thres.transform(X_test)

X_train.shape, X_test.shape   # 58 features with constant varaibles were removed

((35000, 312), (15000, 312))

### Coding it ourselves (i.e without SKlearn VarianceThreshold)

In [57]:
data = pd.read_csv('santander.csv', nrows=50000) #importing data

# spliting the data to x and y dataframe
x = df.drop(labels=['TARGET'],axis=1) 
y = df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #splitting to train and test data


X_train.shape, X_test.shape


((35000, 370), (15000, 370))

In [58]:
# where constant_feature is a list of all the features with constant variables
constant_features = [feature for feature in X_train.columns 
    if X_train[feature].std() == 0]

In [59]:
# droping this features from the Train and Test data sets
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 312), (15000, 312))

### Removing constant features for categorical variable

In [75]:
data = pd.read_csv('santander.csv', nrows=50000) #importing data

# spliting the data to x and y dataframe
x = data.drop(labels=['TARGET'],axis=1) 
y = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #splitting to train and test data


X_train.shape, X_test.shape


((35000, 370), (15000, 370))

In [76]:
# converting all columns in the data set to objects (just for this work)
X_train = X_train.astype('O')
X_train.dtypes

ID                         object
var3                       object
var15                      object
imp_ent_var16_ult1         object
imp_op_var39_comer_ult1    object
                            ...  
saldo_medio_var44_hace2    object
saldo_medio_var44_hace3    object
saldo_medio_var44_ult1     object
saldo_medio_var44_ult3     object
var38                      object
Length: 370, dtype: object

In [77]:
# where constant_feature is a list of all the features with constant variables
constant_features = [feature for feature in X_train.columns 
    if len(X_train[feature].unique()) == 1] # 1 indicates only 1 category is been available

# droping this features from the Train and Test data sets
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 312), (15000, 312))

same as before 58 features were removed

# Quasi-constant Features
Quasi-constant features are thoose that show the same value for the great majority of the observation of the dataset.

In [78]:
data.shape

(50000, 371)

In [79]:
data.isnull().sum() #checking for missing data

ID                         0
var3                       0
var15                      0
imp_ent_var16_ult1         0
imp_op_var39_comer_ult1    0
                          ..
saldo_medio_var44_hace3    0
saldo_medio_var44_ult1     0
saldo_medio_var44_ult3     0
var38                      0
TARGET                     0
Length: 371, dtype: int64

In [89]:
# spliting the data to x and y dataframe
x = data.drop(labels=['TARGET'],axis=1) 
y = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #splitting to train and test data


X_train.shape, X_test.shape


((35000, 370), (15000, 370))

In [90]:
# removing constant features as done in the previous section
# where constant_feature is a list of all the features with constant variables
constant_features = [feature for feature in X_train.columns 
    if X_train[feature].std() == 0] # 1 indicates only 1 category is been available

# droping this features from the Train and Test data sets
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 312), (15000, 312))

### Using variance threshold from sklearn

In [91]:
model_quasi = VarianceThreshold(threshold=0.01) # i.e 99% of observations approximately 
model_quasi.fit(X_train)

VarianceThreshold(threshold=0.01)

In [92]:
# printing out the quasi constant features
print(
    len([
        x for x in X_train.columns
        if x not in X_train.columns[model_quasi.get_support()]
    ]))
[x for x in X_train.columns if x not in X_train.columns[model_quasi.get_support()]]

50


['ind_var1',
 'ind_var6_0',
 'ind_var6',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var29_0',
 'ind_var29',
 'ind_var30_0',
 'ind_var31_0',
 'ind_var31',
 'ind_var32_cte',
 'ind_var32_0',
 'ind_var32',
 'ind_var33_0',
 'ind_var33',
 'ind_var40',
 'ind_var39',
 'ind_var44_0',
 'ind_var44',
 'num_var6_0',
 'num_var6',
 'num_var18_0',
 'num_var18',
 'num_op_var40_hace3',
 'num_var29_0',
 'num_var29',
 'num_var33',
 'ind_var7_emit_ult1',
 'ind_var7_recib_ult1',
 'num_aport_var17_hace3',
 'num_aport_var33_hace3',
 'num_aport_var33_ult1',
 'num_var7_emit_ult1',
 'num_meses_var17_ult3',
 'num_meses_var29_ult3',
 'num_meses_var33_ult3',
 'num_meses_var44_ult3',
 'num_reemb_var13_ult1',
 'num_trasp_var17_in_hace3',
 'num_trasp_var17_in_ult1',
 'num_trasp_var17_out_ult1',
 'num_trasp_var33_in_hace3',
 'num_trasp_var33_in_ult1',
 'num_trasp_var33_out_ult1']

In [93]:
# checking the observation trimmed out
X_train['ind_var31'].value_counts() / np.float(len(X_train))

0    0.996486
1    0.003514
Name: ind_var31, dtype: float64

we have 0 with a count of 99.6% on observation and 1 with a count of 0.3% on observation, using a threshold would trim out such features from the dataset

In [94]:
X_train = model_quasi.transform(X_train)
X_test = model_quasi.transform(X_test)

In [95]:
X_train.shape, X_test.shape

((35000, 262), (15000, 262))

# Duplicated Features
Often datasets contain one or more features that show the same values across all the observations. This mean that both features are in essemce identical. Identifying and removing duplicated, and therefore redundant features, is an easy first step towards feature selection and more easily interppretable machine learning models

In [97]:
data = pd.read_csv('santander.csv', nrows=15000) # we reduce the number of observations because this process is computationally expensive

# spliting the data to x and y dataframe
x = data.drop(labels=['TARGET'],axis=1) 
y = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #splitting to train and test data


X_train.shape, X_test.shape


((10500, 370), (4500, 370))

In [99]:
data_t = X_train.T
data_t.head()

Unnamed: 0,10439,9236,818,11504,11722,5276,6863,13463,10228,11462,...,4373,7891,9225,14019,4859,13123,3264,9845,10799,2732
ID,20941.0,18583.0,1623.0,23060.0,23512.0,10564.0,13779.0,26969.0,20502.0,22981.0,...,8783.0,15901.0,18564.0,28142.0,9723.0,26306.0,6557.0,19796.0,21653.0,5441.0
var3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
var15,23.0,39.0,22.0,23.0,37.0,23.0,27.0,43.0,23.0,27.0,...,23.0,24.0,33.0,45.0,24.0,37.0,24.0,38.0,28.0,23.0
imp_ent_var16_ult1,0.0,0.0,150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
imp_op_var39_comer_ult1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
 data_t.duplicated().sum() #using the pandas duplicated funtion to identify the duplicated columns and calling sum funtion on it to give the total number of duplicated columns

105

In [106]:
duplicated_features = data_t[data_t.duplicated()] # visualizing the duplicated columns 
duplicated_features

Unnamed: 0,10439,9236,818,11504,11722,5276,6863,13463,10228,11462,...,4373,7891,9225,14019,4859,13123,3264,9845,10799,2732
ind_var2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_var13_medio_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_var13_medio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_var18_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_var18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
saldo_medio_var13_medio_hace2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
saldo_medio_var13_medio_hace3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
saldo_medio_var13_medio_ult1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
saldo_medio_var13_medio_ult3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
# capturing the features of the duplicated values by capturing the index vslues
duplicated_features = duplicated_features.index.values
duplicated_features

array(['ind_var2', 'ind_var13_medio_0', 'ind_var13_medio', 'ind_var18_0',
       'ind_var18', 'ind_var26', 'ind_var25', 'ind_var27_0',
       'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var29_0',
       'ind_var29', 'ind_var32', 'ind_var34_0', 'ind_var34', 'ind_var37',
       'ind_var40_0', 'ind_var40', 'ind_var41', 'ind_var39', 'ind_var44',
       'ind_var46_0', 'ind_var46', 'num_var13_medio_0', 'num_var13_medio',
       'num_var18_0', 'num_var18', 'num_var26', 'num_var25',
       'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27',
       'num_var29_0', 'num_var29', 'num_var32', 'num_var34_0',
       'num_var34', 'num_var37', 'num_var40_0', 'num_var40', 'num_var41',
       'num_var39', 'num_var46_0', 'num_var46', 'saldo_var13_medio',
       'saldo_var18', 'saldo_var28', 'saldo_var27', 'saldo_var29',
       'saldo_var34', 'saldo_var40', 'saldo_var41', 'saldo_var46',
       'delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3',
       'delta_imp_reemb_var17_1y3', 'delta_imp_ree

In [108]:
# removing the duplicated features and transposing back
data_unique = data_t.drop_duplicates(keep='first').T
data_unique.shape

(10500, 265)