In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the first 20,000 rows of the Santander CSV and preview the top of the frame.
data = pd.read_csv('santander.csv', nrows=20000)
data.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [4]:
# Separate the label column from the predictors.
# NOTE(review): the preview of `data` shows 'Unnamed: 0' and 'ID' columns; both stay in X
# and are therefore treated as features — confirm whether they should be dropped too.
y = data['TARGET']
X = data.drop(columns='TARGET')

X.shape, y.shape

((20000, 370), (20000,))

In [5]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

### Remove Constant, Quasi Constant and Duplicate Features

In [6]:
# Remove constant and quasi-constant features. The filter (threshold 0.01) is fitted on the
# training split only and then applied unchanged to both splits.
constant_filter = VarianceThreshold(threshold=0.01).fit(X_train)
X_train_filter, X_test_filter = (
    constant_filter.transform(split) for split in (X_train, X_test)
)

In [7]:
# Sanity check: both splits should report the same number of remaining columns.
X_train_filter.shape, X_test_filter.shape

((16000, 245), (4000, 245))

In [8]:
# Remove duplicate features, step 1: transpose so that every feature becomes a row,
# which lets row-wise duplicate detection be applied to features.
X_train_T = X_train_filter.transpose()
X_test_T = X_test_filter.transpose()

In [9]:
# Wrap the transposed matrices in DataFrames so DataFrame.duplicated() is available.
X_train_T, X_test_T = pd.DataFrame(X_train_T), pd.DataFrame(X_test_T)

In [10]:
# Number of transposed rows (i.e. features) that repeat an earlier row in the training data.
X_train_T.duplicated().sum()

18

In [11]:
# Boolean mask over features: True where a feature repeats an earlier one in the training data.
duplicated_features = X_train_T.duplicated(keep='first')

In [12]:
# Invert the duplicate mask so that True marks the features that survive.
features_to_keep = (~duplicated_features).tolist()

# The mask comes from the training data and is applied to both splits; transposing
# back restores the samples-by-features orientation.
X_train_unique = X_train_T.loc[features_to_keep].T
X_test_unique = X_test_T.loc[features_to_keep].T

In [13]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train_unique)
X_train_unique, X_test_unique = (
    scaler.transform(X_train_unique),
    scaler.transform(X_test_unique),
)

In [14]:
# Convert the scaled arrays back to DataFrames so .corr() and .drop() can be used later.
X_train_unique, X_test_unique = pd.DataFrame(X_train_unique), pd.DataFrame(X_test_unique)

In [15]:
# Sanity check: column counts after removing duplicate features.
X_train_unique.shape, X_test_unique.shape

((16000, 227), (4000, 227))

### Removal of Correlated Features

In [16]:
# Pairwise correlation matrix of the training features.
# NOTE(review): get_correlation() recomputes data.corr() internally, so this result does
# not appear to be used by the cells shown here — confirm before keeping it.
corrmat = X_train_unique.corr()

In [17]:
#find correlated features
def get_correlation(data, threshold):
    """Return the columns of ``data`` that are strongly correlated with an earlier column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``data.corr()``.
    threshold : float
        A column is flagged when its absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Labels of the flagged columns. The first column is never flagged
        because it has no predecessor.

    Notes
    -----
    A column is flagged even when the earlier column it correlates with is
    itself flagged, so a chain of correlated columns keeps only its first
    member. NaN correlations never compare greater than the threshold and are
    therefore ignored.
    """
    corrmat = data.corr()
    # Compare every pair at once instead of one scalar .iloc lookup per pair.
    exceeds = corrmat.abs().to_numpy() > threshold
    # k=-1 keeps only entries strictly below the diagonal, i.e. pairs (i, j) with j < i,
    # so each column is only compared against the columns that precede it.
    flagged_rows = np.tril(exceeds, k=-1).any(axis=1)
    return set(corrmat.columns[flagged_rows])

# Flag every feature whose absolute correlation with an earlier feature exceeds 0.70.
corr_features = get_correlation(X_train_unique, threshold=0.70)
print('correlated features: ', len(corr_features))

correlated features:  148


In [18]:
# Remove the flagged columns from both splits so they keep identical feature sets.
X_train_uncorr = X_train_unique.drop(columns=corr_features)
X_test_uncorr = X_test_unique.drop(columns=corr_features)

In [19]:
# Sanity check: column counts after removing the correlated features.
X_train_uncorr.shape, X_test_uncorr.shape

((16000, 79), (4000, 79))

## Feature Dimension Reduction by LDA, or Is It a Classifier?

In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [21]:
# Project onto a single discriminant component. The projection is learned from the
# training split (using its labels) and then reused for the test split.
lda = LDA(n_components=1)
lda.fit(X_train_uncorr, y_train)
X_train_lda, X_test_lda = lda.transform(X_train_uncorr), lda.transform(X_test_uncorr)

In [22]:
# Sanity check: each split should now have exactly one column.
X_train_lda.shape, X_test_lda.shape

((16000, 1), (4000, 1))

In [23]:
def run_randomForest(X_train, X_test, y_train, y_test):
    """Train a random forest on the training split and print its test-set accuracy.

    The forest uses 100 trees, a fixed ``random_state=0`` and all available
    cores (``n_jobs=-1``). Nothing is returned; the score is only printed.

    NOTE(review): accuracy is the only metric reported. If TARGET is heavily
    imbalanced this number can be dominated by the majority class —
    ``roc_auc_score`` is imported at the top of the notebook but not used here;
    confirm whether it should be reported as well.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, forest.predict(X_test))
    print('Accuracy on test set: ')
    print(test_accuracy)

In [24]:
%%time
# Random forest trained on the single LDA component.
run_randomForest(X_train_lda, X_test_lda, y_train, y_test)

Accuracy on test set: 
0.93025
CPU times: total: 2.48 s
Wall time: 583 ms


In [25]:
%%time
# Random forest trained on the features left after the correlation filter.
run_randomForest(X_train_uncorr, X_test_uncorr, y_train, y_test)

Accuracy on test set: 
0.9585
CPU times: total: 1.59 s
Wall time: 502 ms


In [26]:
%%time
# Baseline: random forest trained on the original, unfiltered split.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy on test set: 
0.9585
CPU times: total: 4.88 s
Wall time: 963 ms


### Feature Reduction by PCA? 

In [27]:
from sklearn.decomposition import PCA

In [28]:
# Fit a 2-component PCA on the training split only; unlike LDA, no labels are passed.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_train_uncorr)

In [29]:
# Project both splits with the PCA fitted on the training data, then check the shapes.
X_train_pca, X_test_pca = (
    pca.transform(split) for split in (X_train_uncorr, X_test_uncorr)
)
X_train_pca.shape, X_test_pca.shape

((16000, 2), (4000, 2))

In [30]:
%%time
# Random forest trained on the PCA-projected features.
run_randomForest(X_train_pca, X_test_pca, y_train, y_test)

Accuracy on test set: 
0.956
CPU times: total: 1.95 s
Wall time: 519 ms


In [31]:
%%time
# Same call as the earlier all-features run; presumably repeated here for a
# side-by-side comparison with the PCA result.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy on test set: 
0.9585
CPU times: total: 5.23 s
Wall time: 1.13 s


In [32]:
# Number of input features available to PCA (upper bound for n_components).
X_train_uncorr.shape

(16000, 79)

In [33]:
# Sweep the number of principal components from 1 to 29. For each setting, refit PCA on
# the training split, project both splits, and report the random-forest test accuracy.
# Note that `pca`, `X_train_pca` and `X_test_pca` are overwritten on every iteration.
for component in range(1, 30):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_uncorr)
    X_train_pca, X_test_pca = (
        pca.transform(X_train_uncorr),
        pca.transform(X_test_uncorr),
    )
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()

Selected Components:  1
Accuracy on test set: 
0.92375

Selected Components:  2
Accuracy on test set: 
0.956

Selected Components:  3
Accuracy on test set: 
0.95675

Selected Components:  4
Accuracy on test set: 
0.95825

Selected Components:  5
Accuracy on test set: 
0.9575

Selected Components:  6
Accuracy on test set: 
0.95725

Selected Components:  7
Accuracy on test set: 
0.9565

Selected Components:  8
Accuracy on test set: 
0.9565

Selected Components:  9
Accuracy on test set: 
0.9555

Selected Components:  10
Accuracy on test set: 
0.9565

Selected Components:  11
Accuracy on test set: 
0.95575

Selected Components:  12
Accuracy on test set: 
0.95725

Selected Components:  13
Accuracy on test set: 
0.957

Selected Components:  14
Accuracy on test set: 
0.95575

Selected Components:  15
Accuracy on test set: 
0.957

Selected Components:  16
Accuracy on test set: 
0.95625

Selected Components:  17
Accuracy on test set: 
0.9575

Selected Components:  18
Accuracy on test set: 
0.95