In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# this classifier usually outperforms most off-the-shelf classifiers
from sklearn.ensemble import GradientBoostingClassifier
# metric for optimization
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# some of the methods we need for working with imbalanced data, such as KNN,
# are sensitive to the magnitude of the features
from sklearn.preprocessing import MinMaxScaler

# reduce no. of features
from feature_engine.selection import (DropDuplicateFeatures,
                                     DropConstantFeatures)

# over sampling
from imblearn.over_sampling import (RandomOverSampler, SMOTENC)

# under sampling

from imblearn.under_sampling import (InstanceHardnessThreshold,
                                    RandomUnderSampler)

# ensemble methods with boosting which tend to work better

from imblearn.ensemble import (RUSBoostClassifier,
                              EasyEnsembleClassifier)


In [2]:
# Load the Santander Customer Satisfaction dataset from the parent directory.
# A forward slash is used as the separator so the relative path resolves on
# Windows, Linux and macOS alike (a backslash is only a separator on Windows).
data = pd.read_csv("../train.csv")

### Variable Exploration

In [3]:
# Check for missing values: collect the name of every column that contains
# at least one null entry. An empty list means the dataset has no missing data.
nullCol = [col for col in data.columns if data[col].isnull().sum() > 0]

print(nullCol)

[]


In [4]:
# List the names of all columns stored with the `object` dtype
# (the dtype pandas uses for string data).
object_columns = data.select_dtypes(include='object').columns
list(object_columns)

[]

In [5]:
# Check how many variables in the dataset are binary or otherwise
# low-cardinality, i.e. have at most 2, 10 or 20 distinct values.

# Number of distinct values per column, computed once up front rather than
# once per column for every threshold.
n_unique_per_col = data.nunique()

for unique in [2, 10, 20]:
    # `vars_` now only ever holds the count (it used to be a list first).
    vars_ = int((n_unique_per_col <= unique).sum())
    print(f'{vars_} variables with less than or equal to {unique} values')

140 variables with less than or equal to 2 values
239 variables with less than or equal to 10 values
254 variables with less than or equal to 20 values


In [6]:
# Split into train and test sets, holding out 33% of the rows for evaluation.
# The `ID` and `TARGET` columns are removed from the feature matrix; `TARGET`
# is used as the label.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['ID', 'TARGET'], axis=1),
    data['TARGET'],
    test_size=0.33,
    random_state=0,  # fixed seed so Restart & Run All reproduces the same split
)

X_train.shape, X_test.shape

((50933, 369), (25087, 369))

In [7]:
# Inspect the class imbalance in the training labels
# (0 is satisfied, 1 is not satisfied): relative frequencies first, raw counts second.

class_fractions = y_train.value_counts(normalize=True)
class_counts = y_train.value_counts()

class_fractions, class_counts

(0    0.960929
 1    0.039071
 Name: TARGET, dtype: float64,
 0    48943
 1     1990
 Name: TARGET, dtype: int64)

### Drop constant, quasi-constant and duplicated features

In [8]:
# Feature-selection pipeline, fitted on the training split only.
# NOTE(review): with tol=1 (the default, per the repr this cell displays) the first step
# should remove only strictly constant features; a lower tol would be needed to also drop
# quasi-constant ones -- confirm against the feature_engine docs.
selection_steps = [
    ('constant', DropConstantFeatures(tol=1)),
    ('duplicated', DropDuplicateFeatures()),
]
pipe = Pipeline(selection_steps)

pipe.fit(X_train, y_train)

Pipeline(steps=[('constant', DropConstantFeatures()),
                ('duplicated', DropDuplicateFeatures())])

In [9]:
# Number of features the fitted 'constant' step marked for removal
constant_step = pipe.named_steps['constant']
len(constant_step.features_to_drop_)

49

In [10]:
# Number of features the fitted 'duplicated' step marked for removal
duplicated_step = pipe.named_steps['duplicated']
len(duplicated_step.features_to_drop_)

25

In [11]:
# Apply the fitted selection pipeline to both splits and report how many
# feature columns remain. Note: this cell overwrites X_train / X_test, so it
# is meant to be run once per kernel session.
n_features_before = X_train.shape[1]
print('No. of features before drop: ', n_features_before)

X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

n_features_after = X_train.shape[1]
print('No. of features after drop: ', n_features_after)

No. of features before drop:  369
No. of features after drop:  295
