In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# this classifier usually outperforms most off the shelf classifier
from sklearn.ensemble import GradientBoostingClassifier
# metric for optimization
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# some methods we need to work with imbalanced data are sensitive to the magnitude of features
# such as KNN
from sklearn.preprocessing import MinMaxScaler

# reduce no. of features
from feature_engine.selection import (DropDuplicateFeatures,
                                     DropConstantFeatures)

# over sampling
from imblearn.over_sampling import (RandomOverSampler, SMOTENC)

# under sampling

from imblearn.under_sampling import (InstanceHardnessThreshold,
                                    RandomUnderSampler)

# ensemble methods with boosting which tend to work better

from imblearn.ensemble import (RUSBoostClassifier,
                              EasyEnsembleClassifier)


In [2]:
# load Santander Customer Satisfaction dataset
data = pd.read_csv("..\\train.csv")

In [42]:
data.shape

(76020, 371)

### Variable Exploration

In [3]:
# Check for missing values
nullCol=[]

for i in data.columns:
    if data[i].isnull().sum()>0:
        append.nullCol

print(nullCol)

[]


In [4]:
#put in a list any column with strings
list(data.select_dtypes(include='object').columns.values)

[]

In [5]:
# check the dataset if how many are binary of have <10 or <20 unique variables

for unique in [2,10,20]:
    vars_ = [x for x in data.columns if data[x].nunique()<=unique]
    vars_ = len(vars_)
    print(f'{vars_} variables with less than or equal to {unique} values')

140 variables with less than or equal to 2 values
239 variables with less than or equal to 10 values
254 variables with less than or equal to 20 values


This shows that we have 140 features that are binary. This is important for over sampling or under sampling methods for imbalanced datasets as some methods use distance metrics that are not suitable for discrete variables.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['ID', 'TARGET'],axis=1), data['TARGET'], test_size=0.8,) # test_size increased to prevent loss of memory 

X_train.shape, X_test.shape

((15204, 369), (60816, 369))

In [7]:
# Check the imbalance, 0 is satisfied, 1 is not satisfied

y_train.value_counts(normalize=True), y_train.value_counts()

(0    0.959024
 1    0.040976
 Name: TARGET, dtype: float64,
 0    14581
 1      623
 Name: TARGET, dtype: int64)

### Drop constant, quasi-constant and duplicated features

In [8]:
pipe = Pipeline([('constant', DropConstantFeatures(tol=1)),
                ('duplicated',DropDuplicateFeatures())])

pipe.fit(X_train, y_train)

Pipeline(steps=[('constant', DropConstantFeatures()),
                ('duplicated', DropDuplicateFeatures())])

In [9]:
# see how many constant features in the dataset
len(pipe.named_steps['constant'].features_to_drop_)

80

In [10]:
# see number of duplicated features are in the dataset
len(pipe.named_steps['duplicated'].features_to_drop_)

25

In [11]:
# go ahead and remove all duplicated and constant features
print('No. of features before drop: ', X_train.shape[1])

X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

print('No. of features after drop: ', X_train.shape[1])

No. of features before drop:  369
No. of features after drop:  264


### Gradient Boosting

Gradient boosting models outperform all off the shelf classification models.

In [12]:
gbm = GradientBoostingClassifier(loss='exponential',
                                 max_depth=1,
                                 min_samples_split= 0.80,
                                 n_estimators=100)

gbm.fit(X_train, y_train)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [13]:
# benchmark scores

X_train_pred = gbm.predict_proba(X_train)[:,1]
X_test_pred = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc score', roc_auc_score(y_train, X_train_pred))
print('Test roc_auc score', roc_auc_score(y_test, X_test_pred))

Train roc_auc score 0.8314471888535873
Test roc_auc score 0.8260094052053799


## Handling Imbalance

In [14]:
# Using Instance Hardness Threshold to remove difficult to classify observations correctly from the majority class
# A measure of how difficult to clasify an observation correctly, it is inversely correlated to the probability of its class.

iht = InstanceHardnessThreshold(estimator=gbm, # gradient boosting classifier
                               sampling_strategy='auto',# undersamples only the majority class
                               cv=2) 

# resample
X_resampled, y_resampled = iht.fit_resample(X_train, y_train)

# shape of original data and resampled data
X_train.shape, X_resampled.shape

((15204, 264), (1389, 264))

In [15]:
# see resampled ratio, instance hardness is a fixed method and aims for 50:50
y_resampled.value_counts(normalize=True)

0    0.551476
1    0.448524
Name: TARGET, dtype: float64

In [16]:
# train the model on the resampled data

gbm.fit(X_resampled, y_resampled)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [17]:
X_train_pred = gbm.predict_proba(X_resampled)[:,1]
X_test_pred = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc score', roc_auc_score(y_resampled, X_train_pred))
print('Test roc_auc score', roc_auc_score(y_test, X_test_pred))

Train roc_auc score 0.9999853316513627
Test roc_auc score 0.7887026849242564


The model overfits the train set and the test set also does not show better performance

### Random UnderSampling
Quite often neglected as it reduces the training set significantly

In [18]:
rus = RandomUnderSampler(
        sampling_strategy='auto',) #undersamples only the majority class

X_resampled, y_resampled=rus.fit_resample(X_train,y_train)

X_train.shape, X_resampled.shape

((15204, 264), (1246, 264))

In [19]:
y_resampled.value_counts(normalize=True), y_resampled.value_counts()

(0    0.5
 1    0.5
 Name: TARGET, dtype: float64,
 0    623
 1    623
 Name: TARGET, dtype: int64)

In [20]:
# Train the model
gbm.fit(X_resampled,y_resampled)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [21]:
# Get the performance of the train and test set

X_train_pred = gbm.predict_proba(X_resampled)[:,1]
X_test_pred = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_resampled, X_train_pred))
print('Test roc)_auc: ', roc_auc_score(y_test, X_test_pred))

Train roc_auc:  0.8486353763825945
Test roc)_auc:  0.825212977646375


Even with undersampling, the result still doesn't show any marked improvement compared to using all the variables.

### Random Oversampling
In essence duplicates data in the minority class, so may lead to over fitting

In [22]:
ros=RandomOverSampler(sampling_strategy='auto')

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

X_train.shape, X_resampled.shape # we have more samples now in the minority class

((15204, 264), (29162, 264))

In [23]:
y_resampled.value_counts(normalize=True)

0    0.5
1    0.5
Name: TARGET, dtype: float64

In [24]:
# train the model on the resampled data
gbm.fit(X_resampled,y_resampled)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [25]:
# see performance of the model on the train and test set
X_train_pred = gbm.predict_proba(X_resampled)[:,1]
X_test_pred = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_resampled, X_train_pred))
print('Test roc_auc: ', roc_auc_score(y_test, X_test_pred))

Train roc_auc:  0.8358508294145702
Test roc_auc:  0.8285176979696205


Still, the model offers a small increase in performance. We would have to use cross-validation and get a measure of the error dispersion to be sure to see if it is within the error of the model.

## SMOTENC
Smote interpolate synthetic data using its 5 nearest neighbors, SMOTE-NC will be used as it works with discrete data.

In [26]:
# first we need to find out the index of features that are categorical of discrete

# capture discrete features in a list
cat_feats = [feat for feat in X_train.columns if X_train[feat].nunique()<=10]

In [27]:
# capture the index in the dataframe columns
cat_feats_index = [cat_feats.index(x) for x in cat_feats]

cat_feats_index[0:6]

[0, 1, 2, 3, 4, 5]

In [28]:
smnc = SMOTENC(
    sampling_strategy='auto', # samples only the minority class
    random_state=0,  # for reproducibility
    k_neighbors=3,
    categorical_features=cat_feats_index # indeces of the columns of discrete variables
)  

# because SMOTE uses KNN, and KNN is sensible to variable magnitude, we re-scale the data

# this procedure will take a while, it also caused memory problems so train/test ratio was increased for test
X_resampled, y_resampled = smnc.fit_resample(MinMaxScaler().fit_transform(X_train), y_train)

X_train.shape, X_resampled.shape

((15204, 264), (29162, 264))

In [29]:
# check the distribution of the resampled target
# we should have 50:50 now

y_resampled.value_counts(normalize=True)

0    0.5
1    0.5
Name: TARGET, dtype: float64

In [30]:
# train the model 

gbm.fit(X_resampled, y_resampled)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [31]:
# Now let's get the performance on train and test

X_train_preds = gbm.predict_proba(X_resampled)[:,1]
X_test_preds = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_resampled, X_train_preds))
print('Test roc_auc: ', roc_auc_score(y_test, X_test_preds))

Train roc_auc:  0.849827754505443
Test roc_auc:  0.7120819133836906


SMOTENC oversampling method is worse for the outcome.

### Ensemble Methods

RUSBoost and Easy Ensemble, both based on boosting methods tend to return better performance

In [32]:
# load Santander Customer Satisfaction dataset
data = pd.read_csv("..\\train.csv")

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['ID', 'TARGET'],axis=1), data['TARGET'], test_size=0.8,) # test size returned to usual size

X_train.shape, X_test.shape

((15204, 369), (60816, 369))

In [33]:
# set up the RUSBoost ensemble model

rusboost = RUSBoostClassifier(
        base_estimator=None,
        n_estimators=20,
        learning_rate=1.0,
        sampling_strategy='auto',
        random_state=2909,
    )


# train model
rusboost.fit(X_train, y_train)

RUSBoostClassifier(n_estimators=20, random_state=2909)

In [34]:
# Now let's get the performance on train and test

X_train_preds = rusboost.predict_proba(X_train)[:,1]
X_test_preds = rusboost.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_train, X_train_preds))
print('Test roc_auc: ', roc_auc_score(y_test, X_test_preds))


Train roc_auc:  0.839244382115701
Test roc_auc:  0.8054505217026315


In [35]:
easy = EasyEnsembleClassifier(
        n_estimators=10,
        sampling_strategy='auto',
        random_state=2909,
    )


# train model
easy.fit(X_train, y_train)

EasyEnsembleClassifier(random_state=2909)

In [36]:
# Now let's get the performance on train and test

X_train_preds = easy.predict_proba(X_train)[:,1]
X_test_preds = easy.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_train, X_train_preds))
print('Test roc_auc: ', roc_auc_score(y_test, X_test_preds))

Train roc_auc:  0.8587193580814448
Test roc_auc:  0.8027014211682129


Ensemble methods did not improve the performance.

### Cost sensitive approach
Misclassification of the minority class will be penalized at a higher cost

In [37]:
# we have an imbalance of 95 to 5, so we use those as weights
sample_weight = np.where(y_train==1, 95, 5)

# train model
gbm.fit(X_train, y_train, sample_weight)

GradientBoostingClassifier(loss='exponential', max_depth=1,
                           min_samples_split=0.8)

In [38]:
# Now let's get the performance on train and test

X_train_preds = gbm.predict_proba(X_train)[:,1]
X_test_preds = gbm.predict_proba(X_test)[:,1]

print('Train roc_auc: ', roc_auc_score(y_train, X_train_preds))
print('Test roc_auc: ', roc_auc_score(y_test, X_test_preds))

Train roc_auc:  0.8481492357134701
Test roc_auc:  0.8257616648348273


From all the techniques that we tested in this notebook, the benchmark model trained on the entire dataset and the 1 with cost-sensitive learning seem to be the ones that perform the best. So to follow up, we could optimize parameters on these to see if this improves model performance.