In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

import pdb
import pandas as pd
import numpy as np
import simpy
import random

from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

## Apply SMOTE to generate more examples of low-incidence state

In [None]:
# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/
# header=None: the first line of the file is read as data, so the columns
# get integer labels until they are renamed.
cmc = pd.read_csv("cmc.data", header=None)
cmc.head(n=5)

In [None]:
# Replace the integer column labels with descriptive names; the last one is
# the class label used as the prediction target.
cmc.columns = [
    'age',
    'education',
    'husband_education',
    'num_children',
    'religion',
    'works',
    'husband_occupation',
    'sol_index',
    'media_exposure',
    'contracep_method',
]
cmc.head(n=5)

In [None]:
# Class balance of the target. The label is categorical, so draw one bar per
# class (exact counts) instead of a 10-bin continuous histogram, label the
# axes, and end with ';' so the cell does not echo a raw return value.
fig, ax = plt.subplots()
cmc.contracep_method.value_counts().sort_index().plot.bar(ax=ax)
ax.set(title="Class balance: contracep_method", xlabel="contracep_method", ylabel="number of rows");

In [None]:
# Share of rows belonging to the rarest class, computed from the data instead
# of from counts typed in by hand (which silently go stale if the file changes).
cmc.contracep_method.value_counts(normalize=True).min()

In [None]:
# SMOTE oversampler built with the library defaults (no arguments are passed,
# so no explicit random_state is set here).
smote_obj = SMOTE()

In [None]:
# Positional split: the first nine columns form the feature matrix and the
# tenth column is the target.
X, y = cmc.iloc[:, :9], cmc.iloc[:, 9]

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Distinct class labels that occur in the held-out split.
set(y_test)

In [None]:
# Class balance of the held-out labels. One bar per class (exact counts) with
# labelled axes; the trailing ';' keeps the cell from echoing a raw return value.
fig, ax = plt.subplots()
y_test.value_counts().sort_index().plot.bar(ax=ax)
ax.set(title="Class balance: test split", xlabel="class label", ylabel="number of rows");

## train a classifier

In [None]:
# Seed NumPy's global random number generator so that later draws from it
# repeat between runs.
np.random.seed(21)

In [None]:
# Same call, arguments and random_state as the earlier split cell, so it
# produces the same partition of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Baseline model: a 10-tree random forest fitted on the original training split.
clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
# The cell displays the score on the held-out rows.
clf.score(X_test, y_test)

In [None]:
# Confusion matrix of the current classifier on the held-out split
# (true labels first, predictions second).
confusion_matrix(y_test, clf.predict(X_test))

In [None]:
# Per-class recall (correct predictions / true members of the class) for the
# current classifier, computed from the model rather than from counts copied
# by hand out of an earlier confusion-matrix output. One value per class, in
# sorted label order.
recall_score(y_test, clf.predict(X_test), average=None)

In [None]:
# Oversample the training split only (the test split stays untouched).
# `fit_resample` is the supported sampler entry point; `fit_sample` was a
# deprecated alias that has been removed from imbalanced-learn.
X_train_smote, y_train_smote = smote_obj.fit_resample(X_train, y_train)

### Exercise: examine some feature distributions after oversampling

### Does SMOTE improve the model?

In [None]:
# Number of training rows per class; position i of the result is the count of
# label i in the original (not oversampled) training labels.
np.bincount(y_train)

In [None]:
# Re-seed NumPy's global generator, then fit the same 10-tree forest on the
# SMOTE-resampled training data.
np.random.seed(21)
clf = RandomForestClassifier(n_estimators=10).fit(X_train_smote, y_train_smote)
# Scored on the original, un-resampled held-out rows.
clf.score(X_test, y_test)

In [None]:
# Confusion matrix of the current classifier on the held-out split
# (true labels first, predictions second).
confusion_matrix(y_test, clf.predict(X_test))

In [None]:
# Per-class recall (correct predictions / true members of the class) for the
# classifier `clf` currently refers to, computed from the model rather than
# from counts copied by hand out of an earlier confusion-matrix output.
# One value per class, in sorted label order.
recall_score(y_test, clf.predict(X_test), average=None)

## Repeat this analysis with a less balanced data set

In [None]:
# Source: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing#
# The first CSV column is used as the row index.
bank_df = pd.read_csv("bank.csv", index_col=0)
# Print a preview of the frame, then the per-column non-null counts for each
# value of the target column "y".
print(bank_df.head(), bank_df.groupby("y").count(), sep="\n")

## Convert categories to numeric labels

In [None]:
# Lookup tables for turning the text categories into numeric codes.
YES_NO = {"yes": 1, "no": 0}
EDUCATION_LEVELS = {"unknown": -1, "primary": 0, "secondary": 1, "tertiary": 2}
MARITAL_STATUS = {"married": 0, "single": 1, "divorced": 2}
NOT_EMPLOYED = ["unemployed", "unknown"]


def encode_text_column(column, encoder):
    """Encode a text column, leaving an already-numeric column untouched.

    Parameters
    ----------
    column : pandas.Series
        Column to encode.
    encoder : callable
        Function taking the Series and returning its numeric encoding.

    Returns
    -------
    pandas.Series
        ``encoder(column)`` if ``column`` is not numeric, else ``column``.

    Notes
    -----
    The numeric guard makes this cell safe to run more than once. Without it,
    a second run would map the already-encoded values through the lookup
    tables and turn every entry into NaN (and every ``job`` value into 1).
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return encoder(column)


# Binary yes/no columns, including the target "y".
for yes_no_col in ["default", "housing", "loan", "y"]:
    bank_df[yes_no_col] = encode_text_column(
        bank_df[yes_no_col], lambda col: col.map(YES_NO))

# Values missing from a lookup table become NaN, as with a plain Series.map.
bank_df["education"] = encode_text_column(
    bank_df["education"], lambda col: col.map(EDUCATION_LEVELS))
bank_df["marital"] = encode_text_column(
    bank_df["marital"], lambda col: col.map(MARITAL_STATUS))

# job collapses to a flag: 0 for 'unemployed'/'unknown', 1 for anything else.
bank_df["job"] = encode_text_column(
    bank_df["job"], lambda col: (~col.isin(NOT_EMPLOYED)).astype(int))

In [None]:
# Positional split: the first seven columns are the features and the eighth
# column is the target; then hold out 33% of the rows with a fixed seed.
X, y = bank_df.iloc[:, :7], bank_df.iloc[:, 7]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# A 5-tree random forest fitted on the bank training split.
clf = RandomForestClassifier(n_estimators=5).fit(X_train, y_train)
# The cell displays the score on the held-out rows.
clf.score(X_test, y_test)

In [None]:
# Confusion matrix of the current classifier on the held-out split
# (true labels first, predictions second).
confusion_matrix(y_test, clf.predict(X_test))

## Exercise: Oversample the minority class with the SMOTE method and examine whether it has changed the data distribution

## Let's take a look at the source code

In [None]:
# from the source code
def _sample(self, X, y):
    # FIXME: uncomment in version 0.6
    # self._validate_estimator()

    X_resampled = X.copy()
    y_resampled = y.copy()

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, y.dtype, class_sample, ######################
                                          X_class, nns, n_samples, 1.0)   ######################

        if sparse.issparse(X_new):
            X_resampled = sparse.vstack([X_resampled, X_new])
            sparse_func = 'tocsc' if X.format == 'csc' else 'tocsr'
            X_resampled = getattr(X_resampled, sparse_func)()
        else:
            X_resampled = np.vstack((X_resampled, X_new))
        y_resampled = np.hstack((y_resampled, y_new))

    return X_resampled, y_resampled


In [None]:
def _make_samples(self,
                  X,
                  y_dtype,
                  y_type,
                  nn_data,
                  nn_num,
                  n_samples,
                  step_size=1.):
    """A support function that returns artificial samples constructed along
    the line connecting nearest neighbours.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Points from which the points will be created.
    y_dtype : dtype
        The data type of the targets.
    y_type : str or int
        The minority target value, just so the function can return the
        target values for the synthetic variables with correct length in
        a clear format.
    nn_data : ndarray, shape (n_samples_all, n_features)
        Data set carrying all the neighbours to be used
    nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
        The nearest neighbours of each sample in `nn_data`.
    n_samples : int
        The number of samples to generate.
    step_size : float, optional (default=1.)
        The step size to create samples.
    Returns
    -------
    X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
        Synthetically generated samples.
    y_new : ndarray, shape (n_samples_new,)
        Target values for synthetic samples.
    """
    random_state = check_random_state(self.random_state)
    samples_indices = random_state.randint(
        low=0, high=len(nn_num.flatten()), size=n_samples)
    steps = step_size * random_state.uniform(size=n_samples)
    rows = np.floor_divide(samples_indices, nn_num.shape[1])        ######################
    cols = np.mod(samples_indices, nn_num.shape[1])                 ######################

    y_new = np.array([y_type] * len(samples_indices), dtype=y_dtype)

    if sparse.issparse(X):
        row_indices, col_indices, samples = [], [], []
        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
            if X[row].nnz:
                sample = self._generate_sample(X, nn_data, nn_num,    ######################
                                               row, col, step)        ######################
                row_indices += [i] * len(sample.indices)
                col_indices += sample.indices.tolist()
                samples += sample.data.tolist()
        return (sparse.csr_matrix((samples, (row_indices, col_indices)),
                                  [len(samples_indices), X.shape[1]],
                                  dtype=X.dtype),
                y_new)
    else:
        X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype)
        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
            X_new[i] = self._generate_sample(X, nn_data, nn_num,
                                             row, col, step)
        return X_new, y_new

In [None]:
def _generate_sample(self, X, nn_data, nn_num, row, col, step):
    r"""Generate a synthetic sample.
    The rule for the generation is:
    .. math::
       \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times
       (\mathbf{s_{i}} - \mathbf{s_{nn}}) \,
    where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is
    the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of
    \mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1).
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Points from which the points will be created.
    nn_data : ndarray, shape (n_samples_all, n_features)
        Data set carrying all the neighbours to be used.
    nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
        The nearest neighbours of each sample in `nn_data`.
    row : int
        Index pointing at feature vector in X which will be used
        as a base for creating new sample.
    col : int
        Index pointing at which nearest neighbor of base feature vector
        will be used when creating new sample.
    step : float
        Step size for new sample.
    Returns
    -------
    X_new : {ndarray, sparse matrix}, shape (n_features,)
        Single synthetically generated sample.
    """
    return X[row] - step * (X[row] - nn_data[nn_num[row, col]])