# Working with highly imbalanced data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 200
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

In [None]:
# Synthetic two-class toy problem with a 10:1 imbalance: 1000 points of class 0
# drawn from a wide Gaussian around the origin and 100 points of class 1 drawn
# from a narrow Gaussian shifted to (2, 2).
rng = np.random.RandomState(0)
n_samples_1 = 1000
n_samples_2 = 100
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
        0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
# Seed the split so that Restart & Run All reproduces the same partition.
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(
    X_syn, y_syn, random_state=0)

In [None]:
import openml
# mammography dataset https://www.openml.org/d/310
# Fetch OpenML dataset 310 and split it into features X and target y, using the
# dataset's own default target attribute as the target column.
# NOTE(review): the two-value unpacking assumes get_data returns exactly (X, y);
# confirm against the installed openml version.
data = openml.datasets.get_dataset(310)
X, y = data.get_data(target=data.default_target_attribute)

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Number of occurrences of each integer label in y, i.e. the class balance.
np.bincount(y)

In [None]:
# Wrap the features in a DataFrame so the pandas plotting helpers can be used.
df = pd.DataFrame(X)

In [None]:
# One histogram per column; bins='auto' delegates the bin choice to the plotting backend.
df.hist(bins='auto')

In [None]:
# Pairwise scatter plots of all features, coloured by class label.
# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias has been removed from pandas.
pd.plotting.scatter_matrix(df, c=y, alpha=.2);

In [None]:
# Seeded hold-out split of the real data, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y)

In [None]:
from sklearn.decomposition import PCA
# PCA with all components on the unscaled training features, then plot the
# fraction of variance explained by each component.
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
plt.plot(pca.explained_variance_ratio_)

In [None]:
# First two principal components, coloured by label. Left: points in their
# original order. Right: points ordered by ascending label, so samples with the
# larger label are drawn last and end up on top.
sorting = np.argsort(y_train)
fig, axes = plt.subplots(1, 2)
ax_raw, ax_sorted = axes
pc1, pc2 = X_train_pca[:, 0], X_train_pca[:, 1]
ax_raw.scatter(pc1, pc2, c=y_train[:], alpha=.3)
ax_sorted.scatter(pc1[sorting], pc2[sorting], c=y_train[sorting], alpha=.3)

In [None]:
from sklearn.preprocessing import RobustScaler
# Learn the robust scaling statistics from the training split, then apply them
# to that same split.
rs = RobustScaler()
rs.fit(X_train)
X_train_scaled = rs.transform(X_train)

In [None]:
# Same PCA as before, but on the robustly scaled training features; plot the
# fraction of variance explained by each component.
pca_scaled = PCA()
X_train_pca_scaled = pca_scaled.fit_transform(X_train_scaled)
plt.plot(pca_scaled.explained_variance_ratio_)

In [None]:
# First two principal components of the scaled data, coloured by label.
# Left: original order. Right: ordered by label via the `sorting` index.
fig, axes = plt.subplots(1, 2)
ax_raw, ax_sorted = axes
pc1, pc2 = X_train_pca_scaled[:, 0], X_train_pca_scaled[:, 1]
ax_raw.scatter(pc1, pc2, c=y_train[:], alpha=.3)
ax_sorted.scatter(pc1[sorting], pc2[sorting], c=y_train[sorting], alpha=.3)

In [None]:
# Raw feature 3 against feature 4, coloured by label: once in the stored order
# and once ordered by ascending label.
sorting = np.argsort(y_train)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
ax_raw, ax_sorted = axes
feat3, feat4 = X_train[:, 3], X_train[:, 4]
ax_raw.scatter(feat3, feat4, c=y_train[:], alpha=.3)
ax_sorted.scatter(feat3[sorting], feat4[sorting], c=y_train[sorting], alpha=.3)
ax_raw.set_title("Feature 3 vs 4 random order")
ax_sorted.set_title("Feature 3 vs 4 sorted")

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline: 10-fold cross-validated ROC AUC of a default logistic regression.
scores = cross_val_score(
    LogisticRegression(), X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression that tunes its regularisation internally on ROC AUC,
# evaluated with an outer 10-fold cross-validation on the same metric.
scores = cross_val_score(
    LogisticRegressionCV(scoring='roc_auc'),
    X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop samples (without replacement) to rebalance the training set,
# then compare shapes and class counts before and after.
rus = RandomUnderSampler(replacement=False)
# fit_resample is the current sampler API; fit_sample was its deprecated
# predecessor and no longer exists in recent imbalanced-learn releases.
X_train_subsample, y_train_subsample = rus.fit_resample(X_train, y_train)
print(X_train.shape)
print(X_train_subsample.shape)
print(np.bincount(y_train_subsample))

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# Under-sample inside the pipeline so resampling is applied only when fitting
# on each training fold; score with 10-fold ROC AUC.
undersample_pipe = make_imb_pipeline(
    RandomUnderSampler(),
    LogisticRegressionCV(),
)
scores = cross_val_score(
    undersample_pipe, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly over-sample to rebalance the training set, then compare shapes and
# class counts before and after.
ros = RandomOverSampler()
# fit_resample is the current sampler API; fit_sample was its deprecated
# predecessor and no longer exists in recent imbalanced-learn releases.
X_train_oversample, y_train_oversample = ros.fit_resample(X_train, y_train)
print(X_train.shape)
print(X_train_oversample.shape)
print(np.bincount(y_train_oversample))

In [None]:
# Random over-sampling followed by logistic regression, scored with 10-fold
# cross-validated ROC AUC.
oversample_pipe = make_imb_pipeline(
    RandomOverSampler(),
    LogisticRegression(),
)
scores = cross_val_score(
    oversample_pipe, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from sklearn.metrics import roc_curve
# Fit each logistic-regression variant on the training split and compute its
# ROC curve on the test split, scoring with the predicted probability of
# class 1 (second column of predict_proba).
oversample_pipe.fit(X_train, y_train)
props_oversample = oversample_pipe.predict_proba(X_test)[:, 1]
fpr_over, tpr_over = roc_curve(y_test, props_oversample)[:2]

undersample_pipe.fit(X_train, y_train)
props_undersample = undersample_pipe.predict_proba(X_test)[:, 1]
fpr_under, tpr_under = roc_curve(y_test, props_undersample)[:2]

lr = LogisticRegression().fit(X_train, y_train)
props_original = lr.predict_proba(X_test)[:, 1]
fpr_org, tpr_org = roc_curve(y_test, props_original)[:2]

# Overlay the three curves in a fixed order so line colours stay stable.
curves = [
    (fpr_org, tpr_org, "original"),
    (fpr_over, tpr_over, "oversample"),
    (fpr_under, tpr_under, "undersample"),
]
for fpr, tpr, label in curves:
    plt.plot(fpr, tpr, label=label)
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with 100 trees, 10-fold cross-validated ROC AUC.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=100),
    X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
# Random under-sampling followed by a random forest. The forest size is pinned
# to 100 trees, matching the other forests in this notebook, so that the
# comparison measures the effect of resampling rather than of a
# library-version-dependent default number of trees.
undersample_pipe_rf = make_imb_pipeline(RandomUnderSampler(),
                                        RandomForestClassifier(n_estimators=100))
scores = cross_val_score(undersample_pipe_rf, X_train, y_train, cv=10, scoring='roc_auc')
print(np.mean(scores))

In [None]:
# Random over-sampling followed by a random forest. The forest size is pinned
# to 100 trees, matching the other forests in this notebook, so that the
# comparison measures the effect of resampling rather than of a
# library-version-dependent default number of trees.
oversample_pipe_rf = make_imb_pipeline(RandomOverSampler(),
                                       RandomForestClassifier(n_estimators=100))
scores = cross_val_score(oversample_pipe_rf, X_train, y_train, cv=10, scoring='roc_auc')
print(np.mean(scores))

In [None]:
from sklearn.metrics import roc_curve
# Fit each random-forest variant on the training split and compute its ROC
# curve on the test split, scoring with the predicted probability of class 1
# (second column of predict_proba).
oversample_pipe_rf.fit(X_train, y_train)
props_oversample = oversample_pipe_rf.predict_proba(X_test)[:, 1]
fpr_over, tpr_over = roc_curve(y_test, props_oversample)[:2]

undersample_pipe_rf.fit(X_train, y_train)
props_undersample = undersample_pipe_rf.predict_proba(X_test)[:, 1]
fpr_under, tpr_under = roc_curve(y_test, props_undersample)[:2]

rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
props_original = rf.predict_proba(X_test)[:, 1]
fpr_org, tpr_org = roc_curve(y_test, props_original)[:2]

# Overlay the three curves in a fixed order so line colours stay stable.
curves = [
    (fpr_org, tpr_org, "original"),
    (fpr_over, tpr_over, "oversample"),
    (fpr_under, tpr_under, "undersample"),
]
for fpr, tpr, label in curves:
    plt.plot(fpr, tpr, label=label)
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("RF comparison")

# Class Weights

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with class_weight='balanced' instead of resampling;
# 10-fold cross-validated ROC AUC.
scores = cross_val_score(
    LogisticRegression(class_weight='balanced'),
    X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 100-tree random forest with class_weight='balanced'; 10-fold cross-validated
# ROC AUC.
balanced_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
scores = cross_val_score(balanced_rf, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

# Resampled Ensembles

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
def make_resample_tree(random_state=0):
    """Build one named "under-sample, then decision tree" pipeline.

    Parameters
    ----------
    random_state : int, default=0
        Seed used for both the under-sampler and the tree; it also forms the
        suffix of the returned name.

    Returns
    -------
    (str, pipeline)
        A ``("tree_<random_state>", pipeline)`` pair, i.e. the
        ``(name, estimator)`` form that ``VotingClassifier`` takes.
    """
    # max_features='sqrt' is what 'auto' meant for classifiers; the 'auto'
    # spelling is no longer accepted by recent scikit-learn releases.
    tree = make_imb_pipeline(RandomUnderSampler(random_state=random_state, replacement=True),
                             DecisionTreeClassifier(max_features='sqrt', random_state=random_state))
    return "tree_{}".format(random_state), tree
# 100 independently seeded (sampler -> tree) pipelines, combined by averaging
# their predicted probabilities (soft voting).
classifiers = list(map(make_resample_tree, range(100)))
resampled_rf = VotingClassifier(estimators=classifiers, voting='soft')

In [None]:
# 10-fold cross-validated ROC AUC of the resampled tree ensemble.
scores = cross_val_score(
    resampled_rf, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from sklearn.base import clone

def make_resampled_ensemble(estimator, n_estimators=100):
    """Soft-voting ensemble of copies of `estimator`, each fit on its own under-sample.

    Parameters
    ----------
    estimator : estimator object
        Template model; it is cloned once per ensemble member and is not
        modified itself.
    n_estimators : int, default=100
        Number of (under-sampler -> clone) pipelines in the ensemble.

    Returns
    -------
    VotingClassifier
        Unfitted soft-voting classifier whose members are named
        ``est_0`` ... ``est_<n_estimators - 1>``.
    """
    def _member(seed):
        # Fresh copy of the template, seeded only if it exposes a random_state.
        model = clone(estimator)
        if hasattr(model, "random_state"):
            model.random_state = seed
        # Each member draws its own under-sample (with replacement) from the same seed.
        sampler = RandomUnderSampler(random_state=seed, replacement=True)
        return "est_{}".format(seed), make_imb_pipeline(sampler, model)

    members = [_member(seed) for seed in range(n_estimators)]
    return VotingClassifier(members, voting="soft")

In [None]:
# Resampled ensemble of decision trees built with the generic helper.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' spelling
# is no longer accepted by recent scikit-learn releases.
resampled_tree_test = make_resampled_ensemble(DecisionTreeClassifier(max_features='sqrt'))

scores = cross_val_score(resampled_tree_test, X_train, y_train, cv=10, scoring='roc_auc')
print(np.mean(scores))

In [None]:
# Resampled ensemble with logistic regression as the base model; 10-fold
# cross-validated ROC AUC.
resampled_lr = make_resampled_ensemble(LogisticRegression())

scores = cross_val_score(
    resampled_lr, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Over-sample the training set with SMOTE (default settings) and inspect the
# resulting shape and class counts.
smote = SMOTE()
# fit_resample is the current sampler API; fit_sample was its deprecated
# predecessor and no longer exists in recent imbalanced-learn releases.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(X_train_smote.shape)
print(np.bincount(y_train_smote))

In [None]:
# SMOTE on the synthetic 2-D data: original points (left) against resampled
# points (right), coloured by class.
# fit_resample replaces the removed fit_sample; plt.cm.tab10 is the current
# name of the colormap formerly exposed as Vega10.
X_resampled, y_resampled = SMOTE().fit_resample(X_syn, y_syn)
fig, axes = plt.subplots(1, 2)
axes[0].scatter(X_syn[:, 0], X_syn[:, 1], c=plt.cm.tab10(y_syn), alpha=.3)
axes[1].scatter(X_resampled[:, 0], X_resampled[:, 1], c=plt.cm.tab10(y_resampled), alpha=.3)

In [None]:
# Features 3 and 4 before (left, ordered by label) and after SMOTE (right, in
# the order the sampler returned the rows).
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
ax_orig, ax_smote = axes
sorting = np.argsort(y_train)
X_sorted, y_sorted = X_train[sorting], y_train[sorting]
ax_orig.scatter(X_sorted[:, 3], X_sorted[:, 4], c=y_sorted, alpha=.3)
ax_smote.scatter(X_train_smote[:, 3], X_train_smote[:, 4], c=y_train_smote, alpha=.3)

In [None]:
from sklearn.utils import shuffle
# Same comparison with the SMOTE output shuffled first, so the drawing order is
# not determined by the order in which the sampler returned the rows.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
ax_orig, ax_smote = axes
X_smote_sh, y_smote_sh = shuffle(X_train_smote, y_train_smote)
ax_orig.scatter(X_train[:, 3], X_train[:, 4], c=y_train, alpha=.3)
ax_smote.scatter(X_smote_sh[:, 3], X_smote_sh[:, 4], c=y_smote_sh, alpha=.3)

In [None]:
# SMOTE followed by logistic regression; 10-fold cross-validated ROC AUC.
smote_pipe = make_imb_pipeline(
    SMOTE(),
    LogisticRegression(),
)
scores = cross_val_score(
    smote_pipe, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
# SMOTE followed by a 100-tree random forest; 10-fold cross-validated ROC AUC.
smote_pipe_rf = make_imb_pipeline(
    SMOTE(),
    RandomForestClassifier(n_estimators=100),
)
scores = cross_val_score(
    smote_pipe_rf, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from sklearn.model_selection import GridSearchCV
# Tune the number of neighbours SMOTE interpolates between, using 10-fold
# cross-validated ROC AUC of the SMOTE + random forest pipeline.
param_grid = {'smote__k_neighbors': [3, 5, 7, 9, 11, 15, 31]}
# return_train_score=True is needed for cv_results_ to contain the
# 'mean_train_score' column; current scikit-learn does not record training
# scores by default.
search = GridSearchCV(smote_pipe_rf, param_grid, cv=10, scoring="roc_auc",
                      return_train_score=True)
search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the grid search.
search.best_score_

In [None]:
# Tabulate the grid-search results and plot mean test/train score against k_neighbors.
# NOTE(review): this needs a 'mean_train_score' column in cv_results_; confirm
# the search was configured to record training scores.
results = pd.DataFrame(search.cv_results_)
results.plot("param_smote__k_neighbors", ["mean_test_score", "mean_train_score"])

In [None]:
# Rebuild the SMOTE + random forest pipeline with k_neighbors fixed at 11 and
# re-score it with 10-fold cross-validated ROC AUC.
smote_pipe_rf = make_imb_pipeline(
    SMOTE(k_neighbors=11),
    RandomForestClassifier(n_estimators=100),
)
scores = cross_val_score(
    smote_pipe_rf, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
from imblearn.over_sampling import SMOTE
# Resample with k_neighbors=11 and compare against the earlier shuffled
# default-SMOTE sample (X_smote_sh / y_smote_sh) on features 3 and 4.
smote = SMOTE(k_neighbors=11)
# fit_resample is the current sampler API; fit_sample was its deprecated
# predecessor and no longer exists in recent imbalanced-learn releases.
X_train_smote11, y_train_smote11 = smote.fit_resample(X_train, y_train)

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
X_smote_sh11, y_smote_sh11 = shuffle(X_train_smote11, y_train_smote11)
axes[0].scatter(X_smote_sh[:, 3], X_smote_sh[:, 4], c=y_smote_sh, alpha=.3)
axes[1].scatter(X_smote_sh11[:, 3], X_smote_sh11[:, 4], c=y_smote_sh11, alpha=.3)
axes[0].set_title("SMOTE k_neighbors=5")
axes[1].set_title("SMOTE k_neighbors=11")

In [None]:
from imblearn.combine import SMOTEENN, SMOTETomek
# SMOTEENN (configured with a k_neighbors=11 SMOTE) followed by a 100-tree
# random forest; 10-fold cross-validated ROC AUC.
smoteenn_pipe_rf = make_imb_pipeline(
    SMOTEENN(smote=SMOTE(k_neighbors=11)),
    RandomForestClassifier(n_estimators=100),
)
scores = cross_val_score(
    smoteenn_pipe_rf, X_train, y_train, scoring='roc_auc', cv=10)
print(np.mean(scores))

In [None]:
# Resample with SMOTEENN (SMOTE with k_neighbors=11) and compare it against the
# plain k_neighbors=11 SMOTE sample on features 3 and 4.
smoteenn = SMOTEENN(smote=SMOTE(k_neighbors=11))

# fit_resample is the current sampler API; fit_sample was its deprecated
# predecessor and no longer exists in recent imbalanced-learn releases.
X_train_smoteenn, y_train_smoteenn = smoteenn.fit_resample(X_train, y_train)

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
X_smote_shenn, y_smote_shenn = shuffle(X_train_smoteenn, y_train_smoteenn)
axes[0].scatter(X_smote_sh11[:, 3], X_smote_sh11[:, 4], c=y_smote_sh11, alpha=.3)
axes[1].scatter(X_smote_shenn[:, 3], X_smote_shenn[:, 4], c=y_smote_shenn, alpha=.3)
axes[0].set_title("SMOTE")
axes[1].set_title("SMOTE ENN")

In [None]:
# Class counts after SMOTE with k_neighbors=11.
np.bincount(y_train_smote11)

In [None]:
# Class counts after SMOTEENN, for comparison with plain SMOTE.
np.bincount(y_train_smoteenn)

In [None]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Compare three resampling strategies on the synthetic 2-D data: plain SMOTE,
# SMOTE + ENN, and SMOTE followed by CondensedNearestNeighbour.
# fit_resample replaces the removed fit_sample; plt.cm.tab10 is the current
# name of the colormap formerly exposed as Vega10.
X_resampled, y_resampled = SMOTE().fit_resample(X_syn, y_syn)
X_resampled_enn, y_resampled_enn = SMOTEENN().fit_resample(X_syn, y_syn)
X_resampled_cnn, y_resampled_cnn = make_imb_pipeline(SMOTE(), CondensedNearestNeighbour()).fit_resample(X_syn, y_syn)

fig, axes = plt.subplots(1, 3)

axes[0].scatter(X_resampled[:, 0], X_resampled[:, 1], c=plt.cm.tab10(y_resampled), alpha=.3)
axes[0].set_title("SMOTE")
axes[1].scatter(X_resampled_enn[:, 0], X_resampled_enn[:, 1], c=plt.cm.tab10(y_resampled_enn), alpha=.3)
axes[1].set_title("SMOTE ENN")
axes[2].scatter(X_resampled_cnn[:, 0], X_resampled_cnn[:, 1], c=plt.cm.tab10(y_resampled_cnn), alpha=.3)
axes[2].set_title("SMOTE CNN")