In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Imbalanced Dataset Experiments

This is a quick and simple demonstration of the type of error which can result from improper use of resampling to balance a dataset before training a model.

We use a simple toy dataset generated with scikit-learn's `make_classification` function.

In [2]:
def generate_toy_dataset(n_samples=100000, weights=(0.9, 0.1), class_sep=0.5, random_state=42):
    ''' builds an imbalanced two-class toy dataset and returns it as a DataFrame

        the characteristics of the dataset affect the degree to which we see this issue manifest
        easily separable datasets with confidently correct models may not have the same issue as 
        noisier datasets with less confident models

        the defaults reproduce the dataset used throughout this notebook; the parameters are
        exposed so those characteristics can be varied without editing the function

        n_samples:    number of rows to generate
        weights:      class proportions handed to make_classification (default 90% / 10%)
        class_sep:    class separation handed to make_classification
        random_state: seed handed to make_classification

        returns a DataFrame with the 10 feature columns named '0' .. '9' plus a 'target' column
    '''
    # Generating an unbalanced dataset
    X, y = make_classification(
        weights=list(weights), # Setting weights so classes are unbalanced
        n_samples=n_samples,
        n_features=10,
        n_informative=3,
        n_redundant=2,
        n_classes=2,
        class_sep=class_sep,
        random_state=random_state
    )
    df = pd.DataFrame(X)
    # string column names, so feature columns are addressed as '0', '1', ... rather than ints
    df.columns = df.columns.astype(str)
    df['target'] = y
    return df

# Preview the first rows of a freshly generated toy dataset
generate_toy_dataset().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,2.524584,0.412494,-1.802713,-0.075889,0.744249,-0.555693,0.1791,-2.902355,-1.032001,0.620666,0
1,1.223714,0.353656,0.433975,-1.692863,1.798698,-0.49316,0.735316,-2.147366,0.918143,-0.925999,0
2,-2.37577,0.091193,-1.31136,-0.922734,-1.721759,2.272621,0.543799,3.121716,-1.228138,-1.326703,0
3,-0.040756,0.818655,1.231301,1.58436,-2.103652,0.939735,-0.770357,1.072464,-0.52116,1.035603,1
4,0.716362,0.052499,-0.947138,1.301603,-0.710176,-0.206302,0.10033,-0.332693,-3.112213,-0.95548,0


# Experiment code

This experiment trains and applies a random forest model both with and without resampling the training set.

In each of the 2 experiments, a test set is first broken off and set aside to simulate "production data" that the model will be applied to in the future.

For the "resampling" experiment, we use a downsampler.  If you like, you can try this with an upsampler as well.  This will take longer to run and tends to make the results slightly better and less variable but should not change the overall conclusion. 

For the "non resampled" experiment, we just feed the data in as-is without any modification.

The experiment:

- resample the data (or not)
- break the data into train and validation sets
- train a model on the training set
- optimize a decision threshold using the validation set
- apply the model with this threshold to the "production" dataset
- evaluate the f1 statistic on the "production" dataset and compare to the results on the validation set
    
Do this twice, and compare the results (resampling vs not resampling)

In [3]:
def choose_best_threshold(probas, target):
    ''' iterates over thresholds to choose the one with the highest f1 score

        probas: predicted positive-class scores
        target: true binary labels aligned with probas

        returns a dict with keys 'best_threshold', 'validation_f1_score',
        'precision' and 'recall', all taken at the threshold that maximises f1
    '''
    precision, recall, thresholds = precision_recall_curve(target, probas)
    # precision/recall carry one extra trailing point (precision=1, recall=0) that has no
    # matching entry in `thresholds`; drop it so every candidate index is a valid threshold index
    precision = np.asarray(precision, dtype=float)[:-1]
    recall = np.asarray(recall, dtype=float)[:-1]

    # when no positive is captured at a threshold, precision and recall are both 0 and the
    # naive formula is 0/0 = NaN, which np.argmax would then select; score those points as 0
    denominator = precision + recall
    f1_scores = np.divide(
        2 * recall * precision,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )

    best_f1_ind = int(np.argmax(f1_scores))
    results = dict()
    results['best_threshold'] = thresholds[best_f1_ind]
    results['validation_f1_score']  = f1_scores[best_f1_ind]
    results['precision']      = precision[best_f1_ind]
    results['recall']         = recall[best_f1_ind]
    return results
    
def train_model_and_do_validation_predict(df_train, df_valid, target):
    ''' fits a random forest on the training frame and scores the validation frame

        df_train / df_valid: frames holding the feature columns plus the `target` column
        target:              name of the label column, excluded from the features

        returns (fitted model, positive-class probabilities for the validation rows)
    '''
    train_features = df_train.drop(columns=target)
    train_labels = df_train[target]
    valid_features = df_valid.drop(columns=target)

    forest = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=256,
        min_samples_leaf=20,
        random_state=42,
    )
    model = forest.fit(train_features, train_labels)

    # column 1 of predict_proba is the probability of the positive class
    valid_probas = model.predict_proba(valid_features)
    return model, valid_probas[:, 1]

def score_production_predictions(df_prod, target, model, threshold):
    ''' computes the f1 score on the "production" dataset with the provided threshold

        df_prod:   frame holding the feature columns plus the `target` column
        target:    name of the label column, excluded from the features
        model:     fitted classifier exposing predict_proba
        threshold: rows whose positive-class probability is >= threshold are predicted positive

        returns the f1 score of those predictions against df_prod[target]
    '''
    X_prod = df_prod.drop(columns=target)
    # `>=` rather than `>`: precision_recall_curve, which picks the threshold, treats a score
    # equal to the threshold as a positive prediction, so the same rule is applied here
    P_prod = model.predict_proba(X_prod)[:, 1] >= threshold
    return f1_score(df_prod[target], P_prod)


def train_optimize_and_predict_on_prod(train_val, df_prod, target):
    ''' the full train, validate, tune threshold, and predict on "production data" sequence

        train_val: frame that is split 80/20 into training and validation rows
        df_prod:   held-out "production" frame, used only for the final f1 score
        target:    name of the label column

        returns the dict from choose_best_threshold with a 'prod_f1_score' entry added
    '''
    fit_rows, valid_rows = train_test_split(train_val, test_size=0.2, random_state=42)
    fitted_model, valid_probas = train_model_and_do_validation_predict(fit_rows, valid_rows, target)

    # threshold is tuned on the validation rows only ...
    summary = choose_best_threshold(valid_probas, valid_rows[target])

    # ... and then applied unchanged to the "production" rows
    summary['prod_f1_score'] = score_production_predictions(
        df_prod, target, fitted_model, summary['best_threshold']
    )
    return summary

def run_experiment(df, target):
    ''' runs the resampled and non-resampled experiments on the same data and tabulates both

        df:     full dataset holding the feature columns plus the `target` column
        target: name of the label column

        returns a DataFrame with one row per metric and the columns
        'resampled' and 'non-resampled', rounded to 2 decimals
    '''
    # set aside test set for final evaluation
    #     held out "test set" is called "df_prod" to denote that it's
    #     our best representation of what the model will be encountering "in prod"
    #     having the "untouched" class balance 
    train_val, df_prod = train_test_split(df, test_size=0.2, random_state=42) 

    # resampler - try different resamplers if you like
    resampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42) # 50/50 balanced undersampling
    # `fit_resample` is the supported method name; the old `fit_sample` alias was deprecated
    # and is no longer available in current imbalanced-learn releases.
    # the whole frame (target column included) is passed as X because the downstream helpers
    # expect the resampled result to still carry the target column
    train_val_resampled, _ = resampler.fit_resample(train_val, train_val[target])
        
    # experiment 1: 
    #  - train model on resampled data
    #  - pick optimal threshold based on validation set f1 score
    #  - predict on "production_data" and compare to our expectations
    #    as defined by validation set results
    resampled_results = train_optimize_and_predict_on_prod(train_val_resampled, df_prod, target)
    
    # experiment 2: 
    #   - same as experiment 1 but do not resample the data first
    raw_results = train_optimize_and_predict_on_prod(train_val, df_prod, target)
    
    # one column per experiment, one row per metric
    results = pd.DataFrame([resampled_results, raw_results], index=['resampled', 'non-resampled']).T.round(2)
    return results

In [4]:
# Run both experiments (resampled vs non-resampled) on a freshly generated toy dataset
run_experiment(df=generate_toy_dataset(), target='target')

Unnamed: 0,resampled,non-resampled
best_threshold,0.43,0.21
validation_f1_score,0.79,0.5
precision,0.74,0.42
recall,0.84,0.6
prod_f1_score,0.38,0.49


# Results

Resampling:
- estimated f1 = 79%
- actual f1 = 38%

No Resampling:
- estimated f1 = 50%
- actual f1 = 49%

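Using a resampled validation set inflated the apparent performance of the model (79% f1) while delivering a much lower performance on the actual dataset (38% f1).  The resampled validation set is balanced roughly 50/50, whereas the production data keeps the original ~90/10 class balance, so the precision (and therefore f1) measured on it, and the threshold tuned against it, do not carry over to production.  This is one of the biggest risks in making this error - that the model will be deployed under the expectation of substantially greater performance than is actually attained on production data post-deployment.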

Compare this to the non-resampled dataset.  The actual performance on "production data" is much better (49% compared to 38%) and the estimated f1 based on the validation set is much closer to the production performance (50% on the validation set vs 49% on the production set).

This is a common phenomenon when care isn't taken while using resampling.  This is only one of several related types of errors resampling can cause you to make if it is not implemented carefully.
