In [1]:
# Generate Imbalanced Data
from sklearn.datasets import make_classification

# Single seed reused by the stochastic steps in this notebook (data generation,
# train/test split and every resampler) so that results are reproducible.
random_state = 25

# Synthetic binary dataset with two informative features and a 90/10 class
# weighting. flip_y=0 disables label noise, so the class counts follow the
# weights exactly (900 vs 100 in the recorded output).
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    class_sep=2,
    flip_y=0,
    weights=[0.9, 0.1],
    random_state=random_state,
)

In [2]:
import pandas as pd

# Put both feature columns and the target into one frame so the class balance
# can be inspected directly.
df = pd.DataFrame({"x": x[:, 0], "y": x[:, 1], "label": y})
class_counts = df["label"].value_counts()
print("Total number of examples \n", class_counts)

Total number of examples 
 0    900
1    100
Name: label, dtype: int64


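As a standard practice, split the data into training and validation sets first, and then perform oversampling or undersampling on the training data only, so that the validation set keeps the original class distribution and never contains resampled copies of training records.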

In [3]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the `label` column.
X = df.drop(columns='label')
y = df['label']

# Hold out 25% of the rows as the test set; the remaining 75% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=random_state,
)

## Oversampling

### Using [resample](https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html) in sklearn
Duplicates existing records

In [4]:
from sklearn.utils import resample

# Re-attach the labels to the training features so rows can be filtered by class.
X = pd.concat([X_train, y_train], axis=1)

# Split the training rows by class: label 0 is the majority, label 1 the minority.
negative = X.loc[X["label"] == 0]
positive = X.loc[X["label"] == 1]

# Draw minority rows with replacement until there are as many as majority rows.
positive_upsampled = resample(
    positive,
    replace=True,
    n_samples=len(negative),
    random_state=random_state,
)

# Balanced training set: all majority rows plus the upsampled minority rows.
upsampled_train_data = pd.concat([negative, positive_upsampled])

# The test split is only re-joined with its labels; it is never resampled.
test = pd.concat([X_test, y_test], axis=1)

# Compare class counts of the balanced training set and the untouched test set.
print("Upsampled train data\n", upsampled_train_data["label"].value_counts())
print("Unaltered test data\n", test["label"].value_counts())

Upsampled train data
 1    677
0    677
Name: label, dtype: int64
Unaltered test data
 0    223
1     27
Name: label, dtype: int64


In [5]:
# Demonstrate that resample() oversamples by repeating existing rows verbatim.
ones = upsampled_train_data[upsampled_train_data['label'] == 1]
# duplicated() flags every repeat of a row after its first occurrence
# (all columns are compared).
duplicate = ones[ones.duplicated()]
array = duplicate[['x', 'y']].values
# Count how many of those repeats are copies of one particular minority point.
elements = [
    ele
    for ele in array
    if ele[0] == 1.5521948346327128 and ele[1] == 1.692681622419994
]
len(elements)

11

### Using [Random Oversampler](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.RandomOverSampler.html) in imblearn
Oversampling by picking samples at random

In [12]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampling: minority rows are picked at random (with repetition)
# until both classes have the same number of rows.
ros = RandomOverSampler(random_state=random_state)

x_ros, y_ros = ros.fit_resample(X_train, y_train)

# fit_resample may return NumPy arrays (older imblearn releases) or pandas
# objects (newer releases, when the input is pandas). Positional slicing such
# as x_ros[:, 0] only works on arrays, so build the frame in a way that
# accepts both; .values attaches the labels by position, not by index.
df = pd.DataFrame(x_ros, columns=["x", "y"]).assign(label=pd.Series(y_ros).values)
print("Results of ROS \n", df.label.value_counts())

Results of ROS 
 1    677
0    677
Name: label, dtype: int64


### Using [SMOTE](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html) in imblearn and [related techniques](https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/over-sampling/plot_comparison_over_sampling.html#sphx-glr-auto-examples-over-sampling-plot-comparison-over-sampling-py)
Synthetic examples created using nearest neighbour approach

In [14]:
from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE, SMOTE

# Three synthetic-oversampling strategies, all seeded identically.
sm = SMOTE(random_state=random_state)
bsm1 = BorderlineSMOTE(random_state=random_state, kind='borderline-1')
svmsm = SVMSMOTE(random_state=random_state)

x_res, y_res = sm.fit_resample(X_train, y_train)
x_bsmote1, y_bsmote1 = bsm1.fit_resample(X_train, y_train)
x_svmsm, y_svmsm = svmsm.fit_resample(X_train, y_train)


def _resampled_frame(features, labels):
    """Return a DataFrame with columns ``x``, ``y`` and ``label``.

    ``features`` and ``labels`` are the two values returned by a sampler's
    ``fit_resample``. Depending on the imblearn release they are NumPy arrays
    or pandas objects; positional slicing (``features[:, 0]``) only works on
    arrays, so the frame is built in a way that accepts both. Labels are
    attached by position rather than by index.
    """
    frame = pd.DataFrame(features, columns=["x", "y"])
    return frame.assign(label=pd.Series(labels).values)


# NOTE(review): in the recorded output Borderline SMOTE leaves the minority
# class at 73 rows and SVM SMOTE stops at 411, i.e. neither fully balances
# this dataset - presumably because few minority points lie close to the
# class boundary; confirm before relying on these two samplers here.
for title, features, labels in (
    ("Results of SMOTE \n", x_res, y_res),
    ("Results of Borderline SMOTE \n", x_bsmote1, y_bsmote1),
    ("Results of SVM SMOTE \n", x_svmsm, y_svmsm),
):
    # `df` is rebound on every pass, so it ends up holding the SVM SMOTE frame.
    df = _resampled_frame(features, labels)
    print(title, df.label.value_counts())

Results of SMOTE 
 1    677
0    677
Name: label, dtype: int64
Results of Borderline SMOTE 
 0    677
1     73
Name: label, dtype: int64
Results of SVM SMOTE 
 0    677
1    411
Name: label, dtype: int64


[ADASYN](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.ADASYN.html) and [SMOTENC](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTENC.html) are some more approaches, but I have found them to cause errors while using the API

## Undersampling

### Using [resample](https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html) in sklearn
Undersampling majority class by randomly picking off elements in majority class

In [17]:
from sklearn.utils import resample

# Re-attach the labels to the training features so rows can be filtered by class.
X = pd.concat([X_train, y_train], axis=1)

# Split the training rows by class: label 0 is the majority, label 1 the minority.
negative = X.loc[X["label"] == 0]
positive = X.loc[X["label"] == 1]

# Keep a random subset of majority rows, drawn without replacement, that is
# exactly as large as the minority class.
negative_downsampled = resample(
    negative,
    replace=False,
    n_samples=len(positive),
    random_state=random_state,
)

# Balanced training set: the reduced majority plus every minority row.
downsampled = pd.concat([negative_downsampled, positive])

# Class counts after downsampling.
downsampled["label"].value_counts()

1    73
0    73
Name: label, dtype: int64

### [Undersampling](https://imbalanced-learn.readthedocs.io/en/stable/api.html#module-imblearn.under_sampling) using the imblearn API
## A combination of oversampling and undersampling techniques like [SMOTEENN and SMOTETomek](https://imbalanced-learn.readthedocs.io/en/stable/api.html#module-imblearn.combine) also gives good results
### Check out this [Towards data science](https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18) link too