In [39]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE

Reference: https://imbalanced-learn.readthedocs.io/en/stable/index.html

# Handling Imbalanced Data


Common techniques for handling class imbalance:

1. Random under-sampling
2. Random over-sampling
3. Synthetic Minority Over-sampling Technique (SMOTE)
4. Algorithmic ensemble techniques such as bagging and boosting

Further reading: https://www.analyticsvidhya.com/blog/2017/03/imbalanced-classification-problem/

# 1. Random Over Sampling

In [29]:
# Generate a synthetic, heavily imbalanced 3-class dataset with two features.
# random_state is fixed so the same data is produced on every run.
X, y = make_classification(
    n_samples=5000,
    n_classes=3,
    weights=[0.01, 0.05, 0.94],
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

In [30]:
# Show the dataset dimensions, then the number of samples per class label
# (sorted by label) to make the imbalance visible.
class_counts = Counter(y)
print('Training predictors Shape', X.shape)
print('Training target Shape', y.shape)
print(sorted(class_counts.items()))

Training predictors Shape (5000, 2)
Training target Shape (5000,)
[(0, 64), (1, 262), (2, 4674)]


In [31]:
# Random over-sampling of (X, y), seeded with random_state=0.
# fit_resample returns the resampled predictors and targets; the class counts
# printed in the following cell show every class at 4674 samples, i.e. the
# size of the largest class in the original data.
ros = RandomOverSampler(random_state=0)
X_sampled, y_sampled = ros.fit_resample(X, y)

In [32]:
# Per-class sample counts after random over-sampling, ordered by class label.
oversampled_counts = Counter(y_sampled)
print(sorted(oversampled_counts.items()))

[(0, 4674), (1, 4674), (2, 4674)]


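# 2. Under Sampling (Cluster Centroids)

Note: the cell below uses `ClusterCentroids`, which under-samples by replacing the samples of the larger classes with K-means cluster centroids, rather than by dropping samples at random.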

In [33]:
# Rebuild the imbalanced 3-class dataset for the under-sampling section.
# The arguments and seed match the data-generation cell in section 1.
X, y = make_classification(
    n_samples=5000,
    n_classes=3,
    weights=[0.01, 0.05, 0.94],
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

In [34]:
# Dataset dimensions followed by the per-class sample counts (sorted by label)
# before any under-sampling is applied.
class_counts = Counter(y)
print('Training predictors Shape', X.shape)
print('Training target Shape', y.shape)
print(sorted(class_counts.items()))

Training predictors Shape (5000, 2)
Training target Shape (5000,)
[(0, 64), (1, 262), (2, 4674)]


In [35]:
# Under-sample (X, y) with ClusterCentroids, seeded with random_state=0,
# then print the resulting number of samples per class, ordered by label.
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X, y)
undersampled_counts = Counter(y_resampled)
print(sorted(undersampled_counts.items()))

[(0, 64), (1, 64), (2, 64)]


# 3. SMOTE Over Sampling

In [36]:
# Rebuild the imbalanced 3-class dataset for the SMOTE section.
# The arguments and seed match the data-generation cells in sections 1 and 2.
X, y = make_classification(
    n_samples=5000,
    n_classes=3,
    weights=[0.01, 0.05, 0.94],
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

In [37]:
# Dataset dimensions followed by the per-class sample counts (sorted by label)
# before SMOTE is applied.
class_counts = Counter(y)
print('Training predictors Shape', X.shape)
print('Training target Shape', y.shape)
print(sorted(class_counts.items()))

Training predictors Shape (5000, 2)
Training target Shape (5000,)
[(0, 64), (1, 262), (2, 4674)]


In [42]:
# SMOTE (Synthetic Minority Over-sampling Technique) applied to (X, y).
# fit_resample returns the resampled predictors and targets; the class counts
# printed in the following cell show every class at 4674 samples.
# NOTE(review): this sampler is seeded with 42, whereas RandomOverSampler and
# ClusterCentroids in this notebook use random_state=0 — confirm intentional.
sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X, y)

In [43]:
# Per-class sample counts after SMOTE, ordered by class label.
smote_counts = Counter(y_smote)
print(sorted(smote_counts.items()))

[(0, 4674), (1, 4674), (2, 4674)]
