Handling Imbalanced Data

1. Up-sample Minority Class
2. Down-sample Majority Class

In [1]:
import pandas as pd
import numpy as np

# Fix the global NumPy RNG state so the synthetic data is reproducible.
np.random.seed(seed=123)

# Total number of rows and the fraction of them assigned to the majority class.
n_samples, class_ratio = 1000, 0.9

# The majority count is truncated to an integer; the minority class takes the
# remainder, so the two counts always add up to n_samples.
n_class1 = int(class_ratio * n_samples)
n_class2 = n_samples - n_class1

In [2]:
# Re-seed in the same cell that draws the samples: without this, re-running the
# cell on its own continues the global RNG stream and silently produces a
# different `df`. On a fresh Restart-and-Run-All the result is unchanged, because
# nothing is drawn between the earlier seed call and this one.
# Keep this value in sync with the seed set in the configuration cell.
np.random.seed(123)

# feature1 carries the class signal: class 1 samples are centred at 0,
# class 2 samples at 1 (unit standard deviation for both).
class1 = np.random.normal(0, 1, n_class1)
class2 = np.random.normal(1, 1, n_class2)

df = pd.DataFrame({
    'feature1': np.concatenate([class1, class2]),
    # feature2 is drawn from one distribution for all rows, regardless of class.
    'feature2': np.random.normal(0, 1, n_samples),
    # Labels follow the concatenation order: n_class1 zeros, then n_class2 ones.
    'target': np.concatenate([np.zeros(n_class1), np.ones(n_class2)])
})

# Side effect: writes the generated dataset to data.csv in the working directory.
df.to_csv('data.csv', index=False)
df.head()

Unnamed: 0,feature1,feature2,target
0,-1.085631,-0.748827,0.0
1,0.997345,0.567595,0.0
2,0.282978,0.718151,0.0
3,-1.506295,-0.999381,0.0
4,-0.5786,0.474898,0.0


In [3]:
# Sanity check: sizes of the two per-class sample arrays.
(class1.shape, class2.shape)

((900,), (100,))

In [4]:
# Show the last five rows; the target == 1 rows sit at the end of the frame.
df.tail(n=5)

Unnamed: 0,feature1,feature2,target
995,1.634763,0.845701,1.0
996,2.069919,-1.119923,1.0
997,0.090673,-0.359297,1.0
998,1.470264,-1.609695,1.0
999,-0.11143,0.01357,1.0


In [5]:
# Class balance before any resampling.
df.loc[:, 'target'].value_counts()

target
0.0    900
1.0    100
Name: count, dtype: int64

In [6]:
# Split the frame by class label. Both the up-sampling and the down-sampling
# steps start from these two subsets.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [7]:
from sklearn.utils import resample

# Up-sample: draw minority rows with replacement until there are as many of
# them as there are majority rows. The fixed random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [8]:
# (rows, columns) of the up-sampled minority frame.
(len(df_minority_upsampled), len(df_minority_upsampled.columns))

(900, 3)

In [9]:
# Combine the untouched majority rows with the up-sampled minority rows.
# The minority rows were drawn with replacement, so their original index labels
# repeat. ignore_index=True assigns a fresh 0..n-1 index so that label-based
# lookups and index alignment on df_upsampled are unambiguous.
# Row contents and order are unchanged; only the index labels are replaced.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

# Both classes should now have the same number of rows.
df_upsampled['target'].value_counts()

target
0.0    900
1.0    900
Name: count, dtype: int64

In [10]:
# Down-sample: draw a random subset of the majority class, without replacement,
# that is the same size as the minority class.
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],
    replace=False,
    random_state=42,
)

# Stack the reduced majority rows on top of the untouched minority rows, then
# check the resulting class balance.
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority]
)
df_downsampled['target'].value_counts()

target
0.0    100
1.0    100
Name: count, dtype: int64