#### Ways of handling imbalanced data
- Upsampling
- Downsampling

In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import resample

##### Creating an Imbalanced Dataset

In [30]:
# Seed NumPy's global RNG so the synthetic data below is reproducible.
np.random.seed(23)

# Total row count and the fraction of rows assigned to class 0 (the majority).
n_samples = 1000
class_0_ratio = 0.9

# Class 0 gets the (truncated) share of rows; class 1 takes whatever is left.
class_0_size = int(class_0_ratio * n_samples)
class_1_size = n_samples - class_0_size
print(f"{class_0_size} {class_1_size}")

900 100


In [31]:
def _make_class_frame(center, n_rows, label):
    """Build one class's rows: two unit-variance normal features plus a label.

    Both features are drawn from N(center, 1) using NumPy's global RNG,
    feature_1 first and feature_2 second, and every row carries `label`
    in the 'target' column.
    """
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows,
    })


# Majority class (target 0) is centred at 0; minority class (target 1) at 2.
# Class 0 is generated first so the RNG draw order is fixed.
class_0 = _make_class_frame(0, class_0_size, 0)
class_1 = _make_class_frame(2, class_1_size, 1)

In [32]:
# Stack both classes into one frame with a fresh 0..n-1 index.
df = pd.concat([class_0, class_1], ignore_index=True)

In [33]:
# Peek at the first five rows of the combined dataset.
df.head(n=5)

Unnamed: 0,feature_1,feature_2,target
0,0.666988,0.069422,0
1,0.025813,-0.471346,0
2,-0.777619,-0.10968,0
3,0.948634,1.278559,0
4,0.701672,-0.280466,0


In [34]:
# Count rows per class label to confirm the dataset is imbalanced.
df.loc[:, "target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

##### Upsampling

In [35]:
# Split the dataset by label: target 1 is the minority class, target 0 the majority.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [36]:
# Upsample: draw minority rows WITH replacement until they match the
# majority-class row count; random_state fixes which rows get repeated.
df_minority_upsample = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=23,
)

In [37]:
# Row count of the upsampled minority class.
df_minority_upsample.shape[0]

900

In [41]:
# Combine the untouched majority rows with the upsampled minority rows,
# then count rows per class label in the combined frame.
upsample_parts = [df_majority, df_minority_upsample]
df_balanced_upsample = pd.concat(upsample_parts)
df_balanced_upsample.loc[:, "target"].value_counts()

target
0    900
1    900
Name: count, dtype: int64

##### Downsampling

In [42]:
# Show the class distribution of the original frame again before downsampling.
df.loc[:, "target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [44]:
# Downsample: draw majority rows WITHOUT replacement, keeping only as many
# as there are minority rows; random_state fixes which rows are kept.
df_majority_downsample = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=23,
)

In [45]:
# Combine the downsampled majority rows with the untouched minority rows,
# then count rows per class label in the combined frame.
downsample_parts = [df_majority_downsample, df_minority]
df_balance_downsample = pd.concat(downsample_parts)
df_balance_downsample.loc[:, "target"].value_counts()

target
0    100
1    100
Name: count, dtype: int64