# Handling Unbalanced Data

### 1 - UP sampling
### 2 - DOWN sampling

In [30]:
import numpy as np
import pandas as pd

#set the random seed for reproductivity
np.random.seed(123)

## Create a dataframe with two classes
n_sample=1000
class_0_ratio=0.9
n_class_0=int(n_sample*class_0_ratio)
n_class_1=n_sample-n_class_0

In [49]:
## CREATE MY DATAFRAME WITH IMBALANCED DATASET
class_0=pd.DataFrame({
    'feature1':np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature2':np.random.normal(loc=0,scale=1,size=n_class_0),
    'target':[0]*n_class_0
    
})
class_1=pd.DataFrame({
    'feature1':np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature2':np.random.normal(loc=2,scale=1,size=n_class_1),
    'target':[1]*n_class_1
    
})

In [50]:
df = pd.concat([class_0,class_1]).reset_index(drop=True)

In [51]:
df.head()

Unnamed: 0,feature1,feature2,target
0,-0.471276,0.328462,0
1,1.084072,1.03823,0
2,-0.379223,1.147064,0
3,-0.362274,0.638254,0
4,-0.681071,-1.075766,0


In [52]:
df.tail()

Unnamed: 0,feature1,feature2,target
995,2.72069,1.978489,1
996,1.949078,3.709793,1
997,2.709784,3.324917,1
998,1.817689,1.549237,1
999,2.83891,2.240773,1


In [53]:
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [54]:
##upsampling
df_minor = df[df['target']==1]
df_major = df[df['target']==0]

In [55]:
from sklearn.utils import resample

In [56]:
df_minor_upsampled = resample(df_minor,replace=True,
                              n_samples=len(df_major),
                              random_state=43
                              )

In [57]:
df_minor_upsampled.shape
df_minor_upsampled.head()

Unnamed: 0,feature1,feature2,target
968,2.314776,-0.393572,1
964,1.168035,2.507924,1
949,1.589079,0.475801,1
921,1.256916,0.743924,1
958,0.240862,2.148417,1


In [58]:
df_upsampled = pd.concat([df_major,df_minor_upsampled])

In [59]:
df_upsampled['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

In [60]:
##DOWN SAMPLING


In [61]:

## Down sampling
class_0=pd.DataFrame({
    'feature1':np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature1':np.random.normal(loc=0,scale=1,size=n_class_0),
    'target':[0]*n_class_0
    
})
class_1=pd.DataFrame({
    'feature1':np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature1':np.random.normal(loc=2,scale=1,size=n_class_1),
    'target':[1]*n_class_1
    
})

df=pd.concat([class_0,class_1]).reset_index(drop=True)

print(df['target'].value_counts())

target
0    900
1    100
Name: count, dtype: int64


In [62]:
## down sampling
df_minor = df[df['target']==1]
df_major = df[df['target']==0]

In [63]:
df_major_downsampled = resample(df_major,replace=False,
                              n_samples=len(df_minor),
                              random_state=43
                              )

In [64]:
df_major_downsampled.shape

(100, 2)

In [65]:
df_downsampled = pd.concat([df_minor,df_major_downsampled])

In [66]:
df_downsampled['target'].value_counts()

target
1    100
0    100
Name: count, dtype: int64