## Handling Imbalanced Dataset
1. Up sampling
2. Down sampling

In [1]:
import numpy as np
import pandas as pd

#### Creating Imbalanced Dataset

In [2]:
# Seed NumPy's global RNG once, before any random draws, so the notebook
# reproduces the same data on a fresh Restart & Run All.
np.random.seed(123)

# Total number of rows and the share of them that belongs to class 0.
sample = 1000
class_0_ratio = 0.9

# round() instead of int(): int() truncates, so a product that lands just
# below a whole number because of floating-point error (e.g. 100 * 0.29)
# would silently drop a row from class 0.
class_0 = round(sample * class_0_ratio)
# Class 1 takes whatever is left, so the two counts always sum to `sample`.
class_1 = sample - class_0

In [3]:
# Show the (class 0, class 1) row counts computed above.
(class_0, class_1)

(900, 100)

In [4]:
# Class 0 frame: class_0 rows, both features drawn from a normal
# distribution with mean 0 and standard deviation 1.
data_0 = pd.DataFrame(
    {
        'Feature_1': np.random.normal(loc=0, scale=1, size=class_0),
        'Feature_2': np.random.normal(loc=0, scale=1, size=class_0),
        'Target': [0] * class_0,
    }
)

# Class 1 frame: class_1 rows, same spread but centred at 2.
data_1 = pd.DataFrame(
    {
        'Feature_1': np.random.normal(loc=2, scale=1, size=class_1),
        'Feature_2': np.random.normal(loc=2, scale=1, size=class_1),
        'Target': [1] * class_1,
    }
)

In [5]:
# Peek at the first five class-0 rows.
data_0.head(n=5)

Unnamed: 0,Feature_1,Feature_2,Target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [6]:
# Sanity check: the class-0 frame should contain only label 0.
data_0['Target'].value_counts()

0    900
Name: Target, dtype: int64

In [7]:
# Peek at the first five class-1 rows.
data_1.head(n=5)

Unnamed: 0,Feature_1,Feature_2,Target
0,1.699768,2.139033,1
1,1.367739,2.025577,1
2,1.795683,1.803557,1
3,2.213696,3.312255,1
4,3.033878,3.187417,1


In [8]:
# Sanity check: the class-1 frame should contain only label 1.
data_1['Target'].value_counts()

1    100
Name: Target, dtype: int64

In [9]:
# Stack the two class frames row-wise into one dataset.
# NOTE: the original row labels are kept, and both inputs are labelled from 0,
# so labels 0..class_1-1 appear twice in df (once per class).
df = pd.concat([data_0, data_1], axis=0, ignore_index=False)

In [10]:
# First five rows of the combined frame (these come from data_0).
df.head(n=5)

Unnamed: 0,Feature_1,Feature_2,Target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [11]:
# Class balance of the combined frame; this is the imbalance the rest of
# the notebook corrects.
df['Target'].value_counts()

0    900
1    100
Name: Target, dtype: int64

### Up sampling

In [12]:
# Rows of the larger class (Target == 0), selected with a boolean mask.
major = df.loc[df['Target'] == 0]

In [13]:
# Display the Target == 0 subset.
major

Unnamed: 0,Feature_1,Feature_2,Target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


In [14]:
# Rows of the smaller class (Target == 1), selected with a boolean mask.
minor = df.loc[df['Target'] == 1]

In [15]:
# Display the Target == 1 subset.
minor

Unnamed: 0,Feature_1,Feature_2,Target
0,1.699768,2.139033,1
1,1.367739,2.025577,1
2,1.795683,1.803557,1
3,2.213696,3.312255,1
4,3.033878,3.187417,1
...,...,...,...
95,1.376371,2.845701,1
96,2.239810,0.880077,1
97,1.131760,1.640703,1
98,2.902006,0.390305,1


In [16]:
from sklearn.utils import resample

In [17]:
## performing upsampling
# Draw rows from the minority class until it is as large as the majority
# class. Replacement is required because n_samples exceeds len(minor).
minor_upsample = resample(
    minor,
    replace=True,           # sample with replacement
    n_samples=len(major),   # to match the major class
    random_state=123,       # fixed seed so re-running this cell picks the same rows
)

In [18]:
# Expect len(major) rows: one resampled minority row per majority row.
minor_upsample.shape

(900, 3)

In [19]:
# Every resampled row comes from `minor`, so only label 1 should appear.
minor_upsample["Target"].value_counts()

1    900
Name: Target, dtype: int64

In [20]:
# Combine the untouched majority rows with the upsampled minority rows.
# ignore_index=True gives the result a fresh 0..n-1 index instead of carrying
# over the inputs' row labels, which resampling with replacement repeats many
# times.
upsampled_data = pd.concat([major, minor_upsample], ignore_index=True)

In [21]:
# Expect 2 * len(major) rows after upsampling.
upsampled_data.shape

(1800, 3)

In [22]:
# Both classes should now have the same number of rows.
upsampled_data['Target'].value_counts()

0    900
1    900
Name: Target, dtype: int64

### Down sampling

In [23]:
# Build a new pair of class frames for the down-sampling demo. The RNG is not
# re-seeded in this cell, so the drawn values differ from the earlier frames.
# Feature_1 is drawn before Feature_2 for each class.
data_0 = pd.DataFrame({
    **{name: np.random.normal(0, 1, class_0) for name in ('Feature_1', 'Feature_2')},
    'Target': [0] * class_0,
})
data_1 = pd.DataFrame({
    **{name: np.random.normal(2, 1, class_1) for name in ('Feature_1', 'Feature_2')},
    'Target': [1] * class_1,
})

In [24]:
# Stack the two class frames into one dataset. Both inputs are labelled from
# 0, so a plain concat would repeat labels 0..class_1-1; ignore_index=True
# gives every row a unique 0..n-1 label instead.
df = pd.concat([data_0, data_1], ignore_index=True)

In [25]:
# Split the combined frame by label using boolean masks.
major = df.loc[df['Target'] == 0]
minor = df.loc[df['Target'] == 1]

In [26]:
## performing down sampling
# Keep a random subset of the majority class that is the same size as the
# minority class. No replacement is needed because n_samples <= len(major).
major_downsample = resample(
    major,
    replace=False,          # sample without replacement
    n_samples=len(minor),   # to match the minor class
    random_state=123,       # fixed seed so re-running this cell keeps the same rows
)

In [27]:
# Combine the reduced majority subset with all minority rows.
# ignore_index=True gives the result a fresh 0..n-1 index instead of carrying
# over the row labels of the two inputs.
downsampled_data = pd.concat([major_downsample, minor], ignore_index=True)

In [28]:
# Expect 2 * len(minor) rows after down sampling.
downsampled_data.shape

(200, 3)

In [29]:
# Both classes should now have the same number of rows.
downsampled_data["Target"].value_counts()

0    100
1    100
Name: Target, dtype: int64