## Imbalanced Dataset Handling
#### 1. Upsampling
#### 2. Downsampling

In [1]:
import numpy as np
import pandas as pd

**Creating an Imbalanced Dataset:**

In [2]:
# Class sizes for a synthetic two-class dataset with a 9:1 imbalance.
n_sample = 1_000       # total number of data points
class_0_ratio = 0.9    # share of the points that belong to class 0 (90%)

# 1000 * 0.9 = 900 points in class 0; the remaining 1000 - 900 = 100 go to class 1.
n_class_0 = int(class_0_ratio * n_sample)
n_class_1 = n_sample - n_class_0

# Seed NumPy's global random generator so the random draws that follow are
# reproducible from run to run.
np.random.seed(123)


In [3]:
n_class_0, n_class_1 #class_0 having 900 and class1 having 100 data point (9:1)

(900, 100)

In [8]:
def _gaussian_class(center, n_rows, label):
    """Return `n_rows` synthetic points drawn around `center`, tagged with `label`.

    Both features come from a normal distribution with mean `center` and
    standard deviation 1. `feature_1` is drawn before `feature_2`, so the
    global NumPy random stream is consumed in that order.
    """
    feature_1 = np.random.normal(loc=center, scale=1, size=n_rows)
    feature_2 = np.random.normal(loc=center, scale=1, size=n_rows)
    return pd.DataFrame({
        'feature_1': feature_1,
        'feature_2': feature_2,
        'target': [label] * n_rows,
    })


# Class 0 (n_class_0 rows) is centred on 0; class 1 (n_class_1 rows) on 2.
class_0 = _gaussian_class(center=0, n_rows=n_class_0, label=0)
class_1 = _gaussian_class(center=2, n_rows=n_class_1, label=1)

In [10]:
class_0.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [11]:
class_1.head()

Unnamed: 0,feature_1,feature_2,target
0,1.699768,2.139033,1
1,1.367739,2.025577,1
2,1.795683,1.803557,1
3,2.213696,3.312255,1
4,3.033878,3.187417,1


In [14]:
# Stack class 0 on top of class 1 and renumber the rows 0..n-1.
df = pd.concat([class_0, class_1], ignore_index=True)

In [15]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [18]:
#Checking total number of 0 and 1 in target

df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

**Dataset Created:**
-> 900 data points have target 0 and 100 data points have target 1 (a 9:1 imbalance).

### Upsampling: Increasing the number of minority-class points (by sampling with replacement) to match the majority class.

In [21]:
# Split the frame by label: target == 1 is the minority class (100 rows),
# target == 0 is the majority class (900 rows).
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [22]:
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1


In [23]:
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [26]:
#Performing the upsampling

from sklearn.utils  import resample

In [32]:
# Upsample the minority class: draw rows with replacement until there are as
# many minority rows as majority rows.
df_minority_upsample = resample(
    df_minority,
    n_samples=df_majority.shape[0],  # target size: the majority-class row count
    replace=True,                    # rows may be drawn more than once
    random_state=45,                 # fixed seed so the draw is repeatable
)

In [33]:
df_minority_upsample.shape

(900, 3)

In [35]:
df_minority_upsample['target'].value_counts()

1    900
Name: target, dtype: int64

In [36]:
# Combine the untouched majority rows with the upsampled minority rows.
# Upsampling with replacement repeats the original row labels, so the index is
# rebuilt as 0..n-1 (as was done when `df` was first assembled) to keep
# label-based lookups unambiguous.
df_upsampled = pd.concat([df_majority, df_minority_upsample]).reset_index(drop=True)

In [39]:
df_upsampled['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [43]:
df_upsampled.shape #now we can see total data points are 1800 (900 one's and 900 zeros)

(1800, 3)

### Downsampling: Reducing the majority-class points to match the number of minority-class points.

In [50]:
# Draw a fresh copy of the two-class data for the downsampling demo.
# NOTE(review): this cell does not re-seed NumPy, so these draws continue the
# global random stream and presumably differ from the frames built for the
# upsampling section -- confirm that is intended.
class_0 = pd.DataFrame({
    # feature_1 is drawn first, then feature_2 (mean 0, std 1).
    **{col: np.random.normal(0, 1, n_class_0) for col in ('feature_1', 'feature_2')},
    'target': [0] * n_class_0,
})
class_1 = pd.DataFrame({
    # Same draw order, centred on 2 instead of 0.
    **{col: np.random.normal(2, 1, n_class_1) for col in ('feature_1', 'feature_2')},
    'target': [1] * n_class_1,
})

In [51]:
# Re-assemble the working frame from the regenerated classes, with rows
# renumbered 0..n-1.
df = pd.concat((class_0, class_1), axis=0, ignore_index=True)

In [49]:
# Split the frame by label: 1 -> minority class, 0 -> majority class.
df_minority, df_majority = (df[df['target'] == label] for label in (1, 0))

In [None]:
# Check the class balance of the frame before downsampling
# (the original `df.va` was a truncated expression that raises AttributeError).
df['target'].value_counts()

In [54]:
# Downsample the majority class: draw rows without replacement until only as
# many majority rows are kept as there are minority rows.
df_majority_downsample = resample(
    df_majority,
    n_samples=df_minority.shape[0],  # target size: the minority-class row count
    replace=False,                   # without replacement: no row is picked twice
    random_state=45,                 # fixed seed so the draw is repeatable
)

In [60]:
df_majority_downsample['target'].value_counts() #from 900 data points,its reduced to 100

0    100
Name: target, dtype: int64

In [61]:
df_majority_downsample.head()

Unnamed: 0,feature_1,feature_2,target
63,-1.326265,-0.309751,0
502,-0.265446,0.761089,0
820,0.763161,-1.956698,0
410,0.38287,0.233407,0
614,0.533511,-0.315713,0


In [None]:

# Finish the downsampling workflow the same way the upsampling section does:
# combine the downsampled majority rows with the minority rows, renumber the
# rows 0..n-1, and check the class counts instead of displaying the whole
# unbalanced `df`.
df_downsampled = pd.concat([df_majority_downsample, df_minority]).reset_index(drop=True)
df_downsampled['target'].value_counts()