## Handling imbalanced data - 2 techniques
1 Up Sampling
2 Down Sampling

In [2]:
import numpy as np
import pandas as pd

# set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio=0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

In [3]:
n_class_0,n_class_1

(900, 100)

In [6]:
## create dataframe with 2 features and imbalanced data
class_0=pd.DataFrame({
    'feature_1': np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature_2': np.random.normal(loc=0,scale=1,size=n_class_0),
    'target': [0] * n_class_0
})

class_1=pd.DataFrame({
    'feature_1': np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature_2': np.random.normal(loc=2,scale=1,size=n_class_1),
    'target': [1] * n_class_1
})

In [7]:
class_0,class_1

(     feature_1  feature_2  target
 0    -0.300232   0.667532       0
 1    -0.632261   0.100458       0
 2    -0.204317  -0.012610       0
 3     0.213696   0.219907       0
 4     1.033878   0.813623       0
 ..         ...        ...     ...
 895   0.356445  -0.486628       0
 896  -0.266376  -0.818430       0
 897   0.804753  -1.138029       0
 898  -1.750640   1.062592       0
 899   1.539891  -0.831040       0
 
 [900 rows x 3 columns],
     feature_1  feature_2  target
 0    1.632581   3.300921       1
 1    2.575273   1.069348       1
 2    2.439351   1.148687       1
 3    1.271848   2.136584       1
 4    1.111470   1.575746       1
 ..        ...        ...     ...
 95   0.438134   4.540514       1
 96   1.232181   1.917294       1
 97   2.387223   2.444621       1
 98   0.787082   3.896404       1
 99   4.018714   2.237581       1
 
 [100 rows x 3 columns])

In [8]:
df=pd.concat([class_0,class_1]).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-0.300232,0.667532,0
1,-0.632261,0.100458,0
2,-0.204317,-0.01261,0
3,0.213696,0.219907,0
4,1.033878,0.813623,0


In [10]:
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,0.438134,4.540514,1
996,1.232181,1.917294,1
997,2.387223,2.444621,1
998,0.787082,3.896404,1
999,4.018714,2.237581,1


In [20]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [12]:
## upsampling - increase the minority that is 1 value
## so we will create a variable where target =1 

df_minority=df[df['target']==1]
df_majority=df[df['target']==0]



In [14]:
from sklearn.utils import resample
 ## resample library extrapolates minority data , replace means those 1 which are increasing will replace 0s 
    
df_minority_upsampled=resample(df_minority,replace=True, 
        n_samples=len(df_majority),
        random_state=42)

In [15]:
df_minority_upsampled.shape

(900, 3)

In [16]:
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,1.168666,2.305451,1
992,1.885705,2.954339,1
914,4.417832,3.790856,1
971,3.666319,1.731246,1
960,0.990663,1.305364,1


In [18]:
## you can now concat df_majority with df_minority_upsampled
df_upsampled=pd.concat([df_majority,df_minority_upsampled])
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,-0.300232,0.667532,0
1,-0.632261,0.100458,0
2,-0.204317,-0.012610,0
3,0.213696,0.219907,0
4,1.033878,0.813623,0
...,...,...,...
952,1.783976,0.627095,1
965,2.165885,0.905984,1
976,1.921324,0.480405,1
942,1.518157,1.312561,1


In [19]:
df_upsampled['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [None]:
## Down Sampling - create same dataset 

In [21]:
import numpy as np
import pandas as pd

# set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio=0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

## create dataframe with 2 features and imbalanced data
class_0=pd.DataFrame({
    'feature_1': np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature_2': np.random.normal(loc=0,scale=1,size=n_class_0),
    'target': [0] * n_class_0
})

class_1=pd.DataFrame({
    'feature_1': np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature_2': np.random.normal(loc=2,scale=1,size=n_class_1),
    'target': [1] * n_class_1
})

df=pd.concat([class_0,class_1]).reset_index(drop=True)


In [22]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [23]:
## create same minority and majority
df_minority=df[df['target']==1]
df_majority=df[df['target']==0]

In [25]:
from sklearn.utils import resample
 
    
df_majority_downsampled=resample(df_majority,replace=False, # beacsue we want to reduce the data points
        n_samples=len(df_minority),
        random_state=42)

In [26]:
df_majority_downsampled.shape

(100, 3)

In [27]:
df_downsampled=pd.concat([df_minority,df_majority_downsampled])
df_downsampled

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1
...,...,...,...
398,-0.168426,0.553775,0
76,-0.403366,0.081491,0
196,-0.269293,0.611238,0
631,-0.295829,0.671673,0


In [28]:
df_downsampled['target'].value_counts()

1    100
0    100
Name: target, dtype: int64

In [None]:
## disadvantages of down  sampling - we loose data points ; so upsampling is appropriate