In [1]:
import numpy as np
import pandas as pd

In [2]:
# Set the random seed for reproducibility
np.random.seed(123)

In [3]:
# Create a dataframe with two classes
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

In [4]:
n_class_0,n_class_1

(900, 100)

In [6]:
# create data frame for the imbalnced data set
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0), #Draw random samples from a normal (Gaussian) distribution.
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0]*n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_1),
    'target': [1]*n_class_1
})

In [7]:
df=pd.concat([class_0,class_1]).reset_index(drop=True)

In [9]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [10]:
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,-0.623629,0.845701,1
996,0.23981,-1.119923,1
997,-0.86824,-0.359297,1
998,0.902006,-1.609695,1
999,0.69749,0.01357,1


In [11]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

# upsampling

In [13]:
df_major = df[df['target']==0]

In [14]:
df_minor = df[df['target']==1]

In [15]:
df_major

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


In [16]:
df_minor

Unnamed: 0,feature_1,feature_2,target
900,-0.300232,0.139033,1
901,-0.632261,0.025577,1
902,-0.204317,-0.196443,1
903,0.213696,1.312255,1
904,1.033878,1.187417,1
...,...,...,...
995,-0.623629,0.845701,1
996,0.239810,-1.119923,1
997,-0.868240,-0.359297,1
998,0.902006,-1.609695,1


In [17]:
from sklearn.utils import resample

In [18]:
df_minor_upsample = resample(df_minor,replace=True,n_samples=len(df_major),random_state=42)

In [20]:
df_minor_upsample.shape

(900, 3)

In [21]:
df_upsampled = pd.concat([df_major,df_minor_upsample])

In [22]:
df_upsampled['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [23]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

# downsampling

In [24]:
df_down0 = df[df['target']==0]
df_down1 = df[df['target']==1]

In [25]:
type(df_down0)

pandas.core.frame.DataFrame

In [26]:
df_down0,df_down1

(     feature_1  feature_2  target
 0    -1.085631   0.551302       0
 1     0.997345   0.419589       0
 2     0.282978   1.815652       0
 3    -1.506295  -0.252750       0
 4    -0.578600  -0.292004       0
 ..         ...        ...     ...
 895   0.238761  -0.003155       0
 896  -1.106386  -0.430660       0
 897   0.366732  -0.146416       0
 898   1.023906   1.160176       0
 899  -0.210056  -0.641512       0
 
 [900 rows x 3 columns],
      feature_1  feature_2  target
 900  -0.300232   0.139033       1
 901  -0.632261   0.025577       1
 902  -0.204317  -0.196443       1
 903   0.213696   1.312255       1
 904   1.033878   1.187417       1
 ..         ...        ...     ...
 995  -0.623629   0.845701       1
 996   0.239810  -1.119923       1
 997  -0.868240  -0.359297       1
 998   0.902006  -1.609695       1
 999   0.697490   0.013570       1
 
 [100 rows x 3 columns])

In [27]:
df_major_dwonsample=resample(df_down0,replace=True, n_samples=len(df_down1), random_state=42)

In [28]:
df_major_dwonsample

Unnamed: 0,feature_1,feature_2,target
102,0.712265,0.718151,0
435,1.199988,0.574621,0
860,0.304515,-0.759475,0
270,-1.213385,0.675504,0
106,0.179549,-0.202659,0
...,...,...,...
201,-0.598105,1.575650,0
269,0.420180,0.570631,0
862,-0.392309,0.446491,0
815,-0.148405,-0.457929,0


In [30]:
df_major_dwonsample.shape

(100, 3)

In [31]:
df_dwn = pd.concat([df_down1,df_major_dwonsample])

In [34]:
df_dwn['target'].value_counts()

1    100
0    100
Name: target, dtype: int64