# handling imbalanced dataset 

In [1]:
import numpy as np
import pandas as pd

# set random seed for reproducibility
np.random.seed(123)

# create a dataframe with two classes

n_samples = 1000
class_0_rate = 0.9
n_class_0 = int(n_samples * class_0_rate)
n_class_1 = n_samples - n_class_0

In [2]:
n_class_0, n_class_1

(900, 100)

In [3]:
# create my dataframe with imbalaced dataset
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0]* n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_1),
    'target': [1]* n_class_1       })

In [6]:
df=pd.concat([class_0, class_1]).reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [8]:
df.tail()  # Display the last few rows of the dataframe

Unnamed: 0,feature_1,feature_2,target
995,-0.623629,0.845701,1
996,0.23981,-1.119923,1
997,-0.86824,-0.359297,1
998,0.902006,-1.609695,1
999,0.69749,0.01357,1


In [9]:
df['target'].value_counts()  # Check the distribution of classes

target
0    900
1    100
Name: count, dtype: int64

## Upsampling 
Minority class এর sample বাড়িয়ে majority class এর সাথে balance করা

In [None]:
df_minority = df[df['target'] == 1]  # Separate minority class (class 1)
df_majority = df[df['target'] == 0]  # Separate majority class (class 0)

In [17]:
from sklearn.utils import resample

df_minority_upsampled = resample(df_minority, 
                                replace=True, 
                                n_samples=len(df_majority), 
                                random_state=42)

In [None]:
df_minority_upsampled.shape # Check shape of upsampled minority class

(100, 3)

In [None]:
df_minority_upsampled.head() # Display first few rows of upsampled minority class

Unnamed: 0,feature_1,feature_2,target
102,0.712265,0.718151,0
435,1.199988,0.574621,0
860,0.304515,-0.759475,0
270,-1.213385,0.675504,0
106,0.179549,-0.202659,0


In [None]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled]) # Combine majority and upsampled minority classes
print(f"Upsampled dataset shape: {df_upsampled.shape}") # Print shape of balanced dataset

In [None]:
df_upsampled['target'].value_counts() # Check the distribution of classes after upsampling

target
1    100
0    100
Name: count, dtype: int64

## Downsampling
Majority class এর sample কমিয়ে minority class এর সাথে balance করা

In [26]:
# Downsampling: majority class কে minority class এর সমান করা
df_majority_downsampled = resample(df_majority, 
                                  replace=False, 
                                  n_samples=len(df_minority), 
                                  random_state=42)

In [27]:
df_majority_downsampled.shape # Check shape of downsampled majority class

(100, 3)

In [28]:
df_majority_downsampled.head() # Display first few rows of downsampled majority class

Unnamed: 0,feature_1,feature_2,target
70,0.468439,1.72092,0
827,1.089165,-0.464899,0
231,0.753869,-0.969798,0
588,0.588686,-0.70472,0
39,0.283627,1.012868,0


In [29]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority]) # Combine downsampled majority and minority classes
print(f"Downsampled dataset shape: {df_downsampled.shape}") # Print shape of balanced dataset

Downsampled dataset shape: (200, 3)


In [30]:
df_downsampled['target'].value_counts() # Check the distribution of classes after downsampling

target
0    100
1    100
Name: count, dtype: int64