## Imbalanced Dataset Handling
 1. Up Sampling
 2. Down Sampling

In [73]:
import pandas as pd
import numpy as np

In [74]:
# Seed NumPy's legacy global RNG so that every run draws the same sequence of
# "random" numbers. A fixed starting point keeps the synthetic data below
# consistent and reproducible, which is handy when sharing or debugging the
# notebook: everyone gets the same values.
np.random.seed(123)

# Total size of the synthetic two-class dataset and the share held by class 0.
n_samples = 1000
class_0_ratio = 0.9

# Class 0 takes its share of the rows; class 1 receives whatever is left over.
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

In [75]:
# Sanity-check the derived class sizes as (class 0 count, class 1 count).
(n_class_0, n_class_1)

(900, 100)

In [76]:
# `loc` is the mean (centre) of the normal distribution and `scale` its
# standard deviation. Class 0 is centred on 0 in both features, class 1 on 2.
# The draws consume the global NumPy RNG in this order: class 0 feature_1,
# class 0 feature_2, class 1 feature_1, class 1 feature_2.
class_0 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_0),
        feature_2=np.random.normal(0, 1, n_class_0),
        target=[0] * n_class_0,
    )
)

class_1 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(2, 1, n_class_1),
        feature_2=np.random.normal(2, 1, n_class_1),
        target=[1] * n_class_1,
    )
)

In [77]:
# Stack the two class frames and renumber the rows 0..n-1 in a single step.
df = pd.concat([class_0, class_1], ignore_index=True)
df

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [78]:
# Count rows per class label to confirm the imbalance.
df["target"].value_counts()

0    900
1    100
Name: target, dtype: int64

### Up Sampling

In [79]:
# Partition the rows by label: target 1 is the minority class, target 0 the majority.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [80]:
# Preview the first rows of the majority-class subset.
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [81]:
from sklearn.utils import resample

# `resample` is used here to upsample the minority class of a dataset.
# Upsampling is a common technique for imbalanced datasets, where one class has
# significantly fewer samples than the other(s). Balancing the class
# distribution this way can improve the performance of a model.
#
# Arguments passed to `resample` in the upsampling cell:
#
#   df_minority
#       The DataFrame containing the minority-class samples.
#
#   replace=True
#       Sample with replacement: each minority sample can be selected more
#       than once during resampling.
#
#   n_samples=len(df_majority)
#       Number of samples to draw. Here it makes the upsampled minority class
#       as large as the majority class (df_majority).
#
#   random_state=42
#       Seed for the resampling, so re-running the code gives the same result.
#       Useful for consistent experimentation and debugging.
#
# These notes are comments rather than a bare string literal: a string left as
# the last expression of a cell is echoed back as the cell's output.

"\nresample function to perform minority class upsampling in a dataset. \nThis technique is often used in machine learning when dealing with imbalanced datasets, where one \nclass has significantly fewer samples than the other(s). By upsampling the minority class, you aim \nto balance the class distribution and potentially improve the performance of your model.\n\ndf_minority: This seems to be the DataFrame containing the minority class samples.\n\nreplace=True: This parameter indicates whether the sampling should be done with replacement. \nWhen set to True, it means that each sample in the minority class can be selected more than \nonce during the resampling process.\n\nn_samples=len(df_majority): This specifies the number of samples you want to generate in the upsampled \nminority class. In this case, you're trying to make the number of minority class samples equal to the\nnumber of samples in the majority class (df_majority).\n\nrandom_state=42: This sets the random seed for reprod

In [82]:
# Grow the minority class to the size of the majority class by drawing rows
# with replacement; the fixed random_state makes the draw repeatable.
df_minority_upsample = resample(
    df_minority,
    n_samples=df_majority.shape[0],  # one draw per majority-class row
    replace=True,                    # rows may be picked more than once
    random_state=42,
)

df_minority_upsample

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.196570,1.397425,1
914,1.932170,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1
...,...,...,...
952,1.188902,2.189189,1
965,3.919526,1.980541,1
976,2.810326,3.604614,1
942,3.621531,2.168229,1


In [83]:
# Label counts of the resampled frame (it was drawn only from df_minority).
df_minority_upsample["target"].value_counts()

1    900
Name: target, dtype: int64

In [84]:
# Majority rows plus the upsampled minority rows; the original row labels are kept.
df_upsampled = pd.concat((df_majority, df_minority_upsample), axis=0)
df_upsampled.loc[:, "target"].value_counts()

0    900
1    900
Name: target, dtype: int64

### Down Sampling

In [85]:
# Build a fresh imbalanced dataset for the downsampling demo. No seed is set in
# this cell, so the draws continue the global NumPy random stream and the
# values differ from the frame built for the upsampling section.
class_0 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_0),
        feature_2=np.random.normal(0, 1, n_class_0),
        target=[0] * n_class_0,
    )
)

class_1 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(2, 1, n_class_1),
        feature_2=np.random.normal(2, 1, n_class_1),
        target=[1] * n_class_1,
    )
)

# Stack both classes and renumber the rows from 0.
df = pd.concat([class_0, class_1], ignore_index=True)
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.774224,0.285744,0
1,-1.201377,0.333279,0
2,1.096257,0.531807,0
3,0.861037,-0.354766,0
4,-1.520367,-1.120815,0


In [86]:
# Split the regenerated frame by label again (1 = minority, 0 = majority).
df_minority = df.loc[df["target"] == 1]
df_majority = df.loc[df["target"] == 0]

In [87]:
# Shrink the majority class to the size of the minority class by drawing a
# subset of its rows; the fixed random_state makes the draw repeatable.
df_majority_downsample = resample(
    df_majority,
    replace=False,               # Sample without replacement: no row is picked twice
    n_samples=len(df_minority),  # To match the minority class
    random_state=42,
)

df_majority_downsample.shape

(100, 3)

In [88]:
# Minority rows plus the downsampled majority rows; the original row labels are kept.
df_downsampled = pd.concat((df_minority, df_majority_downsample), axis=0)
df_downsampled.loc[:, "target"].value_counts()

1    100
0    100
Name: target, dtype: int64