In [None]:
#Upsampling

In [2]:
# Import NumPy library for numerical operations
import numpy as np

# Import Pandas library for data manipulation and analysis
import pandas as pd

# Seed NumPy's global random generator so every run draws the same numbers
np.random.seed(123)

# Total number of rows in the synthetic dataset
n_samples = 1000

# Fraction of rows assigned to class 0 (the majority class): 90%
class_0_ratio = 0.9

# Number of class-0 rows.
# round() is used instead of int(): int() truncates, so a product that lands
# a hair below a whole number because of floating-point error
# (e.g. 100 * 0.29 -> 28.999999999999996) would silently lose a sample.
n_class_0 = round(n_samples * class_0_ratio)

# Number of class-1 (minority) rows: whatever is left, so the two counts
# always add up to n_samples
n_class_1 = n_samples - n_class_0


In [4]:
# Show how many rows were allotted to each class as (class 0, class 1)
(n_class_0, n_class_1)

(900, 100)

In [5]:
# Majority class (label 0): both features are drawn independently from a
# normal distribution with mean 0 and standard deviation 1, one value per
# class-0 row. The constant label column is attached afterwards.
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, n_class_0),
    'feature_2': np.random.normal(0, 1, n_class_0),
}).assign(target=0)

# Minority class (label 1): same recipe, but the mean is shifted to 2 so the
# two classes are distinguishable in feature space.
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(2, 1, n_class_1),
    'feature_2': np.random.normal(2, 1, n_class_1),
}).assign(target=1)

In [10]:
# Stack class_0 on top of class_1 to form the full dataset.
# ignore_index=True discards the per-class row labels and numbers the
# combined rows 0 .. n-1 in the same step.
df = pd.concat([class_0, class_1], ignore_index=True)

In [11]:
# Preview the first five rows (class 0 was stacked first, so these are all label 0)
df.head(n=5)

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [12]:
# Preview the last five rows (class 1 was stacked last, so these are all label 1)
df.tail(n=5)

Unnamed: 0,feature_1,feature_2,target
995,1.376371,2.845701,1
996,2.23981,0.880077,1
997,1.13176,1.640703,1
998,2.902006,0.390305,1
999,2.69749,2.01357,1


In [13]:
# Tally the rows per class label; the 900 vs 100 split confirms the imbalance
df.loc[:, 'target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,900
1,100


In [None]:
# Upsampling: increase the number of minority-class rows (target == 1) until they match the majority class

In [17]:
# Keep only the minority-class rows, i.e. those whose target equals 1
df_minority = df.loc[df['target'].eq(1)]

# Report (rows, columns) to confirm how many minority samples were selected
df_minority.shape

(100, 3)

In [19]:
# Keep only the majority-class rows, i.e. those whose target equals 0
df_majority = df.loc[df['target'].eq(0)]

# Report (rows, columns) to confirm how many majority samples were selected
df_majority.shape

(900, 3)

In [20]:
# Import the resample function from scikit-learn
from sklearn.utils import resample

# Draw rows from the minority class until there are as many of them as there
# are majority rows. replace=True (sampling with replacement) is what allows
# more rows to be drawn than the minority class contains, at the cost of
# duplicated rows. The fixed random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [21]:
# Shape (rows, columns) of the upsampled minority class; the row count
# should now equal the size of the majority class
df_minority_upsampled.shape

(900, 3)

In [22]:
# Peek at the first five resampled rows; the index shows which original
# minority rows were drawn
df_minority_upsampled.head(n=5)

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.19657,1.397425,1
914,1.93217,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1


In [25]:
# Build the balanced dataset from the upsampled minority rows and the
# untouched majority rows.
# ignore_index=True renumbers the result 0 .. n-1: sampling with replacement
# repeats the original row labels, and a non-unique index makes label-based
# lookups such as df_upsampled.loc[label] return several rows.
df_upsampled = pd.concat([df_minority_upsampled, df_majority], ignore_index=True)

In [26]:
# Count the rows per class after upsampling; both classes should now be the same size
df_upsampled.loc[:, 'target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,900
0,900


In [None]:
# Downsampling

In [29]:
# Import pandas for data manipulation
import pandas as pd

# Import NumPy for numerical operations
import numpy as np

# Seed NumPy's global random generator so the synthetic data is identical on every run
np.random.seed(123)

# Total number of rows in the dataset
n_samples = 1000

# Fraction of rows that belong to class 0 (the majority class)
class_0_ratio = 0.9

# Number of class-0 rows.
# round() is used instead of int(): int() truncates, so a product that lands
# a hair below a whole number because of floating-point error
# (e.g. 100 * 0.29 -> 28.999999999999996) would silently lose a sample.
n_class_0 = round(n_samples * class_0_ratio)

# Number of class-1 (minority) rows: the remainder, so both counts sum to n_samples
n_class_1 = n_samples - n_class_0

# Class 0: two features drawn from a normal distribution (mean=0, std=1),
# every row labelled 0
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

# Class 1: same features with the mean shifted to 2 so the class is
# distinguishable from class 0, every row labelled 1
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})

# Stack the two classes into one dataset and renumber the rows 0 .. n-1
df = pd.concat([class_0, class_1]).reset_index(drop=True)

# Print the per-class row counts to confirm the dataset is imbalanced
print(df['target'].value_counts())

target
0    900
1    100
Name: count, dtype: int64


In [31]:
# Split the dataset by label:
# the minority class is every row whose target equals 1 ...
df_minority = df.loc[df['target'].eq(1)]

# ... and the majority class is every row whose target equals 0
df_majority = df.loc[df['target'].eq(0)]

In [33]:
# Import the resample function from scikit-learn
from sklearn.utils import resample

# Shrink the majority class to the size of the minority class by drawing
# that many rows from it. replace=False (sampling without replacement) means
# no row is picked twice. The fixed random_state makes the draw repeatable.
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],
    replace=False,
    random_state=42,
)

In [34]:
# Shape (rows, columns) of the downsampled majority class; the row count
# should now equal the size of the minority class
df_majority_downsampled.shape

(100, 3)

In [36]:
# Build the balanced dataset: every original minority row followed by the
# reduced set of majority rows
df_downsampled = pd.concat(
    [df_minority, df_majority_downsampled],
)

In [37]:
# Count the rows per class after downsampling; both classes should now be the same size
df_downsampled['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,100
0,100
