1. Use SMOTE to generate synthetic minority samples instead of duplicating existing ones.

2. Convert the categorical class labels into numeric form before applying SMOTE.

3. Apply SMOTE to balance the dataset.

4. Convert the numeric labels back to the original categorical labels.

5. Combine the resampled features and labels into a final balanced dataset.

In [None]:

import pandas as pd
from imblearn.over_sampling import SMOTE
# Toy imbalanced dataset: 13 rows, of which the first 4 are 'Minority'
# and the remaining 9 are 'Majority'. Built row-wise from parallel lists.
df = pd.DataFrame(
    list(zip(
        [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4300, 4500, 4800, 5000, 5300, 5500],
        ['Minority'] * 4 + ['Majority'] * 9,
    )),
    columns=['Age', 'Income', 'Class'],
)

In [3]:
# Step 1: Encode the class labels as integers (Majority -> 0, Minority -> 1).
# NOTE: df['Class'] is overwritten in place, so this cell expects the original
# string labels; re-create df before running it a second time.
CLASS_TO_CODE = {'Majority': 0, 'Minority': 1}
CODE_TO_CLASS = {code: label for label, code in CLASS_TO_CODE.items()}
df['Class'] = df['Class'].map(CLASS_TO_CODE)

# Step 2: Separate the feature matrix (X) from the target vector (y).
FEATURE_COLUMNS = ['Age', 'Income']
X = df[FEATURE_COLUMNS]
y = df['Class']

# Step 3: Oversample with SMOTE. k_neighbors is lowered from the default 5
# to 3 because the minority class in df has only 4 rows, leaving at most
# 3 other minority samples to interpolate towards.
smote = SMOTE(k_neighbors=3, sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Decode the integer labels back to their categorical names.
y_resampled = y_resampled.map(CODE_TO_CLASS)

# Step 5: Reassemble features and labels into one balanced DataFrame.
df_resampled = pd.DataFrame(X_resampled, columns=FEATURE_COLUMNS)
df_resampled['Class'] = y_resampled

# Show the balanced dataset (original rows first, synthetic rows appended).
print(df_resampled)


    Age  Income     Class
0    22    2000  Minority
1    25    2500  Minority
2    27    2700  Minority
3    28    3200  Minority
4    30    3500  Majority
5    35    3800  Majority
6    40    4000  Majority
7    45    4300  Majority
8    50    4500  Majority
9    55    4800  Majority
10   60    5000  Majority
11   65    5300  Majority
12   70    5500  Majority
13   26    2668  Minority
14   25    2531  Minority
15   27    3159  Minority
16   27    3133  Minority
17   23    2199  Minority
