In [17]:
import pandas as pd
from sklearn.utils import resample

df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['High', 'Low', 'Low', 'High', 'High', 'Low', 'High', 'High', 'Low','Low','High', 'High', 'Low']
})
df.value_counts()
df['Class'].value_counts()

Class
High    7
Low     6
Name: count, dtype: int64

In [18]:
df_high=df[df['Class']=='High']
df_low=df[df['Class']=='Low']

In [19]:
df_high_downsampled=resample(df_high,replace=False,n_samples=len(df_low),random_state=42)

In [20]:
df_balanced=pd.concat([df_high_downsampled,df_low])

In [21]:
print(df_balanced['Class'].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


<ins>**Upsampling the Low class**<ins>

In [22]:
df_low_upsampled=resample(df_low,replace=True,n_samples=len(df_high),random_state=42)
df_balance=pd.concat([df_low_upsampled,df_high])
print(df_balance['Class'].value_counts())

Class
Low     7
High    7
Name: count, dtype: int64


In [23]:
import pandas as pd
from sklearn.utils import resample

df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Majority','Minority','Majority', 'Majority', 'Minority']
})
df.value_counts()
df['Class'].value_counts()

Class
Majority    9
Minority    4
Name: count, dtype: int64

In [24]:
df_high=df[df['Class']=='Majority']
df_low=df[df['Class']=='Minority']

In [25]:
df_minority_upsampled=resample(df_low,replace=True,n_samples=len(df_high),random_state=42)
df_bal=pd.concat([df_minority_upsampled,df_high])
print(df_bal['Class'].value_counts())

Class
Minority    9
Majority    9
Name: count, dtype: int64


**SMODE(Synthetic Minority Over Sampling Technique**

In [26]:
import pandas as pd
from sklearn.utils import resample

df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Majority','Minority','Majority', 'Majority', 'Minority']
})
df.value_counts()
df['Class'].value_counts()

Class
Majority    9
Minority    4
Name: count, dtype: int64

In [28]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
!python.exe -m pip install --upgrade pip

Collecting pip
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------------- ---------------------- 0.8/1.8 MB 32.8 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 4.6 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.0


In [30]:
!pip install imbalanced-learn



In [1]:
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE  # Import SMOTE

# Create the DataFrame
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Minority']
})

# Convert the categorical labels to numerical values
df['Class'] = df['Class'].map({'Majority': 0, 'Minority': 1})

# Define the features (X) and target (y)
X = df[['Age', 'Income']]
y = df['Class']

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Map resampled labels back to original categorical labels
y_resampled = y_resampled.map({0: 'Majority', 1: 'Minority'})

# Combine the resampled features and target into a balanced DataFrame
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=['Age', 'Income']), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

# Display the value counts of each class and the balanced DataFrame
print(df_balanced['Class'].value_counts())
print(df_balanced)


Class
Majority    9
Minority    9
Name: count, dtype: int64
    Age  Income     Class
0    22    2000  Majority
1    25    2500  Minority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
5    35    3800  Minority
6    40    4000  Majority
7    45    4200  Majority
8    50    4300  Majority
9    55    4500  Minority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Minority
13   51    4390  Minority
14   38    3909  Minority
15   67    5872  Minority
16   67    5799  Minority
17   28    3018  Minority


In [40]:
!pip uninstall scikit-learn imbalanced-learn -y



In [41]:
pip install -U scikit-learn imbalanced-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   -------- ------------------------------- 2.4/11.1 MB 26.9 MB/s eta 0:00:01
   -------- ------------------------------- 2.4/11.1 MB 26.9 MB/s eta 0:00:01
   ----------------- ---------------------- 5.0/11.1 MB 11.2 MB/s eta 0:00:01
   ----------------- ---------------------- 5.0/11.1 MB 11.2 MB/s eta 0:00:01
   ----------------- ---------------------- 5.0/11.1 MB 11.2 MB/s eta 0:00:01
   ----------------- ----

In [43]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE  # Import SMOTE

# Create the DataFrame
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Majority', 'Minority', 'Majority', 'Majority', 'Minority']
})

# Convert the categorical labels to numerical values
df['Class'] = df['Class'].map({'Majority': 0, 'Minority': 1})

# Define the features (X) and target (y)
X = df[['Age', 'Income']]
y = df['Class']

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Map resampled labels back to original categorical labels
y_resampled = y_resampled.map({0: 'Majority', 1: 'Minority'})

# Combine the resampled features and target into a balanced DataFrame
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=['Age', 'Income']), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

# Display the value counts of each class and the balanced DataFrame
print(df_balanced['Class'].value_counts())
print(df_balanced)


Class
Majority    9
Minority    9
Name: count, dtype: int64
    Age  Income     Class
0    22    2000  Majority
1    25    2500  Minority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
5    35    3800  Minority
6    40    4000  Majority
7    45    4200  Majority
8    50    4300  Majority
9    55    4500  Minority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Minority
13   51    4390  Minority
14   38    3909  Minority
15   67    5872  Minority
16   67    5799  Minority
17   28    3018  Minority


In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE

df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Minority', 'Minority', 'Minority', 'Majority',  'Majority', 'Majority', 'Majority', 'Majority']
})
df['Class'].value_counts()

#Step1: Convert categorical labels to numerical values
df['Class'] = df['Class'].map({'Majority': 0, 'Minority': 1})

#Step2: Split features (X) and target variable (y)
X = df[['Age', 'Income']]
y = df['Class']

#Step3: Apply SMOTE with k_neighbors=3 (reducing from default 5)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

#Step4: Convert numeric labels back to categorical
X_resampled,y_resampled.map({'Majority': 0, 'Minority': 1})

#Step5: Combine the resampled dataset
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=['Age', 'Income']), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

#Step6: print class distribution
print(df_balanced['Class'].value_counts())

#Step7: Display the unsampled dataset
print(df_balanced)

Class
1    9
0    9
Name: count, dtype: int64
    Age  Income  Class
0    22    2000      1
1    25    2500      0
2    27    2700      0
3    28    3200      0
4    30    3500      0
5    35    3800      1
6    40    4000      1
7    45    4200      1
8    50    4300      0
9    55    4500      0
10   60    5000      0
11   65    5500      0
12   70    6000      0
13   40    4031      1
14   35    3831      1
15   44    4176      1
16   35    3826      1
17   41    4040      1
