**BALANCING A DATASET-SAMPLING**

In [2]:
import pandas as pd
from sklearn.utils import resample
df=pd.DataFrame({'Age':[22,25,28,29,30,35,40,45,50,55,60,65,70],
                'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
                'Class':['High','Low','Low','High','High','Low','High','High','Low','Low','High','High','Low']})

High Class :7 instances.

Low Class:6 instances.

In [3]:
#Seperate majority and minority classes
df_high=df[df['Class']=='High']
df_low=df[df['Class']=='Low']
print(df_high)
print(df_low)

    Age  Income Class
0    22    2000  High
3    29    3200  High
4    30    3500  High
6    40    4000  High
7    45    4200  High
10   60    5000  High
11   65    5500  High
    Age  Income Class
1    25    2500   Low
2    28    2700   Low
5    35    3800   Low
8    50    4300   Low
9    55    4500   Low
12   70    6000   Low


**Downsample majority class**

In [4]:

df_high_downsampled=resample(df_high,replace=False,n_samples=len(df_low),random_state=42)


In [7]:
#Combine downsampled majority with minority class
df_balanced=pd.concat([df_high_downsampled,df_low])
print(df_high_downsampled)

    Age  Income Class
0    22    2000  High
3    29    3200  High
10   60    5000  High
4    30    3500  High
7    45    4200  High
6    40    4000  High


In [6]:
print(df_balanced['Class'].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


**Upsampling the Minority Class**

In [10]:
import pandas as pd
from sklearn.utils import resample
df=pd.DataFrame({'Age':[22,25,28,29,30,35,40,45,50,55,60,65,70],
                'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
                'Class':['Minority','Majority','Majority','Majority','Majority','Minority','Minority','Minority','Majority','Majority','Majority','Majority','Majority']})

**Majority class has 9 instances**

**Minority class has 4 instances**

In [11]:
df_majority=df[df['Class']=='Majority']
df_minority=df[df['Class']=='Minority']
print(df_majority)
print(df_minority)

    Age  Income     Class
1    25    2500  Majority
2    28    2700  Majority
3    29    3200  Majority
4    30    3500  Majority
8    50    4300  Majority
9    55    4500  Majority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Majority
   Age  Income     Class
0   22    2000  Minority
5   35    3800  Minority
6   40    4000  Minority
7   45    4200  Minority


In [21]:
#Upsample minority class
df_minority_upsampled=resample(df_minority,replace=True,n_samples=len(df_majority),random_state=42)
df_minority_upsampled

Unnamed: 0,Age,Income,Class
6,40,4000,Minority
7,45,4200,Minority
0,22,2000,Minority
6,40,4000,Minority
6,40,4000,Minority
7,45,4200,Minority
0,22,2000,Minority
0,22,2000,Minority
6,40,4000,Minority


In [23]:
#Combine upsampled minority with majority class
df_balanced=pd.concat([df_majority,df_minority_upsampled])

In [20]:
print(df_balanced['Class'].value_counts())

Class
Majority    9
Minority    9
Name: count, dtype: int64


In [27]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


## Smote(Synthetic Minority Over-sampling Technique)
1.smote generates synthetic examples by its own without taking dupliacte values

2.convert categorical class labels into numeric form for SMOTE to work.

3.Apply smote to balance dataset.

4.Convert back to original categorical labels

5.combine the resampled data into a final balanced dataset.

## pip uninstall scikit-learn imbalanced-learn -y
## pip install -U scikit-learn imbalanced-learn

In [30]:
pip install -U scikit-learn imbalanced-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.6.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
df=pd.DataFrame({'Age':[22,25,28,29,30,35,40,45,50,55,60,65,70],
                'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
                'Class':['Minority','Majority','Majority','Majority','Majority','Minority','Minority','Minority','Majority','Majority','Majority','Majority','Majority']})

In [2]:
df['Class'].value_counts()

#Step1: Convert categorical labels to numerical values
df['Class'] = df['Class'].map({'Majority': 0, 'Minority': 1})

#Step2: Split features (X) and target variable (y)
X = df[['Age', 'Income']]
y = df['Class']

#Step3: Apply SMOTE with k_neighbors=3 (reducing from default 5)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

#Step4: Convert numeric labels back to categorical
X_resampled,y_resampled.map({'Majority': 0, 'Minority': 1})

#Step5: Combine the resampled dataset
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=['Age', 'Income']), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

#Step6: print class distribution
print(df_balanced['Class'].value_counts())

#Step7: Display the unsampled dataset
print(df_balanced)

Class
1    9
0    9
Name: count, dtype: int64
    Age  Income  Class
0    22    2000      1
1    25    2500      0
2    28    2700      0
3    29    3200      0
4    30    3500      0
5    35    3800      1
6    40    4000      1
7    45    4200      1
8    50    4300      0
9    55    4500      0
10   60    5000      0
11   65    5500      0
12   70    6000      0
13   40    4031      1
14   35    3831      1
15   44    4176      1
16   35    3826      1
17   41    4040      1
