In [2]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [7]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

def balance_to_majority(df, target_col='target', majority_class=8, random_state=42,
                        noise_std=0.01):
    """
    Oversample every non-majority class up to the majority class's row count.

    Classes with at least two rows are oversampled with SMOTE. Classes with
    at most one row cannot be interpolated, so their row is replicated and
    Gaussian noise is added to the numeric feature columns.

    Parameters:
    - df: Input DataFrame with the feature columns and the target column.
      It is not modified.
    - target_col: Name of the target column (default: 'target')
    - majority_class: Label whose row count every other class is raised to
      (default: 8)
    - random_state: Seed (int, None, or np.random.RandomState) used for both
      SMOTE and the replication noise, so repeated calls with the same seed
      return identical frames
    - noise_std: Standard deviation of the noise added to replicated rows
      (default: 0.01)

    Returns:
    - DataFrame with a fresh RangeIndex in which every class found in
      `target_col` has as many rows as `majority_class`.

    Raises:
    - KeyError: if `target_col` is not a column of `df`, or `majority_class`
      does not occur in that column.

    NOTE(review): a class that already has more rows than `majority_class`
    is handed to SMOTE with a target below its current size; imbalanced-learn
    is expected to reject that -- confirm against its documentation.
    """

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Row count of the reference class; every other class is raised to this.
    class_counts = y.value_counts()
    if majority_class not in class_counts.index:
        raise KeyError(
            f"majority_class {majority_class!r} not found in column {target_col!r}"
        )
    # Plain int so list repetition and SMOTE's sampling_strategy get a native integer.
    majority_count = int(class_counts.loc[majority_class])

    # One seeded generator for the noise branch, so `random_state` really
    # controls the whole output instead of only the SMOTE branch.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # The majority rows are kept as they are; resampled classes are appended.
    dfs_resampled = [df[df[target_col] == majority_class]]

    for class_label in y.unique():
        if class_label == majority_class:
            continue

        df_class = df[df[target_col] == class_label]
        n_samples = len(df_class)

        # Case 1: 0 or 1 rows -- SMOTE has no neighbour to interpolate with.
        if n_samples <= 1:
            # Replicate the available row(s) majority_count times.
            df_resampled = pd.concat([df_class] * majority_count, ignore_index=True)

            # Jitter numeric features so the copies are not exact duplicates.
            for col in X.columns:
                if pd.api.types.is_numeric_dtype(df_resampled[col]):
                    df_resampled[col] = df_resampled[col] + rng.normal(
                        0, noise_std, size=len(df_resampled)
                    )

        # Case 2: at least 2 rows -- synthesize new rows with SMOTE.
        else:
            smote = SMOTE(sampling_strategy={class_label: majority_count},
                          k_neighbors=min(n_samples - 1, 5),  # never more neighbours than peers
                          random_state=random_state)

            # SMOTE is fit on the full data set; only the rows of the class
            # being processed are kept from its output.
            X_res, y_res = smote.fit_resample(X, y)
            df_resampled = pd.DataFrame(X_res, columns=X.columns)
            df_resampled[target_col] = y_res
            df_resampled = df_resampled[df_resampled[target_col] == class_label]

        dfs_resampled.append(df_resampled)

    # Combine all classes into one frame with a clean index.
    balanced_df = pd.concat(dfs_resampled, ignore_index=True)

    return balanced_df

# Usage: read the input CSV, balance its classes, then report the outcome.
pca_csv_path = r'D:\Machine-Learning\ECG-Based Arrhythmia Detection\Preprocessing\PCA_Transformed.csv'
df = pd.read_csv(pca_csv_path)
balanced_df = balance_to_majority(df)

# Per-class row counts first, then the (rows, columns) shape of the balanced frame.
for summary in (balanced_df['target'].value_counts(), balanced_df.shape):
    print(summary)

target
8     367
3     367
13    367
2     367
4     367
5     367
14    367
6     367
10    367
1     367
11    367
7     367
0     367
12    367
9     367
Name: count, dtype: int64
(5505, 30)


In [8]:
# Write the balanced frame to CSV in the working directory, without the row index.
balanced_df.to_csv(path_or_buf='balanced_ecg_data.csv', index=False)