**With Iterative train_test split**




In [5]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from skmultilearn.model_selection import iterative_train_test_split

def create_label_combinations(y):
    """Convert multi-label data into unique combination patterns"""
    return np.array([''.join(map(str, row)) for row in y.astype(int).values])

def oversample_multilabel(X, y, random_state=42):
    """
    Perform oversampling for multi-label data while preserving label relationships
    Uses SMOTE for numeric features
    """
    # Convert labels to label combination patterns
    label_combinations = create_label_combinations(y)
    
    # Get counts of each unique combination
    combination_counts = Counter(label_combinations)
    
    # Determine if we have enough samples for SMOTE
    min_samples = min(combination_counts.values())
    
    # Choose oversampling strategy based on available samples
    if min_samples >= 5:  # SMOTE requires at least k+1 samples (default k=5)
        oversample = SMOTE(random_state=random_state, k_neighbors=min(min_samples-1, 5))
    else:
        oversample = RandomOverSampler(random_state=random_state)
    
    # Create a temporary dataframe with features and label combinations
    temp_df = pd.DataFrame(X.copy())
    temp_df['label_combination'] = label_combinations
    
    # Perform oversampling
    X_resampled_temp, y_resampled_temp = oversample.fit_resample(
        temp_df.drop('label_combination', axis=1),
        temp_df['label_combination']
    )
    
    # Convert back to original format
    y_resampled = pd.DataFrame([list(map(int, combo)) for combo in y_resampled_temp],
                              columns=y.columns)
    
    return X_resampled_temp, y_resampled
# Load Dataset
df = pd.read_csv('Data/New-Class-smell.csv')
print("Shape of original DataFrame: ", df.shape)

# Drop Address Column
df = df.drop(columns=['Address','Hierarchy Duplication','Futile Abstract Pipeline','Futile Hierarchy'])
 
# Check for Missing Values
missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values[missing_values > 0])

# Drop rows where any label in smell_columns is NaN
label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]



# Check for Duplicate Rows
duplicates = df[df.duplicated()]
print(f"\nDuplicate Rows Found: {duplicates.shape[0]}")
df = df.drop_duplicates()
print("Shape after removing duplicate rows: ", df.shape)

# Separate Features and Labels
X = df.drop(columns=label_columns)
y = df[label_columns]
#Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply oversampling
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Split into training and testing sets
X_train, y_train, X_test, y_test = iterative_train_test_split(
    X_resampled, y_resampled.values, test_size=0.2
)

# Function to print label statistics
def print_label_statistics(y, title):
    print(f"\n{title}")
    for column in y.columns:
        print(f"\nValue counts for {column}:")
        if isinstance(y, pd.DataFrame):
            print(y[column].value_counts())
        else:
            print(pd.Series(y[:, y.columns.get_loc(column)]).value_counts())
            
    # Print imbalance ratios
    print("\nImbalance Ratios:")
    for column in y.columns:
        if isinstance(y, pd.DataFrame):
            counts = y[column].value_counts()
        else:
            counts = pd.Series(y[:, y.columns.get_loc(column)]).value_counts()
        ratio = counts.max() / counts.min()
        print(f"{column}: 1:{ratio:.2f}")

# Print statistics
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(pd.DataFrame(y_resampled, columns=y.columns), 
                      "After Oversampling Statistics:")

Shape of original DataFrame:  (373400, 50)

Missing Values:
 Series([], dtype: int64)

Duplicate Rows Found: 134040
Shape after removing duplicate rows:  (239360, 46)


KeyboardInterrupt: 

**With Normal Split**

In [6]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  # Changed from skmultilearn

def create_label_combinations(y):
    """Convert multi-label data into unique combination patterns"""
    return np.array([''.join(map(str, row)) for row in y.astype(int).values])

def oversample_multilabel(X, y, random_state=42):
    """
    Perform oversampling for multi-label data while preserving label relationships
    Uses SMOTE for numeric features
    """
    # Convert labels to label combination patterns
    label_combinations = create_label_combinations(y)
    
    # Get counts of each unique combination
    combination_counts = Counter(label_combinations)
    
    # Determine if we have enough samples for SMOTE
    min_samples = min(combination_counts.values())
    
    # Choose oversampling strategy based on available samples
    if min_samples >= 5:
        oversample = SMOTE(random_state=random_state, k_neighbors=min(min_samples-1, 5))
    else:
        oversample = RandomOverSampler(random_state=random_state)
    
    # Perform oversampling directly without creating temporary dataframe
    X_resampled_temp, y_resampled_temp = oversample.fit_resample(X, label_combinations)
    
    # Convert back to original format
    y_resampled = pd.DataFrame([list(map(int, combo)) for combo in y_resampled_temp],
                              columns=y.columns)
    
    return X_resampled_temp, y_resampled

# Load and preprocess data
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy'])

# Define label columns
label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

# Remove duplicates
df = df.drop_duplicates()

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply oversampling
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Use regular train_test_split instead of iterative_train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)  # Stratify by label combinations
)

# Function to print label statistics
def print_label_statistics(y, title):
    print(f"\n{title}")
    for column in y.columns:
        print(f"\nValue counts for {column}:")
        print(y[column].value_counts())
            
    # Print imbalance ratios
    print("\nImbalance Ratios:")
    for column in y.columns:
        counts = y[column].value_counts()
        ratio = counts.max() / counts.min()
        print(f"{column}: 1:{ratio:.2f}")

# Print statistics
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(y_resampled, "After Oversampling Statistics:")


Original Dataset Statistics:

Value counts for Brain Class:
Brain Class
False    238526
True        834
Name: count, dtype: int64

Value counts for Data Class:
Data Class
False    220974
True      18386
Name: count, dtype: int64

Value counts for God Class:
God Class
False    235824
True       3536
Name: count, dtype: int64

Value counts for Schizofrenic Class:
Schizofrenic Class
False    220037
True      19323
Name: count, dtype: int64

Value counts for Model Class:
Model Class
True     238985
False       375
Name: count, dtype: int64

Imbalance Ratios:
Brain Class: 1:286.00
Data Class: 1:12.02
God Class: 1:66.69
Schizofrenic Class: 1:11.39
Model Class: 1:637.29

After Oversampling Statistics:

Value counts for Brain Class:
Brain Class
0    1791765
1     398170
Name: count, dtype: int64

Value counts for Data Class:
Data Class
0    1592680
1     597255
Name: count, dtype: int64

Value counts for God Class:
God Class
0    1791765
1     398170
Name: count, dtype: int64

Value counts fo

**Usaing SMOTE and added Label correlation**

In [11]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

def analyze_label_relationships(y, title="Label Relationships"):
    """Analyze and print relationships between labels"""
    print(f"\n{title}")
    n_labels = y.shape[1]
    relationships = pd.DataFrame(index=y.columns, columns=y.columns)
    
    for i in range(n_labels):
        for j in range(i+1, n_labels):
            # Create contingency table
            contingency = pd.crosstab(y.iloc[:,i], y.iloc[:,j])
            # Calculate chi-square test
            chi2, p_value, _, _ = chi2_contingency(contingency)
            relationships.iloc[i,j] = p_value
            relationships.iloc[j,i] = p_value
    
    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(relationships)
    return relationships

def create_label_combinations(y):
    """Convert multi-label data into unique combination patterns"""
    return np.array([''.join(map(str, row)) for row in y.astype(int).values])

def validate_label_preservation(original_y, resampled_y, threshold=0.05):
    """
    Validate that label relationships are preserved after resampling
    Returns True if relationships are preserved
    """
    orig_relationships = analyze_label_relationships(original_y, "Original Relationships")
    new_relationships = analyze_label_relationships(resampled_y, "Resampled Relationships")
    
    # Compare relationship strengths
    relationship_preserved = True
    for i in range(len(original_y.columns)):
        for j in range(i+1, len(original_y.columns)):
            orig_sig = orig_relationships.iloc[i,j] < threshold
            new_sig = new_relationships.iloc[i,j] < threshold
            if orig_sig != new_sig:
                print(f"\nWarning: Relationship changed between {original_y.columns[i]} and {original_y.columns[j]}")
                relationship_preserved = False
    
    return relationship_preserved

def oversample_multilabel(X, y, random_state=42):
    """
    Perform oversampling for multi-label data while preserving label relationships
    """
    # Convert labels to label combination patterns
    label_combinations = create_label_combinations(y)
    
    # Get counts of each unique combination
    combination_counts = Counter(label_combinations)
    
    # Calculate target number for each combination
    max_count = max(combination_counts.values())
    sampling_strategy = {k: max_count for k in combination_counts.keys()}
    
    # Determine if we have enough samples for SMOTE
    min_samples = min(combination_counts.values())
    
    if min_samples >= 5:
        oversample = SMOTE(
            random_state=random_state,
            k_neighbors=min(min_samples-1, 5),
            sampling_strategy=sampling_strategy
        )
    else:
        oversample = RandomOverSampler(
            random_state=random_state,
            sampling_strategy=sampling_strategy
        )
    
    # Perform oversampling
    X_resampled_temp, y_resampled_temp = oversample.fit_resample(X, label_combinations)
    
    # Convert back to original format
    y_resampled = pd.DataFrame([list(map(int, combo)) for combo in y_resampled_temp],
                              columns=y.columns)
    
    # Validate preservation of label relationships
    if not validate_label_preservation(y, y_resampled):
        print("\nWarning: Some label relationships may have been altered during oversampling")
    
    return X_resampled_temp, y_resampled

# Load and preprocess data
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy'])

# Define label columns
label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

# Remove duplicates
df = df.drop_duplicates()

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print initial label relationships
print("\nAnalyzing initial label relationships...")
analyze_label_relationships(y, "Initial Label Relationships")

# Apply oversampling
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Use train_test_split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)
)

def print_label_statistics(y, title):
    print(f"\n{title}")
    for column in y.columns:
        print(f"\nValue counts for {column}:")
        print(y[column].value_counts())
            
    print("\nImbalance Ratios:")
    for column in y.columns:
        counts = y[column].value_counts()
        ratio = counts.max() / counts.min()
        print(f"{column}: 1:{ratio:.2f}")

# Print statistics
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(y_resampled, "After Oversampling Statistics:")


Analyzing initial label relationships...

Initial Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN       0.0                0.0   
God Class             0.000677        0.0       NaN                0.0   
Schizofrenic Class         0.0        0.0       0.0                NaN   
Model Class           0.479294        0.0  0.030856           0.000369   

                   Model Class  
Brain Class           0.479294  
Data Class                 0.0  
God Class             0.030856  
Schizofrenic Class    0.000369  
Model Class                NaN  

Original Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677               

**Ssing Cluster-based SMOTE, added ratios for more controlled sampling and Validation of Label Relationships**

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import KMeansSMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from sklearn.metrics import hamming_loss, jaccard_score

def analyze_label_relationships(y, title="Label Relationships"):
    """Analyze and print relationships between labels"""
    print(f"\n{title}")
    n_labels = y.shape[1]
    relationships = pd.DataFrame(index=y.columns, columns=y.columns)
    
    for i in range(n_labels):
        for j in range(i+1, n_labels):
            contingency = pd.crosstab(y.iloc[:,i], y.iloc[:,j])
            chi2, p_value, _, _ = chi2_contingency(contingency)
            relationships.iloc[i,j] = p_value
            relationships.iloc[j,i] = p_value
    
    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(relationships)
    return relationships

def create_label_combinations(y):
    """Convert multi-label data into unique combination patterns"""
    return np.array([''.join(map(str, row)) for row in y.astype(int).values])

def calculate_multilabel_metrics(y_true, y_pred):
    """Calculate multilabel-specific metrics"""
    metrics = {
        'hamming_loss': hamming_loss(y_true, y_pred),
        'jaccard_score': jaccard_score(y_true, y_pred, average='samples'),
        'label_correlation': analyze_label_relationships(pd.DataFrame(y_pred, columns=y_true.columns))
    }
    return metrics

def validate_label_preservation(original_y, resampled_y, threshold=0.05):
    """Validate that label relationships are preserved after resampling"""
    metrics = calculate_multilabel_metrics(original_y, resampled_y[:len(original_y)])
    
    orig_relationships = analyze_label_relationships(original_y, "Original Relationships")
    new_relationships = analyze_label_relationships(resampled_y, "Resampled Relationships")
    
    relationship_preserved = True
    for i in range(len(original_y.columns)):
        for j in range(i+1, len(original_y.columns)):
            orig_sig = orig_relationships.iloc[i,j] < threshold
            new_sig = new_relationships.iloc[i,j] < threshold
            if orig_sig != new_sig:
                print(f"\nWarning: Relationship changed between {original_y.columns[i]} and {original_y.columns[j]}")
                relationship_preserved = False
    
    print("\nMultilabel Metrics:")
    print(f"Hamming Loss: {metrics['hamming_loss']:.4f}")
    print(f"Jaccard Score: {metrics['jaccard_score']:.4f}")
    
    return relationship_preserved

def oversample_multilabel(X, y, random_state=42):
    """Perform oversampling using Cluster-based SMOTE with custom ratios"""
    # Define custom sampling ratios based on domain knowledge
    sampling_ratios = {
        'Brain Class': 0.3,    # More aggressive for very rare class
        'Model Class': 0.3,    # More aggressive for very rare class
        'God Class': 0.2,      # Moderate oversampling
        'Schizofrenic Class': 0.1,  # Less aggressive
        'Data Class': 0.15     # Moderate oversampling
    }
    
    # Convert label combinations for cluster-based sampling
    label_combinations = create_label_combinations(y)
    
    # Calculate sampling strategy based on combinations
    combination_counts = Counter(label_combinations)
    max_count = max(combination_counts.values())
    
    # Adjust sampling strategy based on custom ratios
    sampling_strategy = {}
    for combo in combination_counts.keys():
        # Calculate the target count based on the labels present in this combination
        combo_labels = [int(c) for c in combo]
        ratio_sum = sum(sampling_ratios[col] for col, present in zip(y.columns, combo_labels) if present)
        if ratio_sum > 0:
            target_count = int(max_count * (ratio_sum / len(sampling_ratios)))
            if target_count > combination_counts[combo]:
                sampling_strategy[combo] = target_count
    
    try:
        # Try Cluster-based SMOTE first
        oversample = KMeansSMOTE(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
            cluster_balance_threshold=0.05,
            k_neighbors=min(5, min(combination_counts.values())-1),
            n_jobs=-1
        )
        X_resampled_temp, y_resampled_temp = oversample.fit_resample(X, label_combinations)
    except Exception as e:
        print(f"KMeansSMOTE failed, falling back to RandomOverSampler: {str(e)}")
        # Fallback to RandomOverSampler if SMOTE fails
        oversample = RandomOverSampler(
            sampling_strategy=sampling_strategy,
            random_state=random_state
        )
        X_resampled_temp, y_resampled_temp = oversample.fit_resample(X, label_combinations)
    
    # Convert back to original format
    y_resampled = pd.DataFrame([list(map(int, combo)) for combo in y_resampled_temp],
                              columns=y.columns)
    
    # Validate preservation of label relationships and calculate metrics
    if not validate_label_preservation(y, y_resampled):
        print("\nWarning: Some label relationships may have been altered during oversampling")
    
    return X_resampled_temp, y_resampled

# Load and preprocess data
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy'])

label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

df = df.drop_duplicates()
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print initial label relationships
print("\nAnalyzing initial label relationships...")
analyze_label_relationships(y, "Initial Label Relationships")

# Apply enhanced oversampling
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Use train_test_split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)
)

def print_label_statistics(y, title):
    print(f"\n{title}")
    for column in y.columns:
        print(f"\nValue counts for {column}:")
        print(y[column].value_counts())
            
    print("\nImbalance Ratios:")
    for column in y.columns:
        counts = y[column].value_counts()
        ratio = counts.max() / counts.min()
        print(f"{column}: 1:{ratio:.2f}")

# Print statistics
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(y_resampled, "After Oversampling Statistics:")


Analyzing initial label relationships...

Initial Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN       0.0                0.0   
God Class             0.000677        0.0       NaN                0.0   
Schizofrenic Class         0.0        0.0       0.0                NaN   
Model Class           0.479294        0.0  0.030856           0.000369   

                   Model Class  
Brain Class           0.479294  
Data Class                 0.0  
God Class             0.030856  
Schizofrenic Class    0.000369  
Model Class                NaN  
KMeansSMOTE failed, falling back to RandomOverSampler: The 'k_neighbors' parameter of KMeansSMOTE must be an int in the range [1, inf) or an object implementing 'kneighbors' and 'kneighbors_graph'. Got 0 instead.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN       0.0                0.0   
God Class             0.000677        0.0       NaN                0.0   
Schizofrenic Class         0.0        0.0       0.0                NaN   
Model Class           0.479294        0.0  0.030856           0.000369   

                   Model Class  
Brain Class           0.479294  
Data Class                 0.0  
God Class             0.030856  
Schizofrenic Class    0.000369  
Model Class                NaN  

Original Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN 

**Using Multilabel Random OverSampling insted of SMOTE**

In [10]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency


def analyze_label_relationships(y, title="Label Relationships"):
    """Analyze and print relationships between labels"""
    print(f"\n{title}")
    n_labels = y.shape[1]
    relationships = pd.DataFrame(index=y.columns, columns=y.columns)
    
    for i in range(n_labels):
        for j in range(i+1, n_labels):
            # Create contingency table
            contingency = pd.crosstab(y.iloc[:,i], y.iloc[:,j])
            # Calculate chi-square test
            chi2, p_value, _, _ = chi2_contingency(contingency)
            relationships.iloc[i,j] = p_value
            relationships.iloc[j,i] = p_value
    
    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(relationships)
    return relationships

def create_label_combinations(y):
    """Convert multi-label data into unique combination patterns"""
    return np.array([''.join(map(str, row)) for row in y.astype(int).values])

def validate_label_preservation(original_y, resampled_y, threshold=0.05):
    """
    Validate that label relationships are preserved after resampling
    Returns True if relationships are preserved
    """
    orig_relationships = analyze_label_relationships(original_y, "Original Relationships")
    new_relationships = analyze_label_relationships(resampled_y, "Resampled Relationships")
    
    # Compare relationship strengths
    relationship_preserved = True
    for i in range(len(original_y.columns)):
        for j in range(i+1, len(original_y.columns)):
            orig_sig = orig_relationships.iloc[i,j] < threshold
            new_sig = new_relationships.iloc[i,j] < threshold
            if orig_sig != new_sig:
                print(f"\nWarning: Relationship changed between {original_y.columns[i]} and {original_y.columns[j]}")
                relationship_preserved = False
    
    return relationship_preserved

def ml_ros(X, y, random_state=42):
    """
    Multilabel Random OverSampling
    Preserves label relationships better than SMOTE for multilabel data
    """
    X = pd.DataFrame(X)
    combinations = create_label_combinations(y)
    unique_combinations = np.unique(combinations)
    
    # Get the majority combination count
    combination_counts = Counter(combinations)
    max_count = max(combination_counts.values())
    
    X_resampled = []
    y_resampled = []
    
    for combo in unique_combinations:
        # Get indices for this combination
        indices = np.where(combinations == combo)[0]
        n_samples = len(indices)
        
        # Number of samples needed
        n_needed = max_count - n_samples
        
        # Add original samples
        X_resampled.append(X.iloc[indices])
        y_resampled.append(y.iloc[indices])
        
        if n_needed > 0:
            # Random oversampling with replacement
            resample_idx = np.random.RandomState(random_state).choice(
                indices, size=n_needed, replace=True
            )
            X_resampled.append(X.iloc[resample_idx])
            y_resampled.append(y.iloc[resample_idx])
    
    X_resampled = pd.concat(X_resampled, axis=0)
    y_resampled = pd.concat(y_resampled, axis=0)
    
    return X_resampled.values, y_resampled
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy'])

label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

df = df.drop_duplicates()
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print initial label relationships
print("\nAnalyzing initial label relationships...")
analyze_label_relationships(y, "Initial Label Relationships")

# Apply ML-ROS instead of SMOTE
X_resampled, y_resampled = ml_ros(X_scaled, y)

# Validate preservation of label relationships
validate_label_preservation(y, y_resampled)

# Use train_test_split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)
)

# Print statistics
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(pd.DataFrame(y_resampled, columns=y.columns), 
                      "After Oversampling Statistics:")


Analyzing initial label relationships...

Initial Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN       0.0                0.0   
God Class             0.000677        0.0       NaN                0.0   
Schizofrenic Class         0.0        0.0       0.0                NaN   
Model Class           0.479294        0.0  0.030856           0.000369   

                   Model Class  
Brain Class           0.479294  
Data Class                 0.0  
God Class             0.030856  
Schizofrenic Class    0.000369  
Model Class                NaN  

Original Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677               