In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
import pickle

def analyze_label_relationships(y, title="Label Relationships"):
    """Run a chi-square independence test on every pair of label columns.

    Prints ``title`` and then the table of p-values, and returns that table.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the results.

    Returns:
        A DataFrame indexed on both axes by ``y.columns``. Cell (i, j) and
        its mirror (j, i) hold the p-value for that label pair; the diagonal
        is never written and stays NaN.
    """
    print(f"\n{title}")
    label_names = y.columns
    p_table = pd.DataFrame(index=label_names, columns=label_names)

    # Each unordered pair is tested once (upper triangle) and then mirrored.
    n_labels = y.shape[1]
    upper_pairs = [(a, b) for a in range(n_labels) for b in range(a + 1, n_labels)]
    for a, b in upper_pairs:
        observed = pd.crosstab(y.iloc[:, a], y.iloc[:, b])
        # chi2_contingency returns (statistic, p-value, dof, expected).
        p_value = chi2_contingency(observed)[1]
        p_table.iloc[a, b] = p_value
        p_table.iloc[b, a] = p_value

    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(p_table)
    return p_table

def print_label_statistics(y, title):
    """Print value counts for every label column, then the imbalance ratios.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the statistics.

    The ratio printed for a column is its largest value count divided by its
    smallest value count, formatted as ``1:<ratio>``.
    """
    print(f"\n{title}")

    # Remember each column's tally so the ratio pass can reuse it.
    tallies = []
    for name in y.columns:
        tally = y[name].value_counts()
        tallies.append((name, tally))
        print(f"\nValue counts for {name}:")
        print(tally)

    print("\nImbalance Ratios:")
    for name, tally in tallies:
        largest_over_smallest = tally.max() / tally.min()
        print(f"{name}: 1:{largest_over_smallest:.2f}")

def validate_label_preservation(original_y, resampled_y, threshold=0.05):
    """Check whether resampling kept each label pair's significance status.

    The chi-square p-value tables of both datasets are computed (and printed)
    via ``analyze_label_relationships``. A pair counts as changed when its
    p-value is below ``threshold`` in exactly one of the two datasets.

    Args:
        original_y: Label DataFrame before resampling.
        resampled_y: Label DataFrame after resampling.
        threshold: Significance cutoff applied to the p-values.

    Returns:
        True if no pair changed status, otherwise False. A warning line is
        printed for every changed pair.
    """
    before = analyze_label_relationships(original_y, "Original Relationships")
    after = analyze_label_relationships(resampled_y, "Resampled Relationships")

    names = original_y.columns
    all_preserved = True
    for row, first in enumerate(names):
        for col in range(row + 1, len(names)):
            was_significant = before.iloc[row, col] < threshold
            is_significant = after.iloc[row, col] < threshold
            if was_significant == is_significant:
                continue
            print(f"\nWarning: Relationship changed between {first} and {names[col]}")
            all_preserved = False

    return all_preserved

def get_target_ratio(class_name, minority_count, majority_count):
    """Look up the oversampling target ratio for a label.

    Args:
        class_name: Label name used as the lookup key.
        minority_count: Sample count of the smaller group.
        majority_count: Sample count of the larger group (the divisor).

    Returns:
        ``(target, use_aggressive)`` where ``target`` is the configured ratio
        for ``class_name`` (0.3 when the name has no entry) and
        ``use_aggressive`` is True when ``minority_count / majority_count``
        is strictly below 0.01.
    """
    # Computed first so a zero divisor fails before anything else happens.
    observed_ratio = minority_count / majority_count

    fallback_target = 0.3
    aggressive_cutoff = 0.01

    # Hand-picked minority:majority targets per label.
    per_class_targets = {
        'Brain Class': 0.15,         # about 1:6.67
        'Data Class': 0.4,           # 1:2.5
        'God Class': 0.2,            # 1:5
        'Schizofrenic Class': 0.4,   # 1:2.5
        'Model Class': 0.15,         # about 1:6.67
    }

    needs_aggressive = observed_ratio < aggressive_cutoff
    if class_name in per_class_targets:
        chosen_target = per_class_targets[class_name]
    else:
        chosen_target = fallback_target

    return chosen_target, needs_aggressive

def create_label_combinations(y):
    """Encode each row of a multi-label frame as one pattern string.

    Every value is cast to int and the row's values are concatenated in
    column order, so a row (True, False, True) becomes ``"101"``.

    Returns:
        A NumPy array with one pattern string per row of ``y``.
    """
    int_rows = y.astype(int).to_numpy()
    patterns = []
    for row in int_rows:
        patterns.append(''.join(str(flag) for flag in row))
    return np.array(patterns)

def oversample_multilabel(X, y, random_state=42):
    """Oversample multi-label data by treating each label pattern as a class.

    Each row's labels are collapsed into a pattern string and the resampler
    is run on those patterns. A pattern's target size is the count of the
    most frequent pattern multiplied by the largest ``get_target_ratio``
    value among the labels present in it (boosted by 20% when that helper
    flags the case as aggressive). Patterns already at or above their target
    are left out of the sampling strategy.

    Args:
        X: Feature matrix passed to the resampler.
        y: DataFrame with one 0/1 (or boolean) column per label.
        random_state: Seed forwarded to the resampler.

    Returns:
        ``(X_resampled, y_resampled)``; ``y_resampled`` is a DataFrame with
        the same columns as ``y``, rebuilt from the resampled patterns.
    """
    label_combinations = create_label_combinations(y)
    combination_counts = Counter(label_combinations)
    largest_group = max(combination_counts.values())
    smallest_group = min(combination_counts.values())

    sampling_strategy = {}
    for pattern, n_rows in combination_counts.items():
        flags = [int(ch) for ch in pattern]

        # Take the most demanding ratio among the labels set in this pattern.
        best_ratio = 0
        for label, flag in zip(y.columns, flags):
            if not flag:
                continue
            ratio, aggressive = get_target_ratio(label, n_rows, largest_group)
            if aggressive:
                ratio *= 1.2
            best_ratio = max(best_ratio, ratio)

        desired = int(largest_group * best_ratio)
        if desired > n_rows:
            sampling_strategy[pattern] = desired

    # SMOTE is used only when every pattern has at least 5 rows.
    if smallest_group >= 5:
        sampler = SMOTE(
            random_state=random_state,
            k_neighbors=min(smallest_group - 1, 5),
            sampling_strategy=sampling_strategy,
        )
        print("\nUSING SMOTE")
    else:
        sampler = RandomOverSampler(
            random_state=random_state,
            sampling_strategy=sampling_strategy,
        )
        print("\nUSING RANDOMOVERSAMPLER")

    X_resampled_temp, resampled_patterns = sampler.fit_resample(X, label_combinations)

    # Expand each pattern string back into one integer column per label.
    decoded_rows = [[int(ch) for ch in pattern] for pattern in resampled_patterns]
    y_resampled = pd.DataFrame(decoded_rows, columns=y.columns)

    return X_resampled_temp, y_resampled
# ---------------------------------------------------------------------------
# Pipeline for this cell: load the class-smell CSV, standardize the features,
# oversample by label combination, save the resampled data, then split it.
#
# NOTE(review): the scaler is fit and oversample_multilabel() runs on the full
# dataset BEFORE train_test_split. The test split is therefore drawn from the
# resampled data and the scaler has seen those rows. If X_test is meant to be
# an untouched hold-out set, split first and fit/oversample on the training
# part only. Confirm the intent before changing this.
# ---------------------------------------------------------------------------

# Load the raw data and drop the columns this experiment does not use
# ('CRIX' and the 'Model Class' label are excluded in this cell)
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy','CRIX','Model Class'])

# Columns treated as labels; every remaining column is used as a feature
label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 
]

# Remove exact duplicate rows (features and labels together)
df = df.drop_duplicates()

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features (fit on every row; see the review note at the top)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Persist the fitted scaler to disk
with open('Myscaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Print the pairwise chi-square p-values of the original labels
print("\nAnalyzing initial label relationships...")
analyze_label_relationships(y, "Initial Label Relationships")

# Oversample on the scaled features, one class per label combination
print("\nPerforming controlled oversampling...")
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Save the resampled labels and features; the feature array is wrapped in a
# DataFrame so the original column names are written as the CSV header
y_resampled.to_csv('Y_resampled_no_crix.csv', index=False)
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
X_resampled_df.to_csv('X_resampled_no_crix.csv', index=False)


# 80/20 split of the resampled data, stratified on the label-combination
# pattern of each row
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)
)
# Print split sizes
print(f"\nFinal dataset sizes:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")



# Print per-label statistics before and after oversampling
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(y_resampled, "After Oversampling Statistics:")


Analyzing initial label relationships...

Initial Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class
Brain Class                NaN        0.0  0.000676                0.0
Data Class                 0.0        NaN       0.0                0.0
God Class             0.000676        0.0       NaN                0.0
Schizofrenic Class         0.0        0.0       0.0                NaN

Performing controlled oversampling...

USING SMOTE





Final dataset sizes:
Training set: 577469 samples
Test set: 144368 samples

Original Dataset Statistics:

Value counts for Brain Class:
Brain Class
False    238482
True        834
Name: count, dtype: int64

Value counts for Data Class:
Data Class
False    220930
True      18386
Name: count, dtype: int64

Value counts for God Class:
God Class
False    235780
True       3536
Name: count, dtype: int64

Value counts for Schizofrenic Class:
Schizofrenic Class
False    219993
True      19323
Name: count, dtype: int64

Imbalance Ratios:
Brain Class: 1:285.95
Data Class: 1:12.02
God Class: 1:66.68
Schizofrenic Class: 1:11.39

After Oversampling Statistics:

Value counts for Brain Class:
Brain Class
0    590232
1    131605
Name: count, dtype: int64

Value counts for Data Class:
Data Class
0    546363
1    175474
Name: count, dtype: int64

Value counts for God Class:
God Class
0    586244
1    135593
Name: count, dtype: int64

Value counts for Schizofrenic Class:
Schizofrenic Class
1    366900


**Understanding label relationships in more depth**

In [8]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

def analyze_label_relationships(y, title="Label Relationships"):
    """Report pairwise correlations and chi-square p-values for the labels.

    Prints ``title``, the correlation matrix from ``y.corr()``, and the table
    of chi-square p-values for every pair of label columns.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the results.

    Returns:
        ``(relationships, correlations)``: the p-value table (symmetric, NaN
        on the diagonal) and the correlation matrix.
    """
    print(f"\n{title}")
    label_names = y.columns
    p_table = pd.DataFrame(index=label_names, columns=label_names)

    # Correlations are reported alongside the p-values for comparison.
    corr_matrix = y.corr()
    print("\nLabel Correlations:")
    print(corr_matrix)

    # Each unordered pair is tested once (upper triangle) and then mirrored.
    n_labels = y.shape[1]
    upper_pairs = [(a, b) for a in range(n_labels) for b in range(a + 1, n_labels)]
    for a, b in upper_pairs:
        observed = pd.crosstab(y.iloc[:, a], y.iloc[:, b])
        # chi2_contingency returns (statistic, p-value, dof, expected).
        p_value = chi2_contingency(observed)[1]
        p_table.iloc[a, b] = p_value
        p_table.iloc[b, a] = p_value

    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(p_table)
    return p_table, corr_matrix

def create_label_combinations(y):
    """Collapse each row of label flags into a single pattern string.

    Values are cast to int and joined in column order, so a row
    (0, 1, 1) yields ``"011"``.

    Returns:
        A NumPy array holding one pattern string per row of ``y``.
    """
    as_ints = y.astype(int).values
    encoded = ["".join(str(bit) for bit in row) for row in as_ints]
    return np.array(encoded)

def measure_relationship_preservation(original_y, resampled_y):
    """Score how closely the resampled label correlations match the originals.

    Both correlation matrices are reduced to absolute values, so only the
    strength of each correlation is compared, not its sign. The score is one
    minus the mean absolute difference over all cells of the matrix,
    diagonal included.

    Args:
        original_y: Label DataFrame before resampling.
        resampled_y: Label DataFrame after resampling.

    Returns:
        The preservation score (1 means the absolute correlations are
        identical). The score is also printed.
    """
    strength_before = original_y.corr().abs()
    strength_after = resampled_y.corr().abs()

    gap = (strength_before - strength_after).abs()
    n_cells = strength_before.shape[0] * strength_before.shape[1]
    mean_gap = gap.sum().sum() / n_cells

    preservation_score = 1 - mean_gap
    print(f"\nRelationship Preservation Score: {preservation_score:.4f}")
    return preservation_score

def validate_label_preservation(original_y, resampled_y, threshold=0.05, correlation_threshold=0.1):
    """Check whether resampling preserved the pairwise label relationships.

    A label pair is reported as changed when either
      * its chi-square p-value is below ``threshold`` in exactly one of the
        two datasets, or
      * its correlation moved by more than ``correlation_threshold``.

    Args:
        original_y: Label DataFrame before resampling.
        resampled_y: Label DataFrame after resampling.
        threshold: Significance cutoff applied to the p-values.
        correlation_threshold: Largest tolerated absolute correlation change.

    Returns:
        ``(relationship_preserved, preservation_score)``: whether no pair was
        reported as changed, and the score from
        ``measure_relationship_preservation``.
    """
    p_before, corr_before = analyze_label_relationships(original_y, "Original Relationships")
    p_after, corr_after = analyze_label_relationships(resampled_y, "Resampled Relationships")

    names = original_y.columns
    messages = []
    for row in range(len(names)):
        for col in range(row + 1, len(names)):
            was_significant = p_before.iloc[row, col] < threshold
            is_significant = p_after.iloc[row, col] < threshold
            corr_change = abs(corr_before.iloc[row, col] - corr_after.iloc[row, col])

            if was_significant != is_significant or corr_change > correlation_threshold:
                messages.append(
                    f"Warning: Relationship changed between {names[row]} and {names[col]}"
                    f"\nCorrelation change: {corr_change:.4f}"
                )

    # Any collected warning means at least one relationship changed.
    relationship_preserved = not messages
    if messages:
        print("\n" + "\n".join(messages))

    preservation_score = measure_relationship_preservation(original_y, resampled_y)

    return relationship_preserved, preservation_score

def improved_oversample_multilabel(X, y, max_ratio=10, min_samples_smote=5, random_state=42):
    """Oversample multi-label data in five progressive stages.

    Each distinct label pattern is treated as one class. Its final target is
    the median pattern count, capped at ``max_ratio`` times its original
    count and never below that original count. The targets are approached
    over five evenly spaced stage sizes, and after each stage the label
    relationships are validated against the original labels.

    Args:
        X: Feature matrix; must support ``.copy()``.
        y: DataFrame with one 0/1 (or boolean) column per label.
        max_ratio: Upper bound on the growth factor of any pattern.
        min_samples_smote: Smallest pattern size at which SMOTE is tried;
            below it RandomOverSampler is used for the stage.
        random_state: Seed forwarded to the resamplers.

    Returns:
        ``(X_resampled, y_resampled)``; ``y_resampled`` is a DataFrame with
        the same columns as ``y``.
    """

    def _random_oversample(features, patterns, strategy):
        # Duplicate existing rows until every pattern reaches its target.
        sampler = RandomOverSampler(
            random_state=random_state,
            sampling_strategy=strategy
        )
        return sampler.fit_resample(features, patterns)

    def _to_label_frame(patterns):
        # Expand pattern strings back into one integer column per label.
        return pd.DataFrame([[int(ch) for ch in pattern] for pattern in patterns],
                            columns=y.columns)

    label_combinations = create_label_combinations(y)
    combination_counts = Counter(label_combinations)

    median_count = int(np.median(list(combination_counts.values())))

    # Final target per pattern: the median, capped at max_ratio x original,
    # and never below the original count.
    sampling_strategy = {}
    for pattern, n_original in combination_counts.items():
        capped_target = min(median_count, int(max_ratio * n_original))
        sampling_strategy[pattern] = int(max(n_original, capped_target))

    print("\nInitial sampling strategy:")
    for pattern, target in sampling_strategy.items():
        print(f"Combination {pattern}: {target} samples (original: {combination_counts[pattern]})")

    X_current = X.copy()
    y_current = label_combinations.copy()

    # Five evenly spaced stage sizes between the smallest and largest target.
    final_targets = sampling_strategy.values()
    stages = np.linspace(min(final_targets), max(final_targets), num=5)
    n_stages = len(stages)

    for stage_number, target_size in enumerate(stages, start=1):
        print(f"\nOversampling stage {stage_number}/{n_stages}")

        # Per-pattern goal for this stage, truncated to an integer.
        current_strategy = {}
        for pattern, n_original in combination_counts.items():
            stage_cap = min(sampling_strategy[pattern], target_size)
            current_strategy[pattern] = int(max(n_original, stage_cap))

        print("\nCurrent stage strategy:")
        for pattern, goal in current_strategy.items():
            print(f"Combination {pattern}: {goal} samples")

        smallest_group = min(Counter(y_current).values())

        if smallest_group < min_samples_smote:
            print("Using RandomOverSampler due to insufficient samples")
            X_current, y_current = _random_oversample(X_current, y_current, current_strategy)
        else:
            try:
                smote = SMOTE(
                    random_state=random_state,
                    k_neighbors=min(smallest_group - 1, 5),
                    sampling_strategy=current_strategy
                )
                X_current, y_current = smote.fit_resample(X_current, y_current)
                print("Used SMOTE for this stage")
            except (ValueError, RuntimeError) as e:
                print(f"SMOTE failed: {str(e)}")
                print("Falling back to RandomOverSampler")
                X_current, y_current = _random_oversample(X_current, y_current, current_strategy)

        # Compare the intermediate labels with the originals after each stage.
        y_temp = _to_label_frame(y_current)
        print(f"Current samples: {len(y_current)}")
        _, preservation_score = validate_label_preservation(y, y_temp)

        if preservation_score < 0.7:
            print(f"Warning: Low preservation score ({preservation_score:.4f}) at stage {stage_number}")

    y_resampled = _to_label_frame(y_current)

    return X_current, y_resampled

def print_label_statistics(y, title):
    """Print per-label and pairwise co-occurrence statistics.

    For every label column this prints its value counts, its imbalance ratio
    (largest value count divided by smallest) and the percentage of rows
    where the label equals 1. It then prints, for every pair of labels, how
    many rows have both labels equal to 1 and that count as a percentage of
    all rows.

    Positives are counted with ``== 1`` in both sections, so boolean columns
    and 0/1 integer columns are handled the same way.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the statistics.
    """
    print(f"\n{title}")
    total = len(y)

    # Individual label statistics
    for column in y.columns:
        counts = y[column].value_counts()
        print(f"\nValue counts for {column}:")
        print(counts)

        ratio = counts.max() / counts.min()
        # Count positives directly rather than looking up key 1 in the
        # value-count index, which depends on how the index dtype matches 1.
        positives = (y[column] == 1).sum()
        # An empty frame has no rows to divide by; report 0% instead.
        percentage = positives / total * 100 if total else 0.0

        print(f"Imbalance Ratio: 1:{ratio:.2f}")
        print(f"Positive class percentage: {percentage:.2f}%")

    # Co-occurrence statistics
    print("\nLabel co-occurrence counts:")
    for i, col1 in enumerate(y.columns):
        for col2 in y.columns[i + 1:]:
            co_occurrence = ((y[col1] == 1) & (y[col2] == 1)).sum()
            share = (co_occurrence / total) * 100 if total else 0.0
            print(f"{col1} + {col2}: {co_occurrence} ({share:.2f}%)")

# ---------------------------------------------------------------------------
# Pipeline for this cell: load the class-smell CSV, standardize the features,
# print the label-combination counts, run the staged oversampler, print the
# final statistics, then split the resampled data.
#
# NOTE(review): as written, the scaler is fit and the oversampler runs on the
# full dataset BEFORE train_test_split, so the test split is drawn from the
# resampled data. If X_test is meant to be an untouched hold-out set, split
# first and fit/oversample on the training part only. Confirm the intent.
# ---------------------------------------------------------------------------

# Load the raw data and drop the columns this experiment does not use
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 
                         'Futile Abstract Pipeline', 'Futile Hierarchy'])

# Columns treated as labels ('Model Class' is included in this cell); every
# remaining column is used as a feature
label_columns = [
        'Brain Class', 'Data Class', 
        'God Class', 'Schizofrenic Class', 'Model Class'
    ]

# Remove exact duplicate rows (features and labels together)
df = df.drop_duplicates()

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features (fit on every row; see the review note at the top)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print how many rows carry each label combination before oversampling,
# most frequent combination first
initial_combinations = create_label_combinations(y)
initial_counts = Counter(initial_combinations)
print("\nInitial combination counts:")
for combo, count in initial_counts.most_common():
    print(f"Combination {combo}: {count}")

# Apply the staged oversampler; max_ratio=5 overrides the function default
print("\nApplying improved oversampling...")
X_resampled, y_resampled = improved_oversample_multilabel(
        X_scaled, 
        y,
        max_ratio=5,
        min_samples_smote=5,
        random_state=42
    )

# Print per-label and co-occurrence statistics of the resampled labels
print("\nAnalyzing final dataset...")
print_label_statistics(y_resampled, "Final Dataset Statistics")

# 80/20 split of the resampled data, stratified on the label-combination
# pattern of each row
X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, 
        y_resampled,
        test_size=0.2,
        random_state=42,
        stratify=create_label_combinations(y_resampled)
    )

# Report the (rows, columns) shape of each split
print("\nTrain/Test split sizes:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

   


Initial combination counts:
Combination 00001: 199085
Combination 01001: 17443
Combination 00011: 17145
Combination 00101: 2504
Combination 00111: 1032
Combination 01011: 942
Combination 10001: 641
Combination 00000: 363
Combination 10011: 193
Combination 00010: 11
Combination 01000: 1

Applying improved oversampling...

Initial sampling strategy:
Combination 00001: 199085 samples (original: 199085)
Combination 00011: 17145 samples (original: 17145)
Combination 10011: 942 samples (original: 193)
Combination 00111: 1032 samples (original: 1032)
Combination 00101: 2504 samples (original: 2504)
Combination 01001: 17443 samples (original: 17443)
Combination 01011: 942 samples (original: 942)
Combination 00000: 942 samples (original: 363)
Combination 10001: 942 samples (original: 641)
Combination 00010: 55 samples (original: 11)
Combination 01000: 5 samples (original: 1)

Oversampling stage 1/5

Current stage strategy:
Combination 00001: 199085 samples
Combination 00011: 17145 samples
Comb