In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

def analyze_label_relationships(y, title="Label Relationships"):
    """Print and return pairwise chi-square p-values between label columns.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the matrix.

    Returns:
        A DataFrame indexed and keyed by ``y.columns``. Cell ``(a, b)`` holds
        the p-value of ``chi2_contingency`` on the crosstab of columns ``a``
        and ``b``; the matrix is symmetric and the diagonal is left unset
        (NaN).
    """
    print(f"\n{title}")
    label_names = y.columns
    label_total = y.shape[1]
    p_matrix = pd.DataFrame(index=label_names, columns=label_names)

    for first in range(label_total):
        first_values = y.iloc[:, first]
        for second in range(first + 1, label_total):
            # Only the upper triangle is tested; the result is mirrored.
            table = pd.crosstab(first_values, y.iloc[:, second])
            p_value = chi2_contingency(table)[1]
            p_matrix.iloc[first, second] = p_value
            p_matrix.iloc[second, first] = p_value

    print("\nLabel correlation p-values (lower = stronger relationship):")
    print(p_matrix)
    return p_matrix

def print_label_statistics(y, title):
    """Print value counts and an imbalance ratio for every label column.

    Args:
        y: DataFrame with one column per label.
        title: Heading printed before the statistics.

    The imbalance ratio of a column is its most frequent value's count
    divided by its least frequent value's count, printed as ``1:<ratio>``.
    """
    print(f"\n{title}")

    tallies = []
    for label in y.columns:
        frequencies = y[label].value_counts()
        print(f"\nValue counts for {label}:")
        print(frequencies)
        tallies.append((label, frequencies))

    print("\nImbalance Ratios:")
    for label, frequencies in tallies:
        imbalance = frequencies.max() / frequencies.min()
        print(f"{label}: 1:{imbalance:.2f}")

def validate_label_preservation(original_y, resampled_y, threshold=0.05):
    """Check whether resampling kept each label pair's significance status.

    Both label frames are passed through ``analyze_label_relationships``
    (which prints its matrices). A pair counts as changed when its p-value
    is below ``threshold`` in one frame but not in the other.

    Args:
        original_y: Label DataFrame before resampling.
        resampled_y: Label DataFrame after resampling.
        threshold: Significance cutoff applied to the p-values.

    Returns:
        True if no label pair changed status, otherwise False. A warning is
        printed for every pair that changed.
    """
    before = analyze_label_relationships(original_y, "Original Relationships")
    after = analyze_label_relationships(resampled_y, "Resampled Relationships")

    labels = original_y.columns
    label_total = len(labels)
    changed_pairs = 0
    for row in range(label_total):
        for col in range(row + 1, label_total):
            was_significant = before.iloc[row, col] < threshold
            is_significant = after.iloc[row, col] < threshold
            if was_significant == is_significant:
                continue
            print(f"\nWarning: Relationship changed between {labels[row]} and {labels[col]}")
            changed_pairs += 1

    return changed_pairs == 0

# Default minority-to-majority target ratio per label. The values are the
# ones the original author chose ("domain knowledge and original imbalance").
_DEFAULT_RATIO_TARGETS = {
    'Brain Class': 0.15,
    'Data Class': 0.4,
    'God Class': 0.2,
    'Schizofrenic Class': 0.4,
    'Model Class': 0.15,
}


def get_target_ratio(class_name, minority_count, majority_count,
                     ratio_targets=None, default_target=0.3,
                     aggressive_threshold=0.01):
    """Look up the target ratio for a label and flag extreme imbalance.

    Args:
        class_name: Label name used as the key into the ratio table.
        minority_count: Number of samples in the smaller group.
        majority_count: Number of samples in the larger group; must be
            non-zero.
        ratio_targets: Optional mapping of label name to target ratio.
            Defaults to the built-in table for the five class-smell labels.
        default_target: Ratio returned for labels missing from the table.
        aggressive_threshold: ``minority_count / majority_count`` strictly
            below this value marks the case as needing aggressive sampling.

    Returns:
        A ``(target_ratio, use_aggressive)`` tuple.

    Raises:
        ZeroDivisionError: If ``majority_count`` is 0.
    """
    targets = _DEFAULT_RATIO_TARGETS if ratio_targets is None else ratio_targets
    target = targets.get(class_name, default_target)

    # Extremely imbalanced cases are flagged so the caller can boost them.
    current_ratio = minority_count / majority_count
    use_aggressive = current_ratio < aggressive_threshold

    return target, use_aggressive

def create_label_combinations(y):
    """Encode each row of labels as one string of concatenated integers.

    Every value is cast to ``int`` and the digits of a row are joined without
    a separator, e.g. a row ``(True, False, True)`` becomes ``"101"``.

    Args:
        y: Label DataFrame (one column per label).

    Returns:
        A NumPy array with one combination string per row of ``y``.
    """
    int_rows = y.astype(int).values
    keys = []
    for row in int_rows:
        keys.append(''.join(str(value) for value in row))
    return np.array(keys)

def oversample_multilabel(X, y, random_state=42):
    """Oversample a multi-label dataset by treating label combinations as classes.

    Each row's labels are collapsed into one combination string, a per-
    combination target size is derived from ``get_target_ratio``, and the
    rows are resampled with SMOTE (or RandomOverSampler when the rarest
    combination has fewer than 5 rows).

    Args:
        X: Feature matrix accepted by the sampler's ``fit_resample``.
        y: Label DataFrame with one column per label.
        random_state: Seed forwarded to the sampler.

    Returns:
        ``(X_resampled, y_resampled)`` where ``y_resampled`` is a DataFrame
        with the same columns as ``y``, decoded from the combination strings.
    """
    combo_keys = create_label_combinations(y)
    rows_per_combo = Counter(combo_keys)
    largest = max(rows_per_combo.values())
    smallest = min(rows_per_combo.values())

    # Build the {combination: desired row count} mapping for the sampler.
    strategy = {}
    for combo, row_count in rows_per_combo.items():
        best_ratio = 0
        for label, flag in zip(y.columns, combo):
            if not int(flag):
                continue
            # The combination's own count is compared against the largest
            # combination's count to decide how strongly to boost it.
            ratio, aggressive = get_target_ratio(label, row_count, largest)
            if aggressive:
                ratio *= 1.2
            best_ratio = max(best_ratio, ratio)

        desired = int(largest * best_ratio)
        # Only combinations that need to grow are listed.
        if desired > row_count:
            strategy[combo] = desired

    if smallest < 5:
        # Too few rows in the rarest combination: duplicate rows instead of
        # interpolating.
        sampler = RandomOverSampler(
            random_state=random_state,
            sampling_strategy=strategy,
        )
    else:
        sampler = SMOTE(
            random_state=random_state,
            k_neighbors=min(smallest - 1, 5),
            sampling_strategy=strategy,
        )

    X_out, combo_out = sampler.fit_resample(X, combo_keys)

    # Decode every combination string back into one integer per label column.
    decoded_rows = [[int(digit) for digit in combo] for combo in combo_out]
    y_out = pd.DataFrame(decoded_rows, columns=y.columns)

    return X_out, y_out
# ---------------------------------------------------------------------------
# Script section: load the data, oversample it, write the resampled data to
# CSV, split into train/test sets and print summary statistics.
# ---------------------------------------------------------------------------

# Load the dataset and drop the columns that are not used below
df = pd.read_csv('Data/New-Class-smell.csv')
df = df.drop(columns=['Address', 'Hierarchy Duplication', 'Futile Abstract Pipeline', 'Futile Hierarchy'])

# Columns treated as labels; every remaining column is treated as a feature
label_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

# Remove exact duplicate rows
df = df.drop_duplicates()

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Standardize features
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print pairwise label relationships of the original data
print("\nAnalyzing initial label relationships...")
analyze_label_relationships(y, "Initial Label Relationships")

# Apply oversampling on the label combinations
# NOTE(review): oversampling runs before train_test_split, so the test split
# is drawn from resampled rows — confirm this is intended.
print("\nPerforming  oversampling...")
X_resampled, y_resampled = oversample_multilabel(X_scaled, y)

# Persist the resampled labels and features to the working directory
y_resampled.to_csv('y_resampled.csv', index=False)
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
X_resampled_df.to_csv('X_resampled.csv', index=False)
# 80/20 split, stratified on the label-combination string of each row
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=create_label_combinations(y_resampled)
)
# Print split sizes
print(f"\nFinal dataset sizes:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")



# Print per-label statistics before and after oversampling
print_label_statistics(y, "Original Dataset Statistics:")
print_label_statistics(y_resampled, "After Oversampling Statistics:")


Analyzing initial label relationships...

Initial Label Relationships

Label correlation p-values (lower = stronger relationship):
                   Brain Class Data Class God Class Schizofrenic Class  \
Brain Class                NaN        0.0  0.000677                0.0   
Data Class                 0.0        NaN       0.0                0.0   
God Class             0.000677        0.0       NaN                0.0   
Schizofrenic Class         0.0        0.0       0.0                NaN   
Model Class           0.479294        0.0  0.030856           0.000369   

                   Model Class  
Brain Class           0.479294  
Data Class                 0.0  
God Class             0.030856  
Schizofrenic Class    0.000369  
Model Class                NaN  

Performing controlled oversampling...

Final dataset sizes:
Training set: 729734 samples
Test set: 182434 samples

Original Dataset Statistics:

Value counts for Brain Class:
Brain Class
False    238526
True        834
Name: