In [15]:
import pandas as pd
from sklearn.utils import resample

def clean_tirads(labels):
    """Normalise raw TIRADS labels.

    Parameters
    ----------
    labels : pandas.Series
        Raw label column as read from the CSV.

    Returns
    -------
    pandas.Series
        Lower-cased, whitespace-stripped string labels in which missing
        values have been mapped to the category "0".
    """
    cleaned = labels.astype(str).str.lower().str.strip()
    # On object columns astype(str) renders a missing value as the text "nan".
    cleaned = cleaned.replace("nan", "0")
    # String-typed columns may keep missing values as NA through the cast
    # instead of producing "nan", so fill whatever is still missing as well.
    return cleaned.fillna("0")


def classify_tirads(labels, benign_classes):
    """Return "benign" for labels found in ``benign_classes``, else "malignant".

    Any label that is not listed as benign (including unexpected values) is
    reported as "malignant".
    """
    return labels.isin(benign_classes).map({True: "benign", False: "malignant"})


def oversample_to_targets(df, target_counts, label_col="tirads", random_state=42):
    """Oversample under-represented categories up to their target size.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``label_col``.
    target_counts : dict
        Maps a category label to the minimum number of rows wanted for it.
    label_col : str
        Column holding the category label.
    random_state : int
        Seed used when drawing the additional rows of every category.

    Returns
    -------
    pandas.DataFrame
        The concatenation, in ``target_counts`` order, of each category's rows.
        A category below its target keeps ALL of its original rows and gains
        ``target - len(rows)`` extra rows drawn with replacement (these are
        exact duplicates of existing rows). A category at or above its target
        is passed through unchanged (no down-sampling). A category with no
        rows at all stays empty, since there is nothing to duplicate.

    Notes
    -----
    Rows whose label is not a key of ``target_counts`` are not part of the
    result.
    """
    balanced_parts = []
    for label, target in target_counts.items():
        subset = df[df[label_col] == label]
        shortfall = target - len(subset)

        # Only top up when rows are missing AND there is something to copy.
        if shortfall > 0 and len(subset) > 0:
            extra = subset.sample(n=shortfall, replace=True, random_state=random_state)
            subset = pd.concat([subset, extra])

        balanced_parts.append(subset)

    if not balanced_parts:
        # pd.concat refuses an empty list; return an empty frame with the same columns.
        return df.iloc[0:0]
    return pd.concat(balanced_parts)


# __name__ is "__main__" inside a Jupyter kernel, so everything below runs as
# usual when the cell is executed; the guard only keeps the file I/O from
# running if the helpers above are imported from somewhere else.
if __name__ == "__main__":
    # Load the dataset
    file_path = "classified_extracted_data.csv"  # Change to your actual file path
    df = pd.read_csv(file_path)

    # Ensure 'tirads' is a clean string column; missing values become "0"
    df["tirads"] = clean_tirads(df["tirads"])

    # Define classification rules
    benign_classes = ["0", "1", "2", "3"]
    df["classification"] = classify_tirads(df["tirads"], benign_classes)

    # Define target counts to balance but not equalize.
    # NOTE: "1" is listed as benign above but has no target here, so rows
    # labelled "1" (and any other label not listed below) are left out of
    # the balanced dataset.
    target_counts = {
        "0": 92,   # No augmentation
        "2": 80,   # Increase but not equal to 92
        "3": 75,   # Increase but not equal to 92
        "4a": 85,  # No augmentation
        "4b": 80,  # Increase but not equal to 85
        "4c": 75,  # Increase but not equal to 85
        "5": 70    # Increase but not equal to 85
    }

    # Top up each under-represented TIRADS category with duplicated rows.
    # NOTE(review): if this file is later split into train/test sets, the
    # duplicates can land on both sides of the split - confirm the split
    # happens on the un-augmented data.
    balanced_df = oversample_to_targets(df, target_counts)

    # Recalculate classification counts
    balanced_classification_counts = balanced_df["classification"].value_counts()

    # Save the balanced dataset
    balanced_output_path = "balanced_tirads_data.csv"
    balanced_df.to_csv(balanced_output_path, index=False)

    # Print the new balanced counts
    print("Balanced TIRADS Counts:")
    print(balanced_df["tirads"].value_counts().sort_index())

    print("\nBalanced Classification Results:")
    for classification, count in balanced_classification_counts.items():
        print(f"{classification.capitalize()}: {count} cases")

    print(f"\nBalanced dataset saved to: {balanced_output_path}")


Balanced TIRADS Counts:
0     92
2     80
3     75
4a    85
4b    80
4c    75
5     70
Name: tirads, dtype: int64

Balanced Classification Results:
Malignant: 310 cases
Benign: 247 cases

Balanced dataset saved to: balanced_tirads_data.csv
