In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('data.csv')
print(f"Original dataset shape: {df.shape}")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")

Loading dataset...
Original dataset shape: (4000, 34)
Total rows: 4000
Total columns: 34


In [7]:
df.head(1).to_string()

'   age gender   bmi smoking_status alcohol_consumption exercise_level   diet_type sun_exposure income_level latitude_region  vitamin_a_percent_rda  vitamin_c_percent_rda  vitamin_d_percent_rda  vitamin_e_percent_rda  vitamin_b12_percent_rda  folate_percent_rda  calcium_percent_rda  iron_percent_rda  hemoglobin_g_dl  serum_vitamin_d_ng_ml  serum_vitamin_b12_pg_ml  serum_folate_ng_ml  symptoms_count symptoms_list  has_night_blindness  has_fatigue  has_bleeding_gums  has_bone_pain  has_muscle_weakness  has_numbness_tingling  has_memory_problems  has_pale_skin disease_diagnosis  has_multiple_deficiencies\n0   79   Male  24.8         Former                 NaN         Active  Vegetarian         High         High             Mid                  119.1                  147.3                 152.88                   97.5                    102.5               188.9                108.3              97.4             13.5                   39.3                    356.4                23.3      

In [3]:
pd.read_csv('data.csv')['diet_type'].value_counts()

diet_type
Vegan          1017
Vegetarian      999
Omnivore        993
Pescatarian     991
Name: count, dtype: int64

In [None]:
pd.read_csv('data.csv')['disease_diagnosis'].value_counts()

disease_diagnosis
Healthy                 1509
Anemia                  1245
Rickets_Osteomalacia    1029
Night_Blindness          122
Scurvy                    95
Name: count, dtype: int64

In [4]:
# Check for duplicates
print("\n=== Checking for Duplicates ===")
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    print("\nDuplicate rows found:")
    print(df[df.duplicated()].head())
    print(f"\nRemoving {duplicate_count} duplicate rows...")
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")
else:
    print("No duplicate rows found.")


=== Checking for Duplicates ===
Number of duplicate rows: 0
No duplicate rows found.


In [5]:
# Analyze whole dataset for median and range for all attributes
print("\n=== Analyzing Dataset for Default Values ===")
print(f"Dataset shape: {df.shape}")

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Exclude target column if present
target_col = 'disease_diagnosis'
if target_col in numeric_cols:
    numeric_cols.remove(target_col)
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

print(f"\nNumeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")


=== Analyzing Dataset for Default Values ===
Dataset shape: (4000, 34)

Numeric columns: 24
Categorical columns: 9


In [6]:
# Calculate median, min, max, and unique value count for numeric columns
print("\n=== Numeric Attributes - Median, Range, and Unique Values ===")
numeric_stats = []

for col in numeric_cols:
    median_val = df[col].median()
    min_val = df[col].min()
    max_val = df[col].max()
    unique_count = df[col].nunique()
    total_count = len(df[col])
    unique_ratio = unique_count / total_count if total_count > 0 else 0
    
    # Flag as potentially categorical if unique values are few
    is_potentially_categorical = unique_count <= 10 or unique_ratio < 0.1
    category_flag = " (CATEGORICAL?)" if is_potentially_categorical else ""
    
    numeric_stats.append({
        'Attribute': col,
        'Median': median_val,
        'Min': min_val,
        'Max': max_val,
        'Unique Values': unique_count,
        'Total Values': total_count,
        'Unique Ratio': unique_ratio,
        'Potentially Categorical': is_potentially_categorical,
        'Range': f"[{min_val:.2f}, {max_val:.2f}]"
    })
    print(f"{col:30s} | Median: {median_val:10.2f} | Range: [{min_val:8.2f}, {max_val:8.2f}] | Unique: {unique_count:4d} ({unique_ratio*100:5.1f}%){category_flag}")

numeric_stats_df = pd.DataFrame(numeric_stats)
print("\nNumeric Statistics DataFrame:")
print(numeric_stats_df.to_string(index=False))


=== Numeric Attributes - Median, Range, and Unique Values ===
age                            | Median:      51.00 | Range: [   18.00,    84.00] | Unique:   67 (  1.7%) (CATEGORICAL?)
bmi                            | Median:      26.20 | Range: [   15.00,    45.00] | Unique:  251 (  6.3%) (CATEGORICAL?)
vitamin_a_percent_rda          | Median:      85.50 | Range: [   10.00,   219.00] | Unique: 1364 ( 34.1%)
vitamin_c_percent_rda          | Median:      83.50 | Range: [   10.00,   250.00] | Unique: 1381 ( 34.5%)
vitamin_d_percent_rda          | Median:      62.27 | Range: [    7.00,   275.60] | Unique: 2312 ( 57.8%)
vitamin_e_percent_rda          | Median:      84.05 | Range: [   10.00,   237.60] | Unique: 1373 ( 34.3%)
vitamin_b12_percent_rda        | Median:      55.60 | Range: [   10.00,   243.60] | Unique: 1273 ( 31.8%)
folate_percent_rda             | Median:      84.80 | Range: [   10.00,   226.60] | Unique: 1399 ( 35.0%)
calcium_percent_rda            | Median:      77.10 | Range

In [7]:
# Identify fake numeric columns (numeric but actually categorical)
print("\n=== Identifying Fake Numeric Columns (Actually Categorical) ===")
fake_numeric_cols = numeric_stats_df[numeric_stats_df['Potentially Categorical'] == True]['Attribute'].tolist()

if fake_numeric_cols:
    print(f"\nFound {len(fake_numeric_cols)} potentially categorical numeric columns:")
    for col in fake_numeric_cols:
        unique_vals = sorted(df[col].unique().tolist())
        print(f"\n{col}:")
        print(f"  Unique values ({len(unique_vals)}): {unique_vals}")
        print(f"  Value counts:")
        value_counts = df[col].value_counts().sort_index()
        for val, count in value_counts.items():
            print(f"    {val}: {count}")
else:
    print("No fake numeric columns found (all numeric columns have many unique values).")

# Get mode (most common value) for categorical columns
print("\n=== Categorical Attributes - Mode (Most Common Value) ===")
categorical_stats = []

for col in categorical_cols:
    mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else df[col].iloc[0]
    unique_vals = df[col].unique().tolist()
    categorical_stats.append({
        'Attribute': col,
        'Mode (Default)': mode_val,
        'Unique Values': unique_vals,
        'Count': len(unique_vals)
    })
    print(f"{col:30s} | Mode: {str(mode_val):20s} | Unique Values: {len(unique_vals)}")

categorical_stats_df = pd.DataFrame(categorical_stats)
print("\nCategorical Statistics DataFrame:")
print(categorical_stats_df.to_string(index=False))


=== Identifying Fake Numeric Columns (Actually Categorical) ===

Found 14 potentially categorical numeric columns:

age:
  Unique values (67): [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]
  Value counts:
    18: 79
    19: 68
    20: 51
    21: 62
    22: 50
    23: 61
    24: 60
    25: 59
    26: 52
    27: 56
    28: 54
    29: 63
    30: 73
    31: 55
    32: 69
    33: 53
    34: 59
    35: 47
    36: 63
    37: 59
    38: 51
    39: 66
    40: 63
    41: 59
    42: 53
    43: 70
    44: 52
    45: 67
    46: 71
    47: 63
    48: 61
    49: 66
    50: 60
    51: 58
    52: 61
    53: 55
    54: 63
    55: 61
    56: 55
    57: 73
    58: 60
    59: 75
    60: 62
    61: 50
    62: 58
    63: 64
    64: 56
    65: 52
    66: 60
    67: 55
    68: 57
    69: 56

In [None]:
# Model Hyperparameter Tuning with Iterative GridSearchCV Refinement
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
import json
warnings.filterwarnings('ignore')

# Load and preprocess data (same as train_models.py)
df = pd.read_csv('train_valid.csv')
print(f"Loaded {len(df)} training samples for hyperparameter tuning")

numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object']).columns
target = 'disease_diagnosis'

# Convert fake numeric columns to categorical
pseudo_numeric = []
for col in numeric_cols:
    if col != target:
        if df[col].nunique() <= 10:
            pseudo_numeric.append(col)

for col in pseudo_numeric:
    df[col] = df[col].astype(str)

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != target]
categorical_cols = [c for c in categorical_cols if c != target]

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

X = df.drop(target, axis=1)
y_raw = df[target]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_raw)

X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=2222, stratify=y_raw
)

feature_scaler = MinMaxScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_val_scaled = feature_scaler.transform(X_val)

print(f"Training set: {X_train_scaled.shape}, Validation set: {X_val_scaled.shape}")
print(f"Number of classes: {len(np.unique(y_train))}")
print("\nStarting iterative hyperparameter tuning...\n")

# Initial parameter grids for each model (optimized based on previous tuning results)
initial_param_grids = {
    "Logistic Regression": {
        'C': [50, 100, 200, 500],
        'solver': ['lbfgs', 'liblinear'],
        'max_iter': [500, 1000, 2000]
    },
    "Decision Tree": {
        'max_depth': [10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "Naive Bayes (Gaussian)": {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
    },
    "Naive Bayes (Multinomial)": {
        'alpha': [1.0, 2.0, 3.0, 4.0]
    },
    "Random Forest": {
        'n_estimators': [100, 200, 400],
        'max_depth': [20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    "XGBoost": {
        'n_estimators': [25, 50, 100],
        'max_depth': [2, 3, 5],
        'learning_rate': [0.1, 0.2, 0.3],
        'subsample': [0.7, 0.8, 1.0]
    }
}

# Base models
base_models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes (Gaussian)": GaussianNB(),
    "Naive Bayes (Multinomial)": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

def expand_param_grid(param_name, current_value, current_grid, original_length, param_type='numeric'):
    """Expand parameter grid on one side while reducing on the other to maintain length"""
    if param_type == 'numeric':
        sorted_grid = sorted([x for x in current_grid if x is not None])
        if not sorted_grid:
            return list(current_grid)
            
        min_val = sorted_grid[0]
        max_val = sorted_grid[-1]
        original_len = original_length
        
        # Check if at minimum boundary
        if current_value == min_val:
            if isinstance(min_val, (int, float)):
                if min_val > 0:
                    # Add smaller values
                    if min_val < 1:
                        new_vals = [min_val / 10, min_val / 5, min_val / 2]
                    else:
                        new_vals = [min_val / 2, min_val - 1] if min_val > 1 else [min_val / 2]
                    
                    # Combine and sort
                    expanded = sorted(set(new_vals + sorted_grid))
                    
                    # Remove values from the high end (opposite side) to maintain length
                    if len(expanded) > original_len:
                        # Keep the smallest values, ensuring best value is included
                        expanded_sorted = sorted(expanded)
                        # Take first original_len values (which includes the best value at min)
                        expanded = expanded_sorted[:original_len]
                    
                    # Ensure best value is included
                    if current_value not in expanded:
                        # Remove the largest value and add the best value
                        expanded = sorted(expanded)[:-1] + [current_value]
                        expanded = sorted(expanded)
                    
                    # Handle None values if they were in original
                    if None in current_grid:
                        expanded.append(None)
                    
                    return sorted([x for x in expanded if x is not None]) + ([None] if None in current_grid else [])
        
        # Check if at maximum boundary
        if current_value == max_val and max_val is not None:
            if isinstance(max_val, (int, float)):
                if max_val < 1:
                    new_vals = [max_val * 2, max_val * 5, max_val * 10]
                else:
                    new_vals = [max_val * 2, max_val * 5] if max_val >= 10 else [max_val * 2]
                
                # Combine and sort
                expanded = sorted(set(sorted_grid + new_vals))
                
                # Remove values from the low end (opposite side) to maintain length
                if len(expanded) > original_len:
                    # Keep the largest values, ensuring best value is included
                    expanded_sorted = sorted(expanded)
                    # Take last original_len values (which includes the best value at max)
                    expanded = expanded_sorted[-original_len:]
                
                # Ensure best value is included
                if current_value not in expanded:
                    # Remove the smallest value and add the best value
                    expanded = sorted(expanded)[1:] + [current_value]
                    expanded = sorted(expanded)
                
                # Handle None values if they were in original
                if None in current_grid:
                    expanded.append(None)
                
                return sorted([x for x in expanded if x is not None]) + ([None] if None in current_grid else [])
    
    return list(current_grid)

# Perform iterative GridSearchCV for each model
best_params = {}
tuning_results = []
max_iterations = 10  # Maximum refinement iterations

for model_name, base_model in base_models.items():
    print(f"\n{'='*70}")
    print(f"Tuning {model_name} with iterative refinement...")
    print(f"{'='*70}")
    
    current_grid = initial_param_grids[model_name].copy()
    # Store original lengths for each parameter
    original_lengths = {param: len(values) for param, values in current_grid.items()}
    
    iteration_best_params = None
    iteration_best_score = 0
    final_iteration = 1
    previous_params = None  # Track previous best params to detect oscillation
    no_improvement_count = 0  # Count iterations without improvement
    
    for iteration in range(max_iterations):
        print(f"\n--- Iteration {iteration + 1} ---")
        print(f"Parameter grid: {current_grid}")
        
        grid_search = GridSearchCV(
            base_model,
            current_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X_train_scaled, y_train)
        
        current_best_params = grid_search.best_params_
        current_best_score = grid_search.best_score_
        
        print(f"Best CV Score: {current_best_score:.4f}")
        print(f"Best Parameters: {current_best_params}")
        
        # Check if score improved
        score_improved = current_best_score > iteration_best_score + 0.0001  # Small threshold to avoid floating point issues
        
        # Check if we should refine further
        should_refine = False
        refined_grid = {}
        
        for param_name, best_value in current_best_params.items():
            if param_name in current_grid:
                param_values = current_grid[param_name]
                original_len = original_lengths[param_name]
                
                if isinstance(best_value, (int, float)):
                    numeric_vals = [v for v in param_values if v is not None and isinstance(v, (int, float))]
                    if numeric_vals:
                        sorted_vals = sorted(numeric_vals)
                        is_at_boundary = (best_value == sorted_vals[0]) or (best_value == sorted_vals[-1])
                        
                        # Only refine if at boundary AND score improved OR first iteration
                        if is_at_boundary and iteration < max_iterations - 1 and (score_improved or iteration == 0):
                            should_refine = True
                            refined_grid[param_name] = expand_param_grid(
                                param_name, best_value, param_values, original_len
                            )
                            print(f"    Refined {param_name}: {len(param_values)} -> {len(refined_grid[param_name])} values")
                        else:
                            refined_grid[param_name] = param_values
                    else:
                        refined_grid[param_name] = param_values
                else:
                    refined_grid[param_name] = param_values
        
        # Update best if improved
        if current_best_score > iteration_best_score:
            iteration_best_params = current_best_params.copy()
            iteration_best_score = current_best_score
            no_improvement_count = 0
        else:
            no_improvement_count += 1
        
        # Check for oscillation (same params as previous iteration)
        if previous_params == current_best_params:
            print(f"\nOscillation detected - same parameters as previous iteration")
            print(f"Final best parameters after {final_iteration} iteration(s)")
            break
        
        previous_params = current_best_params.copy()
        final_iteration = iteration + 1
        
        # Break if no improvement for 2 iterations or no refinement needed or last iteration
        if no_improvement_count >= 2 or not should_refine or iteration == max_iterations - 1:
            if no_improvement_count >= 2:
                print(f"\nNo improvement for 2 iterations - stopping refinement")
            print(f"Final best parameters after {final_iteration} iteration(s)")
            break
        
        # Update grid for next iteration
        current_grid = refined_grid
    
    best_params[model_name] = iteration_best_params
    
    # Evaluate final model on validation set
    final_model = base_model.set_params(**iteration_best_params)
    final_model.fit(X_train_scaled, y_train)
    y_pred = final_model.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_pred)
    
    print(f"\nFinal Validation Accuracy: {val_accuracy:.4f}")
    
    tuning_results.append({
        'Model': model_name,
        'Best CV Score': round(iteration_best_score, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Best Parameters': str(iteration_best_params),
        'Iterations': final_iteration
    })

# Display summary
print("\n" + "="*70)
print("ITERATIVE HYPERPARAMETER TUNING SUMMARY")
print("="*70)
tuning_df = pd.DataFrame(tuning_results)
print(tuning_df.to_string(index=False))

# Save best parameters
with open('best_hyperparameters.json', 'w') as f:
    json.dump(best_params, f, indent=2)
print("\n\nBest hyperparameters saved to 'best_hyperparameters.json'")
print("\nFinal best parameters:")
for model, params in best_params.items():
    print(f"\n{model}:")
    for param, value in params.items():
        print(f"  {param}: {value}")

Loaded 3600 training samples for hyperparameter tuning
Training set: (2880, 276), Validation set: (720, 276)
Number of classes: 5

Starting iterative hyperparameter tuning...


Tuning Logistic Regression with iterative refinement...

--- Iteration 1 ---
Parameter grid: {'C': [0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear'], 'max_iter': [1000, 2000]}
Best CV Score: 0.9080
Best Parameters: {'C': 100, 'max_iter': 1000, 'solver': 'lbfgs'}
    Refined C: 4 -> 4 values
    Refined max_iter: 2 -> 2 values

--- Iteration 2 ---
Parameter grid: {'C': [10, 100, 200, 500], 'max_iter': [500.0, 1000], 'solver': ['lbfgs', 'liblinear']}
Best CV Score: 0.9090
Best Parameters: {'C': 200, 'max_iter': 1000, 'solver': 'lbfgs'}
    Refined max_iter: 2 -> 2 values

--- Iteration 3 ---
Parameter grid: {'C': [10, 100, 200, 500], 'max_iter': [1000, 5000], 'solver': ['lbfgs', 'liblinear']}
Best CV Score: 0.9090
Best Parameters: {'C': 200, 'max_iter': 1000, 'solver': 'lbfgs'}
    Refined max_iter: 2 -> 2 value

In [24]:
# Create a comprehensive summary dictionary for default values
print("\n=== Default Values Summary ===")
default_values = {}

# Add numeric defaults (median) - but treat fake numeric as categorical
for col in numeric_cols:
    unique_count = df[col].nunique()
    is_fake_numeric = unique_count <= 10
    
    if is_fake_numeric:
        # Treat as categorical - use mode and list unique values
        mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else df[col].iloc[0]
        unique_vals = sorted(df[col].unique().tolist())
        default_values[col] = {
            'default': float(mode_val),
            'options': [float(v) for v in unique_vals],
            'type': 'fake_numeric_categorical',
            'min': float(df[col].min()),
            'max': float(df[col].max())
        }
    else:
        # True numeric - use median
        default_values[col] = {
            'default': float(df[col].median()),
            'min': float(df[col].min()),
            'max': float(df[col].max()),
            'type': 'numeric'
        }

# Add categorical defaults (mode)
for col in categorical_cols:
    mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else df[col].iloc[0]
    unique_vals = df[col].unique().tolist()
    default_values[col] = {
        'default': mode_val,
        'options': unique_vals,
        'type': 'categorical'
    }

# Display summary
print("\nDefault Values Dictionary:")
for attr, values in default_values.items():
    if 'options' in values:
        if values.get('type') == 'fake_numeric_categorical':
            print(f"{attr}: default={values['default']:.0f}, options={values['options']} (FAKE NUMERIC - CATEGORICAL)")
        else:
            print(f"{attr}: default='{values['default']}', options={values['options']}")
    else:
        print(f"{attr}: default={values['default']:.2f}, range=[{values['min']:.2f}, {values['max']:.2f}]")

# Save to a JSON file for easy access
import json
with open('default_values.json', 'w') as f:
    json.dump(default_values, f, indent=2, default=str)
print("\nDefault values saved to 'default_values.json'")


=== Default Values Summary ===

Default Values Dictionary:
age: default=51.00, range=[18.00, 84.00]
bmi: default=26.20, range=[15.00, 45.00]
vitamin_a_percent_rda: default=85.50, range=[10.00, 219.00]
vitamin_c_percent_rda: default=83.50, range=[10.00, 250.00]
vitamin_d_percent_rda: default=62.27, range=[7.00, 275.60]
vitamin_e_percent_rda: default=84.05, range=[10.00, 237.60]
vitamin_b12_percent_rda: default=55.60, range=[10.00, 243.60]
folate_percent_rda: default=84.80, range=[10.00, 226.60]
calcium_percent_rda: default=77.10, range=[10.00, 232.70]
iron_percent_rda: default=71.25, range=[10.00, 211.40]
hemoglobin_g_dl: default=14.10, range=[8.20, 18.00]
serum_vitamin_d_ng_ml: default=18.40, range=[5.00, 80.00]
serum_vitamin_b12_pg_ml: default=214.85, range=[100.00, 1138.10]
serum_folate_ng_ml: default=10.00, range=[2.00, 25.00]
symptoms_count: default=0, options=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] (FAKE NUMERIC - CATEGORICAL)
has_night_blindness: default=0, options=[0

In [25]:
# Display formatted summary table
print("\n=== Complete Summary Table ===")
summary_data = []

for col in numeric_cols:
    unique_count = df[col].nunique()
    is_fake_numeric = unique_count <= 10
    type_label = 'Fake Numeric (Categorical)' if is_fake_numeric else 'Numeric'
    
    if is_fake_numeric:
        mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else df[col].iloc[0]
        unique_vals = sorted(df[col].unique().tolist())
        summary_data.append({
            'Attribute': col,
            'Type': type_label,
            'Default (Mode)': f"{mode_val:.0f}",
            'Unique Values': f"{unique_count} ({', '.join([str(int(v)) for v in unique_vals])})",
            'Range': f"[{df[col].min():.0f}, {df[col].max():.0f}]"
        })
    else:
        summary_data.append({
            'Attribute': col,
            'Type': type_label,
            'Default (Median)': f"{df[col].median():.2f}",
            'Unique Values': f"{unique_count}",
            'Range': f"[{df[col].min():.2f}, {df[col].max():.2f}]"
        })

for col in categorical_cols:
    mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else df[col].iloc[0]
    unique_vals = df[col].unique().tolist()
    summary_data.append({
        'Attribute': col,
        'Type': 'Categorical',
        'Default (Mode)': str(mode_val),
        'Unique Values': f"{len(unique_vals)} ({', '.join([str(v) for v in unique_vals[:5]])}{'...' if len(unique_vals) > 5 else ''})",
        'Range': 'N/A'
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))


=== Complete Summary Table ===
                Attribute                       Type Default (Median)                                                                                                                    Unique Values             Range                    Default (Mode)
                      age                    Numeric            51.00                                                                                                                               67    [18.00, 84.00]                               NaN
                      bmi                    Numeric            26.20                                                                                                                              251    [15.00, 45.00]                               NaN
    vitamin_a_percent_rda                    Numeric            85.50                                                                                                                             1364   [10.00, 219.00