# Backpack Price Category Analysis

This notebook demonstrates how to perform exploratory data analysis, handle missing values, perform price category binning, and prepare features for supervised classification.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, TargetEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from IPython.display import display
import time
from scipy.stats import spearmanr, chi2_contingency, pointbiserialr

# 1. Data Loading
Load the datasets and take a first look at their structure.

In [2]:
# Read the base training file and the supplementary training file
train = pd.read_csv('Data/train.csv')
train_extra = pd.read_csv('Data/training_extra.csv')

# Report the (rows, columns) of each frame
print(
    f"Train shape: {train.shape}",
    f"Training extra shape: {train_extra.shape}",
    sep="\n",
)

# Render the leading rows of both frames for a quick visual check
display(train.head(), train_extra.head())







# 2. Price Conversion
The original problem is a regression problem, since "Price" is a continuous value; our goal is to turn it into a classification problem. To achieve that, we first need to divide Price into 5 categories according to the range of values.

In [3]:
# Divide "Price" into categories
def price_classification(df, bins):
    """Bucket ``Price`` into ``bins`` equal-width classes.

    The range is taken from the floor of the smallest price to the ceiling of
    the largest one and split into ``bins`` intervals of equal width. The
    resulting labels (``Class_1`` ... ``Class_<bins>``) are written to a new
    ``Price_Category`` column on ``df`` itself, and the same frame is returned.
    The range and the bin edges are printed.
    """
    prices = df['Price']
    min_price, max_price = np.floor(prices.min()), np.ceil(prices.max())

    # bins intervals need bins + 1 edges
    bin_edges = np.linspace(min_price, max_price, num=bins + 1)

    print(f"Price Range: {min_price} to {max_price}")
    print(f"Price Bin Edges: {bin_edges}")

    class_labels = [f'Class_{i+1}' for i in range(bins)]
    # include_lowest=True so the minimum price lands in the first class
    df['Price_Category'] = pd.cut(
        prices,
        bins=bin_edges,
        labels=class_labels,
        include_lowest=True,
    )
    return df

# Plot price distribution
def plot_price_distribution(df):
    """Draw a bar chart of ``Price_Category`` frequencies.

    Categories are shown in index order and every bar is annotated with its
    integer count. The figure is displayed; nothing is returned.
    """
    category_counts = df['Price_Category'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(
        category_counts.index,
        category_counts.values,
        color='skyblue',
        edgecolor='black',
    )

    # Write each count centred just above its bar
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            height,
            f'{int(height)}',
            ha='center',
            va='bottom',
        )

    ax.set_title('Price Category Distribution')
    ax.set_xlabel('Price Categories')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()


# Bin the base training set into five price classes, then chart the class counts
train = price_classification(train, 5)
print("Price Distribution in Train Dataset:")
plot_price_distribution(train)

# Same treatment for the supplementary training set
train_extra = price_classification(train_extra, 5)
print("Price Distribution in Training Extra Dataset:")
plot_price_distribution(train_extra)









# 3. Exploratory Data Analysis
Analyze the dataset to understand its structure, missing values, duplicates, feature distributions, and correlation between the different features and the new "Price_Category".

### 3.1. Basic Information

In [None]:
def basic_info(df):
    """Print a structural overview of ``df`` and chart its dtype mix.

    Prints the first five rows, the dtype of every column, the number of
    columns per dtype, and ``describe()`` statistics for all columns whose
    name is not ``id`` (case-insensitive). Finishes with a pie chart of the
    dtype counts. Returns ``None``.
    """
    print("BASIC DATASET INFORMATION")
    print("--------------------------")

    print("\n- First 5 rows:")
    print(df.head())

    print("\n- Data types:")
    print(df.dtypes)

    type_counts = df.dtypes.value_counts()
    print("\n- Column type distribution:")
    for dtype, count in zip(type_counts.index, type_counts.values):
        print(f"   - {dtype}: {count} columns")

    print("\n- Basic numerical statistics:")
    # The identifier column carries no statistical meaning, so leave it out
    id_like_cols = [col for col in df.columns if col.lower() == 'id']
    print(df.drop(columns=id_like_cols).describe())

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
    ax.set_title('Data Types Distribution', fontsize=15)
    fig.tight_layout()
    plt.show()


# Overview of the base set first, then the supplementary set (both calls return None)
train_basic_info, train_extra_basic_info = basic_info(train), basic_info(train_extra)


### 3.2. Missing Values

In [None]:
# Missing Values
def missing_values(df):
    """Report and return the per-column missing-value summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name with two columns: ``Count`` (number of missing
        entries) and ``Percent`` (share of rows, 0-100). Only columns with at
        least one missing entry are included, sorted by ascending ``Count``.
        The frame is empty when nothing is missing.

    The summary is also printed, so the function is usable both for its
    output and for its return value (callers assign the result).
    """
    print("\nMISSING VALUES ANALYSIS")
    print("-----------------------")

    missing = df.isnull().sum()
    missing_percent = (missing / len(df)) * 100
    missing_data = pd.concat([missing, missing_percent], axis=1)
    missing_data.columns = ['Count', 'Percent']

    # Named differently from the function so the function is not shadowed
    columns_with_gaps = missing_data[missing_data['Count'] > 0].sort_values('Count')
    if not columns_with_gaps.empty:
        print("\n- Columns with missing values:")
        print(columns_with_gaps)
    else:
        print("\nNo missing values found in the dataset.")

    return columns_with_gaps


# Missing-value report for the base set, then for the supplementary set
missing_values_train, missing_values_train_extra = (
    missing_values(train),
    missing_values(train_extra),
)


### 3.3. Duplicate Rows

In [None]:
# Duplicates
def duplicates(df):
    """Report and return every row of ``df`` that is fully duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        All rows that have an identical twin across every column (each member
        of a duplicate group is included, because ``keep=False`` is used).
        Empty when there are no duplicates.

    A short report, including a sample of the duplicated rows, is printed.
    """
    print("DUPLICATE ROWS ANALYSIS")
    print("-----------------------")

    # Named differently from the function so the function is not shadowed
    duplicate_rows = df[df.duplicated(keep=False)]
    print("\nChecking for fully duplicated rows")

    if not duplicate_rows.empty:
        print(f"\nFound {len(duplicate_rows)} duplicate rows")
        print("\nSample of duplicate rows:")
        print(duplicate_rows.head())
    else:
        print("\nNo duplicate rows found")

    return duplicate_rows


# Duplicate-row report for the base set, then for the supplementary set
duplicates_train, duplicates_train_extra = duplicates(train), duplicates(train_extra)

### 3.4. Target Variable

In [None]:
# Target Variable
def target_variable(df):
    """Summarise the raw ``Price`` column behind the classification target.

    If ``Price_Category`` is absent an error line is printed and the function
    stops. Otherwise it prints ``describe()`` statistics of ``Price`` and
    shows a box plot of it. Returns ``None`` in both cases.
    """
    print("TARGET VARIABLE ANALYSIS")
    print("------------------------")

    target_col = 'Price_Category'

    # Guard clause: nothing to analyse before the price has been binned
    if target_col not in df.columns:
        print(f"Error: Target column '{target_col}' not found in the dataset.")
        return None

    price = df['Price']

    print("\n- Price statistics:")
    print(price.describe())

    # Boxplot
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(y=price, ax=ax)
    ax.set_title('Box Plot of Price', fontsize=15)
    fig.tight_layout()
    plt.show()


# Target summary for the base set, then for the supplementary set (both calls return None)
target_variable_train, target_variable_train_extra = (
    target_variable(train),
    target_variable(train_extra),
)

### 3.5. Feature Distributions

In [None]:
def feature_distribution(df):
    """Plot the distribution of every feature column of ``df``.

    The target (``Price_Category``), ``id`` and ``Price`` are excluded.

    * Numerical features (``int64`` / ``float64``): one figure of histograms
      with a KDE overlay and one figure of box plots, laid out on a grid that
      is two panels wide and as many rows tall as needed.
    * Categorical features (``object`` / ``category``): the number of unique
      values is printed and a horizontal count plot, ordered by frequency, is
      shown for each feature.

    Returns ``None``; the function only prints and draws.
    """
    print("FEATURE DISTRIBUTION ANALYSIS")
    print("-----------------------------")

    target_col = 'Price_Category'
    excluded_cols = [target_col, "id", "Price"]

    numerical_cols = [
        col for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col not in excluded_cols
    ]

    if numerical_cols:
        print(f"\n- Found {len(numerical_cols)} numerical features")

        # Size the grid from the data: a fixed 1x2 grid makes plt.subplot raise
        # as soon as a third numerical feature is present.
        cols = 2
        rows = int(np.ceil(len(numerical_cols) / cols))

        # Histogram
        plt.figure(figsize=(5 * cols, 4 * rows))
        for i, col in enumerate(numerical_cols):
            plt.subplot(rows, cols, i + 1)
            sns.histplot(data=df, x=col, kde=True)
            plt.title(col)

        plt.tight_layout()
        plt.show()

        # Boxplot
        plt.figure(figsize=(5 * cols, 4 * rows))
        for i, col in enumerate(numerical_cols):
            plt.subplot(rows, cols, i + 1)
            sns.boxplot(data=df, y=col)
            plt.title(col)

        plt.tight_layout()
        plt.show()

    else:
        print("\n- No numerical features found (excluding target)")

    # The target is already excluded here, so no further filtering is needed
    columns_to_analyze = [col for col in df.columns if col not in excluded_cols]
    categorical_cols = list(
        df[columns_to_analyze].select_dtypes(include=['object', 'category']).columns
    )

    if categorical_cols:
        print(f"\n- Found {len(categorical_cols)} categorical features")

        for col in categorical_cols:
            value_counts = df[col].value_counts()
            unique_count = len(value_counts)

            print(f"   - {col}: {unique_count} unique values")

            plt.figure(figsize=(12, 6))
            # Most frequent level on top
            sns.countplot(y=col, data=df, order=value_counts.index)
            plt.title(f'Count of {col}', fontsize=15)

            plt.tight_layout()
            plt.show()

    else:
        print("\n- No categorical features found (excluding target)")


# Feature plots for the base set, then for the supplementary set (both calls return None)
feature_distribution_train, feature_distribution_train_extra = (
    feature_distribution(train),
    feature_distribution(train_extra),
)

### 3.6. Features Correlations to Price

In [None]:
def _rank_encode(series, preferred_order=None):
    """Turn a label-valued series into ordinal ranks (float64, NaN preserved).

    Numeric input is returned untouched. For labels, the rank order is, in
    priority: ``preferred_order`` when it covers every observed label, the
    category order of a categorical dtype, or the sorted observed labels.
    Order of first appearance is deliberately NOT used: it is arbitrary and
    makes rank correlations meaningless.
    """
    if pd.api.types.is_numeric_dtype(series.dtype):
        return series
    observed = list(series.dropna().unique())
    if preferred_order is not None and set(observed) <= set(preferred_order):
        levels = list(preferred_order)
    elif isinstance(series.dtype, pd.CategoricalDtype):
        levels = list(series.cat.categories)
    else:
        levels = sorted(observed)
    rank_of = {level: rank for rank, level in enumerate(levels)}
    return series.astype(object).map(rank_of).astype('float64')


def _complete_pairs(feature, target_numeric):
    """Pair a feature with the numeric target by position; keep rows where both are present."""
    pairs = pd.DataFrame({
        'feature': feature.to_numpy(),
        'target': target_numeric.to_numpy(),
    })
    return pairs.dropna()


def _print_section(title, value_label):
    """Print the banner and the column header of one result section."""
    print("\n" + "-" * 50)
    print(title)
    print("-" * 50)
    print(f"{'Feature':<20} {value_label:<12} {'p-value':<12} {'Significance'}")
    print("-" * 70)


def _note_dropped(col, n_dropped, n_total):
    """Mention how many rows were left out because of missing values."""
    if n_dropped > 0:
        print(f"{col:<20} - Processing without {n_dropped} NaN values ({n_dropped / n_total * 100:.1f}%)")


def _report(col, method, value, p_value, feature_type, alpha=0.05):
    """Print one result line and return it as a record dict."""
    is_significant = p_value < alpha
    sig_status = "significant" if is_significant else "not significant"
    print(f"{col:<20} {value:>10.4f}   {p_value:>10.4f}   {sig_status}")
    return {
        'feature': col,
        'method': method,
        'correlation': value,
        'p_value': p_value,
        'significance': is_significant,
        'feature_type': feature_type,
    }


def correlation_analysis(df):
    """Measure the association between each feature and ``Price_Category``.

    ``id`` and ``Price`` are ignored. The remaining columns are analysed with:

    * Spearman rank correlation for ``int64`` / ``float64`` columns;
    * Spearman rank correlation for the ordinal ``Size`` column, ranked
      Small < Medium < Large;
    * chi-square test with Cramer's V for ``object`` / ``category`` columns
      (``Size`` excluded);
    * point-biserial correlation for ``Laptop Compartment`` and ``Waterproof``.

    The target is converted to ranks that follow its category order (or the
    sorted labels for plain strings). Rows with a missing feature or target
    value are left out of the corresponding computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Price_Category`` and the feature columns.

    Returns
    -------
    dict
        Empty when ``Price_Category`` is missing or nothing could be computed.
        Otherwise ``'all_correlations'`` holds every result sorted by
        descending absolute strength (with an ``abs_corr`` column) and
        ``'top_correlations'`` holds the ten strongest (without it).
    """
    print("CORRELATION ANALYSIS")
    print("---------------------")

    target_col = 'Price_Category'
    correlation_timeout = 120  # seconds; exceeding it only triggers a warning
    ordinal_orders = {'Size': ['Small', 'Medium', 'Large']}
    biserial_cols = ['Laptop Compartment', 'Waterproof']
    results = {}

    if target_col not in df.columns:
        print(f"Error: Target column '{target_col}' not found in the dataset.")
        return results

    columns_to_analyze = [col for col in df.columns if col not in [target_col, 'id', 'Price']]
    numerical_cols = list(df[columns_to_analyze].select_dtypes(include=['int64', 'float64']).columns)
    categorical_cols = list(df[columns_to_analyze].select_dtypes(include=['object', 'category']).columns)

    start_time = time.time()

    # Ranks follow the class order (Class_1 < Class_2 < ...), not row order
    target_numeric = _rank_encode(df[target_col])

    correlation_results = []

    # ---- Numerical features: Spearman -------------------------------------
    _print_section("NUMERICAL FEATURES - SPEARMAN CORRELATION", 'Correlation')
    for col in numerical_cols:
        pairs = _complete_pairs(df[col], target_numeric)
        if len(pairs) < 2:
            continue
        corr, p_value = spearmanr(pairs['feature'], pairs['target'])
        correlation_results.append(_report(col, 'Spearman', corr, p_value, 'Numerical'))

    # ---- Ordinal features: Spearman on explicit ranks ----------------------
    _print_section("ORDINAL FEATURES - SPEARMAN CORRELATION", 'Correlation')
    for col, level_order in ordinal_orders.items():
        if col not in df.columns:
            print(f"{col:<20} {'N/A':<12} {'N/A':<12} {'N/A - Column not found'}")
            continue
        pairs = _complete_pairs(_rank_encode(df[col], preferred_order=level_order), target_numeric)
        _note_dropped(col, len(df) - len(pairs), len(df))
        if len(pairs) < 2:
            continue
        corr, p_value = spearmanr(pairs['feature'], pairs['target'])
        correlation_results.append(_report(col, 'Spearman', corr, p_value, 'Ordinal'))

    # ---- Nominal features: chi-square + Cramer's V --------------------------
    # The label lives in a variable: a backslash-escaped quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    cramers_label = "Cramer's V"
    _print_section("CATEGORICAL FEATURES - CHI-SQUARE TEST", cramers_label)
    for col in categorical_cols:
        if col in ordinal_orders:
            continue

        valid_data = df[[col, target_col]].dropna()
        _note_dropped(col, len(df) - len(valid_data), len(df))

        try:
            contingency_table = pd.crosstab(valid_data[col], valid_data[target_col])

            # A chi-square test needs at least a 2x2 table
            if contingency_table.shape[0] < 2 or contingency_table.shape[1] < 2:
                print(f"{col:<20} {'N/A':<12} {'N/A':<12} {'N/A - Not enough unique values'}")
                continue

            chi2, p_value, dof, expected = chi2_contingency(contingency_table)

            if (expected < 5).any():
                print(f"{col:<20} - Warning: Some expected frequencies < 5")

            # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1))); the
            # denominator is non-zero thanks to the 2x2 guard above.
            n = contingency_table.sum().sum()
            min_dim = min(contingency_table.shape) - 1
            cramers_v = np.sqrt(chi2 / (n * min_dim))

            correlation_results.append(_report(col, 'Chi-square', cramers_v, p_value, 'Nominal'))
        except Exception as e:
            # Reported, not swallowed: one bad column must not abort the others
            print(f"{col:<20} {'Error':<12} {'Error':<12} {'Error: ' + str(e)}")

    # ---- Binary features: point-biserial ------------------------------------
    _print_section("BINARY FEATURES - POINT-BISERIAL CORRELATION", 'Correlation')
    for col in biserial_cols:
        if col not in df.columns:
            print(f"{col:<20} {'N/A':<12} {'N/A':<12} {'N/A - Column not found'}")
            continue

        pairs = _complete_pairs(df[col], target_numeric)
        _note_dropped(col, len(df) - len(pairs), len(df))

        if pairs['feature'].nunique() != 2:
            print(f"{col:<20} {'N/A':<12} {'N/A':<12} {'N/A - Not a binary column'}")
            continue

        try:
            # Convert binary column to 0/1
            binary_col = pd.factorize(pairs['feature'])[0]
            corr, p_value = pointbiserialr(binary_col, pairs['target'])
            correlation_results.append(_report(col, 'Point-Biserial', corr, p_value, 'Binary'))
        except Exception as e:
            print(f"{col:<20} {'Error':<12} {'Error':<12} {'Error: ' + str(e)}")

    # The work is already done at this point, so a slow run is only flagged;
    # the computed results are still returned.
    elapsed = time.time() - start_time
    if elapsed > correlation_timeout:
        print(f"\nWarning: correlation analysis took {elapsed:.1f}s (budget: {correlation_timeout}s)")

    if correlation_results:
        all_correlations = pd.DataFrame(correlation_results)
        all_correlations['abs_corr'] = all_correlations['correlation'].abs()
        all_correlations = all_correlations.sort_values('abs_corr', ascending=False)

        top_correlations = all_correlations.drop('abs_corr', axis=1).head(10)
        results['all_correlations'] = all_correlations
        results['top_correlations'] = top_correlations

        print("\n" + "=" * 50)
        print("CORRELATIONS SUMMARY")
        print("=" * 50)

        for _, row in top_correlations.iterrows():
            sig = "significant" if row['significance'] else "not significant"
            print(f"{row['feature']} ({row['feature_type']}): {row['correlation']:.4f} ({row['method']}, {sig})")

    return results


# Correlation report for the base set, then for the supplementary set
feature_correlation_train, feature_correlation_train_extra = (
    correlation_analysis(train),
    correlation_analysis(train_extra),
)

# 4. Data Preprocessing
Prepare the data for model training by encoding categorical variables and scaling numerical features.

### 4.1. Data Cleaning and Imputation
- Drops rows with more than one missing value
- Fills missing values based on column type (mode for categorical and median for numeric)

In [None]:
# Number of times clean_and_impute has been called in this session (used in its log line)
counter = 0
def clean_and_impute(df, drop_rows_with_many_missing=True, name="dataset"):
    """Return a cleaned copy of ``df`` with missing values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    drop_rows_with_many_missing : bool, default True
        When True, rows with more than one missing value are removed before
        imputation.
    name : str, default "dataset"
        Label used in the progress messages.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy. Numeric columns are filled with their median; every
        other column (strings, categoricals, ...) is filled with its most
        frequent value. A column with no observed value at all is left as is.

    Notes
    -----
    The column kind is decided with ``is_numeric_dtype`` rather than by
    comparing the dtype to ``'object'``: categorical and string-dtype columns
    are not ``object``, and calling ``median`` on them raises ``TypeError``.
    """
    global counter
    counter += 1
    print(f"[Call #{counter}] Processing {name}...")

    if drop_rows_with_many_missing:
        missing_rows = df.isnull().sum(axis=1)
        n_dropped = (missing_rows > 1).sum()
        print(f"{name}: Dropping {n_dropped} rows with >1 missing value.")
        # Explicit copy so the column assignments below never touch ``df``
        df_clean = df[missing_rows <= 1].copy()
    else:
        df_clean = df.copy()

    n_filled = 0
    for col in df_clean.columns:
        n_missing = df_clean[col].isnull().sum()
        if n_missing == 0:
            continue

        col_dtype = df_clean[col].dtype
        is_number = (
            pd.api.types.is_numeric_dtype(col_dtype)
            and not pd.api.types.is_bool_dtype(col_dtype)
        )
        if is_number:
            fill_value = df_clean[col].median(skipna=True)
            if pd.isna(fill_value):
                continue  # column has no observed value to derive a median from
        else:
            modes = df_clean[col].mode(dropna=True)
            if modes.empty:
                continue  # column has no observed value to derive a mode from
            fill_value = modes.iloc[0]

        df_clean[col] = df_clean[col].fillna(fill_value)
        # Count only the values that were actually filled
        n_filled += n_missing

    print(f"{name}: Filled {n_filled} missing values.")
    print(f"{name}: Final shape after cleaning: {df_clean.shape}")
    return df_clean

# Clean both frames with the same policy: drop rows missing more than one field, then impute
train_clean, train_extra_clean = (
    clean_and_impute(frame, drop_rows_with_many_missing=True, name=label)
    for frame, label in ((train, "Train"), (train_extra, "Training Extra"))
)

# Quick visual check of the cleaned frames
print("\nTrain (cleaned) sample:")
display(train_clean.head())
print("\nTraining Extra (cleaned) sample:")
display(train_extra_clean.head())

# Persist the cleaned data next to the raw files (row index left out)
train_clean.to_csv('Data/train_cleaned.csv', index=False)
train_extra_clean.to_csv('Data/training_extra_cleaned.csv', index=False)

### 4.2. Categorical Variables Encoding and Normalization
- "Color" was encoded using Target Encoding (replaces categories with the mean of the target variable for each group)
- "Size" and "Price_Category" were encoded using Ordinal Encoding (substitutes categories with numeric values)
- The rest of categorical variables uses One-Hot Encoding (converting each category into a binary column)
- Numerical variables were normalized via standardization (scaling them to have zero mean and unit variance for balanced model training)
- "Price" was excluded during pre-processing, the other features were maintained since the correlation analysis made above wasn't conclusive

In [None]:
# Column roles for the preprocessing pipeline
target_col = 'Price_Category'
onehot_cols = ['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Style']
ordinal_cols = ['Size']
target_encode_cols = ['Color']
numerical_cols = ['Compartments', 'Weight Capacity (kg)']
feature_cols = onehot_cols + ordinal_cols + target_encode_cols + numerical_cols

X_train = train_clean

# Numeric version of the target, needed by the target encoder
target_encoder = OrdinalEncoder()
y_numeric = target_encoder.fit_transform(X_train[[target_col]]).ravel()

# ONE preprocessor for both datasets. Fitting a second, independent transformer
# on train_extra would scale and encode its rows with different statistics (and
# possibly different output columns), which makes the two halves of the combined
# training table inconsistent.
preprocessor = ColumnTransformer([
    # handle_unknown='ignore': a level unseen in train becomes an all-zero row instead of an error
    ('onehot', OneHotEncoder(drop=None, sparse_output=False, handle_unknown='ignore'), onehot_cols),
    ('ordinal', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']]), ordinal_cols),
    # random_state pins the internal cross-fitting so re-runs give the same encoding
    ('target', TargetEncoder(random_state=42), target_encode_cols),
    ('num', StandardScaler(), numerical_cols)
], remainder='drop')

X = X_train[feature_cols]
X_processed = preprocessor.fit_transform(X, y_numeric)

# Get the actual feature names from the preprocessor
feature_names = preprocessor.get_feature_names_out()
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
y = X_train[target_col]

print('Train processed feature shape:', X_processed_df.shape)

# Process train_extra dataset with the transformer fitted above
X_train_extra = train_extra_clean.copy()

# Convert target to numeric using the same OrdinalEncoder
y_extra_numeric = target_encoder.transform(X_train_extra[[target_col]]).ravel()

# Kept under their previous names so later cells that reference them still work
preprocessor_extra = preprocessor
feature_names_extra = feature_names

X_extra = X_train_extra[feature_cols]
X_extra_processed = preprocessor_extra.transform(X_extra)

X_extra_processed_df = pd.DataFrame(X_extra_processed, columns=feature_names_extra)
y_extra = X_train_extra[target_col]

print('Train extra processed feature shape:', X_extra_processed_df.shape)

# 5. Model Training and Evaluation
Train and evaluate models one at a time.

In [None]:
# Stack the two processed datasets (features and targets) into one modelling table
X_combined = pd.concat((X_processed_df, X_extra_processed_df), axis=0, ignore_index=True)
y_combined = pd.concat((y, y_extra), axis=0, ignore_index=True)

print(f"Combined dataset shape: {X_combined.shape}")
print(f"Combined target distribution:\n{y_combined.value_counts()}")

# Hold out 20% for validation, keeping the class proportions equal in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    stratify=y_combined,
    random_state=42,
)

print(
    "Using validation split strategy",
    f"Training set size: {X_train.shape[0]}",
    f"Validation set size: {X_val.shape[0]}",
    sep="\n",
)

### 5.1. Decision Tree

In [None]:
# Registries shared by the training cells and the comparison section
models = {}
results = {}

print("TRAINING DECISION TREE")
print("----------------------")

# Hyper-parameter grid explored exhaustively by the search below
dt_params = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

dt = DecisionTreeClassifier(random_state=42)
dt_grid = GridSearchCV(
    estimator=dt,
    param_grid=dt_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# Keep the best configuration found by the search
models['Decision Tree'] = dt_grid.best_estimator_
print(
    f"Best DT params: {dt_grid.best_params_}",
    f"Best DT CV score: {dt_grid.best_score_:.4f}",
    sep="\n",
)

### 5.2. k-Nearest Neighbors

In [None]:
# Opt-in switch for the KNN grid search; set to True to run it.
# This section used to be disabled by wrapping it in a string literal, which
# Jupyter echoes as the cell's output and which hides the code from linters.
# NOTE(review): presumably it is skipped because of its runtime — confirm.
RUN_KNN = False

if RUN_KNN:
    print("TRAINING K-NEAREST NEIGHBORS")
    print("----------------------------")

    knn_params = {
        'n_neighbors': [3, 5, 7, 9, 11, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }

    knn = KNeighborsClassifier()
    knn_grid = GridSearchCV(knn, knn_params, cv=5, scoring='accuracy', n_jobs=-1)
    knn_grid.fit(X_train, y_train)

    models['KNN'] = knn_grid.best_estimator_
    print(f"Best KNN params: {knn_grid.best_params_}")
    print(f"Best KNN CV score: {knn_grid.best_score_:.4f}")

### 5.3. Neural Networks

In [None]:
# Opt-in switch for the MLP grid search; set to True to run it.
# This section used to be disabled by wrapping it in a string literal, which
# Jupyter echoes as the cell's output and which hides the code from linters.
# NOTE(review): presumably it is skipped because of its runtime — confirm.
RUN_NEURAL_NETWORK = False

if RUN_NEURAL_NETWORK:
    print("TRAINING NEURAL NETWORKS (MLP)")
    print("------------------------------")

    nn_params = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

    nn = MLPClassifier(random_state=42, max_iter=1000)
    nn_grid = GridSearchCV(nn, nn_params, cv=5, scoring='accuracy', n_jobs=-1)
    nn_grid.fit(X_train, y_train)

    models['Neural Network'] = nn_grid.best_estimator_
    print(f"Best NN params: {nn_grid.best_params_}")
    print(f"Best NN CV score: {nn_grid.best_score_:.4f}")

### 5.4. Support Vector Machine

In [None]:
# Opt-in switch for the SVM grid search; set to True to run it.
# This section used to be disabled by wrapping it in a string literal, which
# Jupyter echoes as the cell's output and which hides the code from linters.
# NOTE(review): presumably it is skipped because of its runtime — confirm.
RUN_SVM = False

if RUN_SVM:
    print("TRAINING SUPPORT VECTOR MACHINES")
    print("--------------------------------")

    svm_params = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
    }

    svm = SVC(random_state=42)
    svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy', n_jobs=-1)
    svm_grid.fit(X_train, y_train)

    models['SVM'] = svm_grid.best_estimator_
    print(f"Best SVM params: {svm_grid.best_params_}")
    print(f"Best SVM CV score: {svm_grid.best_score_:.4f}")

# 6. Comparison of Results
Compare the performance of all models.

In [None]:
def evaluate_model(model, X_train, X_val, y_train, y_val, model_name):
    """Fit ``model``, predict on the validation split, and score the result.

    The model is (re)fitted on ``X_train`` / ``y_train`` and the wall-clock
    time of the fit and of the prediction on ``X_val`` are measured.

    Returns a dict with the keys ``Model``, ``Accuracy``, ``Precision``,
    ``Recall``, ``F1`` (the last three weighted by class support),
    ``Fit Time (s)`` and ``Pred Time (s)``.
    """
    print(f"\nEvaluating {model_name}...")

    # Time the fit
    fit_started = time.time()
    model.fit(X_train, y_train)
    fit_time = time.time() - fit_started

    # Time the prediction
    pred_started = time.time()
    y_pred = model.predict(X_val)
    pred_time = time.time() - pred_started

    # Accuracy is scored first, then the support-weighted metrics
    report = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_val, y_pred),
    }
    weighted_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for label, scorer in weighted_scorers:
        report[label] = scorer(y_val, y_pred, average='weighted')

    report['Fit Time (s)'] = fit_time
    report['Pred Time (s)'] = pred_time
    return report

# Evaluate all models
print("EVALUATING ALL MODELS ON VALIDATION SET")
print("=" * 50)

results_list = []
for model_name, model in models.items():
    result = evaluate_model(model, X_train, X_val, y_train, y_val, model_name)
    results_list.append(result)

    # Per-model summary, one metric per line
    print(
        f"\n{model_name} Results:",
        f"  Accuracy:    {result['Accuracy']:.4f}",
        f"  Precision:   {result['Precision']:.4f}",
        f"  Recall:      {result['Recall']:.4f}",
        f"  F1 Score:    {result['F1']:.4f}",
        f"  Fit Time:    {result['Fit Time (s)']:.4f}s",
        f"  Pred Time:   {result['Pred Time (s)']:.6f}s",
        sep="\n",
    )

# One row per evaluated model
results_df = pd.DataFrame(results_list)

print("\n" + "=" * 80, "COMPREHENSIVE RESULTS COMPARISON", "=" * 80, sep="\n")
print(results_df.round(4))

# Winner per quality metric (higher is better)
print("\n" + "=" * 50, "BEST PERFORMING MODELS BY METRIC", "=" * 50, sep="\n")

metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
for metric in metrics:
    best_idx = results_df[metric].idxmax()
    best_model = results_df.at[best_idx, 'Model']
    best_score = results_df.at[best_idx, metric]
    print(f"Best {metric:10}: {best_model:15} ({best_score:.4f})")

# Winner per timing column (lower is better)
fastest_fit_idx = results_df['Fit Time (s)'].idxmin()
fastest_pred_idx = results_df['Pred Time (s)'].idxmin()

print(f"Fastest Fit:   {results_df.loc[fastest_fit_idx, 'Model']:15} ({results_df.loc[fastest_fit_idx, 'Fit Time (s)']:.4f}s)")
print(f"Fastest Pred:  {results_df.loc[fastest_pred_idx, 'Model']:15} ({results_df.loc[fastest_pred_idx, 'Pred Time (s)']:.6f}s)")

# Create visualizations: four metric panels plus one training-time panel on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# Performance metrics, placed in the two left-hand columns
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1']
metric_axes = [axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1]]
for metric, ax in zip(metrics_to_plot, metric_axes):
    bars = ax.bar(results_df['Model'], results_df[metric],
                  color=['skyblue', 'lightcoral', 'lightgreen', 'gold'])
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom')

# Training time comparison
ax = axes[1, 2]
bars = ax.bar(results_df['Model'], results_df['Fit Time (s)'],
              color='orange', alpha=0.7)
ax.set_title('Training Time Comparison')
ax.set_ylabel('Time (seconds)')
ax.tick_params(axis='x', rotation=45)

# Add value labels on bars (offset proportional to the bar height)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
            f'{height:.3f}s', ha='center', va='bottom')

# The grid has six slots but only five panels are drawn: hide the unused
# top-right axes instead of leaving an empty frame in the figure.
axes[0, 2].axis('off')

plt.tight_layout()
plt.show()

# Overall ranking from a weighted blend of the four quality metrics
print("\n" + "=" * 50, "OVERALL PERFORMANCE RANKING", "=" * 50, sep="\n")

# Weights sum to 1.0; adjust them to change what the ranking rewards
weights = {'Accuracy': 0.3, 'Precision': 0.25, 'Recall': 0.25, 'F1': 0.2}
results_df['Composite_Score'] = sum(
    results_df[metric] * weight for metric, weight in weights.items()
)

# Best composite score first
ranking_df = results_df.sort_values(by='Composite_Score', ascending=False)
print("Ranking based on weighted composite score:")
print("(Accuracy: 30%, Precision: 25%, Recall: 25%, F1: 20%)")
print()

for i, (_, row) in enumerate(ranking_df.iterrows(), start=1):
    print(f"{i}. {row['Model']:15} - Composite Score: {row['Composite_Score']:.4f}")

# Plain-text table of every metric, convenient to copy into a report
print("\n" + "=" * 100, "DETAILED RESULTS TABLE", "=" * 100, sep="\n")

detailed_results = results_df[
    ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Fit Time (s)', 'Pred Time (s)']
].round(4)
print(detailed_results.to_string(index=False))

# Label every model with an accuracy tier and a training-speed tier
print("\n" + "=" * 50, "PERFORMANCE vs SPEED ANALYSIS", "=" * 50, sep="\n")

for _, row in results_df.iterrows():
    model_name = row['Model']
    accuracy = row['Accuracy']
    fit_time = row['Fit Time (s)']
    pred_time = row['Pred Time (s)']

    # Accuracy tier: thresholds checked from best to worst
    perf_tier = (
        "Excellent" if accuracy >= 0.95
        else "Good" if accuracy >= 0.90
        else "Fair" if accuracy >= 0.85
        else "Poor"
    )

    # Training-speed tier: thresholds (seconds) checked from fastest to slowest
    speed_tier = (
        "Very Fast" if fit_time <= 0.1
        else "Fast" if fit_time <= 1.0
        else "Moderate" if fit_time <= 10.0
        else "Slow"
    )

    print(f"{model_name:15} - {perf_tier:9} Performance, {speed_tier:10} Training")

# Persist the comparison table (row index left out) for later analysis
results_df.to_csv('model_comparison_results.csv', index=False)
print("\nResults saved to 'model_comparison_results.csv'")