In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from functools import reduce
import traceback
from IPython.display import display, HTML # For better display of multiple dataframes
import re
import os

In [2]:
# Participant roster: read the 'GENAI' sheet (header on the third row), keep
# only gender and ID, and reduce it to one complete row per participant.
participant_file_path = 'GenAIEyeTrackingCleanedDataset/ParticipantList.xlsx'

participant_df_global = (
    pd.read_excel(
        participant_file_path,
        sheet_name='GENAI',
        header=2,  # zero-based: the third spreadsheet row holds the column names
        usecols=['Gender', 'Participant ID'],
    )
    # Use the underscore spelling that the metric sheets are merged on.
    .rename(columns={'Participant ID': 'Participant_ID'})
    # A row without a gender or an ID cannot be used for the gender merge.
    .dropna(subset=['Gender', 'Participant_ID'])
    # One row per participant; the first occurrence wins.
    .drop_duplicates(subset='Participant_ID', keep='first')
)


In [3]:
# Folder holding every per-question workbook (relative to the notebook).
base_path = 'GenAIEyeTrackingCleanedDataset/'  # Make sure this is correct relative to your notebook

# AOI column names to analyse for each question.
_aoi_columns_by_question = {
    'Q1': ['1 Eyebrow A', '1 Eyebrow B', '1 Eyes A', '1 Eyes B', '1 Hair A', '1 Hair B', '1 Nose A', '1 Nose B'],
    'Q2': ['2 Body A', '2 Body B', '2 Face A', '2 Face B', '2 Hair A', '2 Hair B'],
    'Q3': ['3 Back Mountain A', '3 Back Mountain B', '3 Front Mountain A', '3 Front Mountain B', '3 Midground A', '3 Midground B', '3 Plain A', '3 River B', '3 Sky A', '3 Sky B'],
    'Q4': ['4 Chilli B', '4 Jalapeno B', '4 Mushroom A1', '4 Mushroom A2', '4 Mushroom B', '4 Olive A', '4 Pepperoni A', '4 Pepperoni B'],
    'Q5': ['5 Sea A', '5 Sea B', '5 Sky A', '5 Sky B'],
    'Q6': ['6 Background B1','6 Background B2','6 Flower A', '6 Flower B', '6 Inside A', '6 Inside B', '6 Leaf A', '6 Leaf B', '6 Sky A', '6 Sky B'],
}

# Per-question settings: the workbook to read and the AOI columns to keep.
# Every workbook follows the same file-name pattern, so the path is derived
# from the question label.
questions_config = {
    question_label: {
        'file_path': os.path.join(base_path, f'Filtered_GenAI_Metrics_cleaned_{question_label}.xlsx'),
        'aoi_columns': aoi_column_names,
    }
    for question_label, aoi_column_names in _aoi_columns_by_question.items()
}

# Worksheets (one per eye-tracking metric) that the analysis uses.
selected_metric_sheets = [
    "Tot Fixation dur",
    "Fixation count",
    "Time to first Fixation",
    "Tot Visit dur",
]

# Sheet whose AOI values drive the gender balancing.
master_sheet_for_balancing_global = 'Tot Fixation dur'

# Per-question result stores, keyed by question label and filled by later cells.
all_data_sheets = {}                  # raw worksheets as loaded
all_metrics_dfs = {}                  # metric sheets with gender merged in
all_cleaned_metrics_dfs = {}          # same, minus rows lacking ID or gender
all_y_resampled_gender = {}           # gender labels after balancing
all_X_repr = {}                       # representative AOI features per participant
all_y_repr = {}                       # representative gender labels per participant
all_actual_aoi_cols_in_master = {}    # configured AOI columns found in the master sheet
all_balanced_unified_dfs = {}         # reconstructed metric sheets after balancing
all_merged_long_dfs = {}              # long-format table per question

In [4]:
# Load every worksheet of each question's workbook into all_data_sheets[q_name].
# A question whose workbook cannot be read is stored as an empty dict so that
# later cells can skip it instead of failing on a missing key.
for q_name, config in questions_config.items():
    current_file_path = config['file_path']

    try:
        # The context manager closes the workbook handle as soon as all sheets
        # have been parsed instead of leaving it open after the loop moves on.
        with pd.ExcelFile(current_file_path) as xls:
            sheet_names = xls.sheet_names
            data_sheets_qN = {sheet_name: xls.parse(sheet_name) for sheet_name in sheet_names}
        all_data_sheets[q_name] = data_sheets_qN

    except FileNotFoundError:
        print(f"ERROR: File not found for {q_name} at {current_file_path}")
        all_data_sheets[q_name] = {} # Store empty dict to avoid later errors
    except Exception as e:
        # Deliberately broad: one unreadable workbook must not abort the others.
        print(f"ERROR: Could not process file for {q_name} at {current_file_path}. Error: {e}")
        all_data_sheets[q_name] = {}

In [5]:
def _standardize_participant_id(raw_id):
    """Return ``raw_id`` in zero-padded ``P07`` form when it can be recognised.

    Strings of the form ``P<digits>`` and non-null int/float values are
    reformatted; anything else is returned unchanged.
    """
    if isinstance(raw_id, str) and raw_id.startswith('P') and raw_id[1:].isdigit():
        return f'P{int(raw_id[1:]):02d}'
    if pd.notnull(raw_id) and isinstance(raw_id, (int, float)):
        return f'P{int(raw_id):02d}'
    return raw_id


# For every question: attach gender to each selected metric sheet
# (all_metrics_dfs), then drop rows without an ID or gender
# (all_cleaned_metrics_dfs).
for q_name, config in questions_config.items():
    data_sheets_qN = all_data_sheets.get(q_name, {})

    # Nothing was loaded for this question: record empty results and move on.
    if not data_sheets_qN:
        all_metrics_dfs[q_name] = {}
        all_cleaned_metrics_dfs[q_name] = {}
        continue

    # --- Step 1: merge gender into each selected metric sheet ---
    metrics_dfs_qN = {}
    for sheet_name in selected_metric_sheets:
        if sheet_name not in data_sheets_qN:
            # Keep the key present so later lookups never raise KeyError.
            metrics_dfs_qN[sheet_name] = pd.DataFrame()
            continue

        sheet_df = data_sheets_qN[sheet_name].copy()  # never mutate the raw sheet

        # Some sheets call the ID column 'Participant'; unify the name.
        if 'Participant_ID' not in sheet_df.columns and 'Participant' in sheet_df.columns:
            sheet_df = sheet_df.rename(columns={'Participant': 'Participant_ID'})

        if 'Participant_ID' in sheet_df.columns:
            sheet_df['Participant_ID'] = sheet_df['Participant_ID'].apply(_standardize_participant_id)
            # Left merge: rows without a roster match keep a NaN gender and are
            # removed in step 2.
            sheet_df = sheet_df.merge(participant_df_global, on='Participant_ID', how='left')

        metrics_dfs_qN[sheet_name] = sheet_df

    all_metrics_dfs[q_name] = metrics_dfs_qN

    # Sanity check on the master sheet after the merge.
    master_candidate = metrics_dfs_qN.get(master_sheet_for_balancing_global)
    if master_candidate is not None and not master_candidate.empty:
        # NOTE(review): assigned but not used anywhere in this cell.
        master_df_preview = master_candidate
    else:
        print(f"Warning: Master sheet '{master_sheet_for_balancing_global}' not available for {q_name} preview or is empty.")

    # --- Step 2: drop rows lacking Participant_ID or Gender ---
    cleaned_metrics_dfs_qN = {}
    for sheet_name, merged_df in metrics_dfs_qN.items():
        is_cleanable = (
            merged_df is not None
            and not merged_df.empty
            and all(key_col in merged_df.columns for key_col in ('Participant_ID', 'Gender'))
        )
        if not is_cleanable:
            print(f"Warning: DataFrame for sheet '{sheet_name}' in {q_name} is None, empty, or missing key columns ('Participant_ID', 'Gender') for cleaning.")
            cleaned_metrics_dfs_qN[sheet_name] = pd.DataFrame() # Store empty df to maintain structure
            continue
        cleaned_metrics_dfs_qN[sheet_name] = merged_df.dropna(subset=['Participant_ID', 'Gender'])

    all_cleaned_metrics_dfs[q_name] = cleaned_metrics_dfs_qN


In [6]:
# Build a pie chart of the unique-participant gender split for each question,
# based on the cleaned master sheet.
# NOTE(review): the figure is created but not displayed in this cell, and each
# iteration overwrites fig_qN_orig - confirm whether a .show() is intended.
for q_name in questions_config.keys():
    cleaned_metrics_dfs_qN = all_cleaned_metrics_dfs.get(q_name, {})

    if master_sheet_for_balancing_global not in cleaned_metrics_dfs_qN \
            or cleaned_metrics_dfs_qN[master_sheet_for_balancing_global].empty:
        print(f"Skipping original gender distribution plot for {q_name} as master sheet ('{master_sheet_for_balancing_global}') is empty or missing.")
        continue

    # Count each participant once, however many rows they have in the sheet.
    counts_qN = (
        cleaned_metrics_dfs_qN[master_sheet_for_balancing_global]
        .drop_duplicates(subset=['Participant_ID'])['Gender']
        .value_counts()
    )

    if counts_qN.empty:
        print(f"No gender data to plot for {q_name} after deduplication.")
        continue

    labels_qN = counts_qN.index.tolist()
    values_qN = counts_qN.tolist()

    gender_color_map = {
        'Male': '#1f77b4',
        'Female': '#ff69b4'
    }

    # One colour per label, in label order; unmapped labels fall back to grey.
    colors_pie = []
    for label in labels_qN:
        colors_pie.append(gender_color_map.get(label, '#CCCCCC'))

    fig_qN_orig = px.pie(
        names=labels_qN,
        values=values_qN,
        title=f"Unique Participant Gender Distribution ({q_name} - Original)",
        color_discrete_sequence=colors_pie,
        width=700,
        height=400,
    )


In [7]:
# Original imports in the cell (ensure SMOTE is available)
from imblearn.over_sampling import SMOTE # Make sure this is imported
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
import traceback

# --- Gender balancing of the master sheet with SMOTE --------------------------
# Pass 1 reduces each question's master sheet to one row per participant and
# stores the AOI features (X) and gender labels (y).
# Pass 2 oversamples the minority gender; when that is not possible the
# original rows are kept instead.

# Must exist before pass 1: its "not enough data" branch already writes to it.
temp_unified_resampled_sets = {}


def _original_rows_as_fallback(X, y, q_name):
    """Package un-resampled data in the same layout as a SMOTE result.

    Args:
        X: DataFrame of AOI feature columns, one row per participant.
        y: Gender labels aligned with ``X``.
        q_name: Question label, embedded in the generated participant IDs.

    Returns:
        A copy of ``X`` with a ``Gender`` column and a generated
        ``Participant_ID`` column (``Original_<q_name>_P000``, ...).
    """
    fallback_df = X.copy()
    fallback_df['Gender'] = y
    fallback_df['Participant_ID'] = [f"Original_{q_name}_P{i:03d}" for i in range(len(fallback_df))]
    return fallback_df


# Pass 1: prepare representative data per question.
for q_name, config in questions_config.items():

    current_aoi_columns = config['aoi_columns']
    cleaned_metrics_dfs_qN = all_cleaned_metrics_dfs.get(q_name, {})

    # Reset per-question storage so that re-running the cell starts clean.
    all_y_resampled_gender[q_name] = None
    all_X_repr[q_name] = pd.DataFrame()
    all_y_repr[q_name] = pd.Series(dtype='object')
    all_actual_aoi_cols_in_master[q_name] = []

    # Ensure master_sheet_for_balancing_global exists and is not empty
    if master_sheet_for_balancing_global not in cleaned_metrics_dfs_qN or cleaned_metrics_dfs_qN[master_sheet_for_balancing_global].empty:
        print(f"Master sheet '{master_sheet_for_balancing_global}' not found or empty for {q_name}. Skipping balancing preparation.")
        continue

    df_master_original_qN = cleaned_metrics_dfs_qN[master_sheet_for_balancing_global].copy()
    # Only the configured AOI columns that are really present in the sheet.
    actual_aoi_cols_qN = [col for col in current_aoi_columns if col in df_master_original_qN.columns]
    all_actual_aoi_cols_in_master[q_name] = actual_aoi_cols_qN

    if not actual_aoi_cols_qN:
        print(f"No AOI columns found in master sheet for {q_name}. Skipping balancing preparation.")
        continue

    # One row per participant: the first non-null value of every column.
    df_representative_qN = df_master_original_qN.groupby('Participant_ID', as_index=False).first()

    # Ensure 'Gender' column exists after groupby
    if 'Gender' not in df_representative_qN.columns:
        print(f"'Gender' column missing in representative data for {q_name}. Skipping balancing preparation.")
        continue

    df_representative_for_smote_qN = df_representative_qN[['Gender'] + actual_aoi_cols_qN].copy()

    # SMOTE needs purely numeric features: coerce, then treat missing as 0.
    for col in actual_aoi_cols_qN:
        df_representative_for_smote_qN[col] = pd.to_numeric(df_representative_for_smote_qN[col], errors='coerce').fillna(0)

    df_representative_for_smote_qN = df_representative_for_smote_qN.dropna(subset=['Gender'])
    # NOTE(review): participants whose AOI values were all missing are NOT
    # removed - the zero-fill above already replaced every NaN, so a
    # dropna(how='all') at this point would never drop anything. Confirm
    # whether such participants should be excluded before the zero-fill.

    gender_counts_debug = df_representative_for_smote_qN['Gender'].value_counts()
    if df_representative_for_smote_qN.empty or len(gender_counts_debug) < 2 or gender_counts_debug.min() < 2: # SMOTE needs at least 2 samples in minority
        print(f"Not enough data for SMOTE in {q_name}. Min class count: {gender_counts_debug.min() if not gender_counts_debug.empty else 'N/A'}.")
        all_y_resampled_gender[q_name] = df_representative_for_smote_qN['Gender'] # Store original if SMOTE can't run
        temp_unified_resampled_sets[q_name] = None # Indicate no resampling done
        continue

    all_X_repr[q_name] = df_representative_for_smote_qN[actual_aoi_cols_qN]
    all_y_repr[q_name] = df_representative_for_smote_qN['Gender']


# Pass 2: oversample the minority gender with SMOTE.
for q_name in questions_config.keys():

    X = all_X_repr.get(q_name)
    y = all_y_repr.get(q_name)
    aoi_cols = all_actual_aoi_cols_in_master.get(q_name, [])

    temp_unified_resampled_sets[q_name] = None # Default to no resampled set

    if X is None or X.empty or y is None or y.empty:
        print(f"Skipping {q_name}: Input data (X or y) for SMOTE is missing or empty.")
        # Keep any gender labels pass 1 stored for this question; only fall
        # back to an empty series when nothing was stored.
        if all_y_resampled_gender.get(q_name) is None:
            all_y_resampled_gender[q_name] = pd.Series(dtype='object')
        continue

    # Default to the original labels; replaced below if SMOTE succeeds.
    all_y_resampled_gender[q_name] = y.copy()

    if len(y.unique()) < 2:
        print(f"Skipping {q_name}: Not enough class variety for SMOTE (needs at least 2 classes). Found: {len(y.unique())}")
        temp_unified_resampled_sets[q_name] = _original_rows_as_fallback(X, y, q_name)
        continue

    min_class_count = y.value_counts().min()
    # k_neighbors for SMOTE must be less than the number of samples in the minority class
    k_smote = max(1, min(5, min_class_count - 1))

    if min_class_count <= k_smote: # This check ensures k_neighbors is valid
        print(f"Skipping SMOTE for {q_name}: Minority class size ({min_class_count}) is too small for k_neighbors={k_smote}. Using original data.")
        temp_unified_resampled_sets[q_name] = _original_rows_as_fallback(X, y, q_name)
        continue

    try:
        # Fixed random_state keeps the synthetic samples reproducible.
        smote_sampler = SMOTE(random_state=161223, k_neighbors=k_smote, sampling_strategy='auto')
        X_resampled, y_resampled = smote_sampler.fit_resample(X, y)

        all_y_resampled_gender[q_name] = y_resampled # Store the resampled gender series

        df_resampled = pd.DataFrame(X_resampled, columns=aoi_cols)
        df_resampled['Gender'] = y_resampled
        # Create new participant IDs for the resampled data
        df_resampled['Participant_ID'] = [f"Balanced_{q_name}_P{i:03d}" for i in range(len(df_resampled))]

        temp_unified_resampled_sets[q_name] = df_resampled

    except Exception as e:
        # Best effort: a failure for one question falls back to its original rows.
        print(f"SMOTE failed for {q_name}. Using original data. Error: {e}")
        traceback.print_exc()
        temp_unified_resampled_sets[q_name] = _original_rows_as_fallback(X, y, q_name)
        # Ensure all_y_resampled_gender still holds the original y
        all_y_resampled_gender[q_name] = y.copy()

In [8]:
def _gender_mean_per_aoi(sheet_df, aoi_cols, gender):
    """Return ``{aoi_col: value}`` for one gender in one metric sheet.

    The value is the mean of that gender's numeric entries in the column. If
    the gender has no numeric entry, the mean over the whole sheet is used, and
    0 if the sheet has none either.
    """
    gender_rows = sheet_df[sheet_df['Gender'] == gender]
    means = {}
    for aoi_col in aoi_cols:
        group_values = pd.to_numeric(gender_rows[aoi_col], errors='coerce')
        if group_values.notna().any():
            means[aoi_col] = group_values.mean()
            continue
        overall_values = pd.to_numeric(sheet_df[aoi_col], errors='coerce')
        means[aoi_col] = overall_values.mean() if overall_values.notna().any() else 0
    return means


# Rebuild each selected metric sheet on top of the balanced (or fallback)
# participant set held in temp_unified_resampled_sets.
# - Master sheet: AOI values come straight from the balanced set.
# - Other sheets: every participant gets the mean of their gender group.
# NOTE(review): on the non-master sheets all participants of one gender
# therefore share identical values - confirm this is acceptable for the
# analyses that consume these sheets.
for q_name, config in questions_config.items():
    current_aoi_columns_from_config = config['aoi_columns']
    cleaned_metrics_dfs_qN = all_cleaned_metrics_dfs.get(q_name, {})
    unified_resampled_master_set_qN = temp_unified_resampled_sets.get(q_name) # Data from SMOTE/fallback for the master sheet

    current_q_reconstructed_dfs = {}

    if unified_resampled_master_set_qN is not None and not unified_resampled_master_set_qN.empty:
        is_smote_applied = any(pid.startswith("Balanced_") for pid in unified_resampled_master_set_qN['Participant_ID'])
        status_msg = "SMOTE-balanced data" if is_smote_applied else "original representative data (SMOTE likely skipped/failed)"

        for sheet_name in selected_metric_sheets:
            df_orig_specific_sheet = cleaned_metrics_dfs_qN.get(sheet_name)

            if df_orig_specific_sheet is None or df_orig_specific_sheet.empty:
                current_q_reconstructed_dfs[sheet_name] = pd.DataFrame()
                print(f"  Original sheet '{sheet_name}' for {q_name} empty/missing. Created empty reconstructed sheet.")
                continue

            # AOI columns relevant to the current sheet AND defined in the question's config
            sheet_aoi_cols = [
                col for col in df_orig_specific_sheet.columns
                if col in current_aoi_columns_from_config and col not in ['Participant_ID', 'Gender']
            ]

            # Reuse the sheet's first column name for the descriptive label,
            # unless that name is an ID, gender or AOI column: writing the label
            # there would silently overwrite real data in the rebuilt rows.
            default_desc_col_name = f"Desc_{sheet_name.replace(' ', '_')}"
            reserved_col_names = {'Participant_ID', 'Gender', *sheet_aoi_cols}
            leading_col_name = df_orig_specific_sheet.columns[0] if len(df_orig_specific_sheet.columns) > 0 else default_desc_col_name
            first_desc_col_name = default_desc_col_name if leading_col_name in reserved_col_names else leading_col_name

            # Gender-level means depend only on (sheet, gender), so compute them
            # once per gender instead of once per reconstructed row.
            gender_means_cache = {}

            reconstructed_rows = []
            for _, master_row in unified_resampled_master_set_qN.iterrows():
                pid, gender = master_row['Participant_ID'], master_row['Gender']
                new_row = {
                    'Participant_ID': pid,
                    'Gender': gender,
                    first_desc_col_name: f"{sheet_name.replace(' ', '_')}_{pid}"
                }

                if sheet_name == master_sheet_for_balancing_global:
                    for aoi_col in sheet_aoi_cols: # AOIs for master sheet
                        new_row[aoi_col] = master_row.get(aoi_col, 0) # Use values from SMOTE'd master
                else: # For other metric sheets
                    if gender not in gender_means_cache:
                        gender_means_cache[gender] = _gender_mean_per_aoi(df_orig_specific_sheet, sheet_aoi_cols, gender)
                    new_row.update(gender_means_cache[gender])
                reconstructed_rows.append(new_row)

            df_reconstructed = pd.DataFrame(reconstructed_rows)
            # Descriptive column first, then IDs, then AOIs in config order.
            final_cols_order = [first_desc_col_name, 'Participant_ID', 'Gender'] + \
                               [col for col in current_aoi_columns_from_config if col in sheet_aoi_cols] # Use only AOIs actually processed

            # Reorder and select only existing columns to prevent KeyError
            df_reconstructed = df_reconstructed[[col for col in final_cols_order if col in df_reconstructed.columns]] if not df_reconstructed.empty else pd.DataFrame(columns=final_cols_order)

            current_q_reconstructed_dfs[sheet_name] = df_reconstructed

        all_balanced_unified_dfs[q_name] = current_q_reconstructed_dfs
    else:
        print(f"No balanced/representative master data for {q_name}. Using original cleaned data for all sheets as fallback.")
        # Ensure only selected_metric_sheets are copied
        all_balanced_unified_dfs[q_name] = {
            sheet_name: df.copy() for sheet_name, df in cleaned_metrics_dfs_qN.items()
            if sheet_name in selected_metric_sheets
        }
        # Ensure y_resampled_gender reflects original if it wasn't set during SMOTE phase
        if all_y_resampled_gender.get(q_name) is None and all_y_repr.get(q_name) is not None and not all_y_repr[q_name].empty:
             all_y_resampled_gender[q_name] = all_y_repr[q_name].copy()
        elif all_y_resampled_gender.get(q_name) is None and cleaned_metrics_dfs_qN.get(master_sheet_for_balancing_global) is not None and \
             not cleaned_metrics_dfs_qN[master_sheet_for_balancing_global].empty:
             master_df_fb = cleaned_metrics_dfs_qN[master_sheet_for_balancing_global]
             if 'Gender' in master_df_fb.columns:
                 # Ensure unique participants if taking from original master sheet
                 all_y_resampled_gender[q_name] = master_df_fb.drop_duplicates('Participant_ID')['Gender'].copy()

In [9]:
def get_image_type(aoi_name_str):
    """Classify an AOI label by its trailing letter.

    Args:
        aoi_name_str: AOI column name, e.g. ``'1 Eyes A'`` or ``'4 Mushroom A2'``.

    Returns:
        ``'Real'`` when the label ends in `` A`` (optionally followed by
        digits), ``'AI'`` when it ends in `` B`` (optionally followed by
        digits), and ``'Unknown'`` otherwise, including for non-string input.
    """
    if isinstance(aoi_name_str, str):
        stripped_aoi = aoi_name_str.strip()
        # Regex to match AOIs ending with " A" or " A" + digits (e.g., " A1")
        if re.search(r'\sA\d*$', stripped_aoi):
            return 'Real'
        # Regex to match AOIs ending with " B" or " B" + digits (e.g., " B1")
        elif re.search(r'\sB\d*$', stripped_aoi):
            return 'AI'
    return 'Unknown'


# For every question: melt each reconstructed metric sheet to long format
# (one row per participant x AOI), outer-merge the metrics side by side, and
# tag each AOI with its image type. Result goes to all_merged_long_dfs[q_name].
for q_name, config in questions_config.items():
    current_aoi_columns_from_config = config['aoi_columns']
    reconstructed_dfs_for_qN = all_balanced_unified_dfs.get(q_name, {})

    all_merged_long_dfs[q_name] = pd.DataFrame() # Initialize for the current question

    if not reconstructed_dfs_for_qN:
        print(f"No reconstructed/balanced dataframes found for {q_name} to transform. Skipping.")
        continue

    id_vars_for_melt = ['Participant_ID', 'Gender']

    list_of_long_dfs_for_qN = []
    for sheet_name in selected_metric_sheets: # Iterate through the four selected metric sheets
        df_reconstructed_sheet = reconstructed_dfs_for_qN.get(sheet_name)

        if df_reconstructed_sheet is None or df_reconstructed_sheet.empty or \
           not all(v in df_reconstructed_sheet.columns for v in id_vars_for_melt):
            print(f"  DataFrame for sheet '{sheet_name}' in {q_name} is empty or lacks key columns for reconstruction/melting.")
            continue

        # Only configured AOI columns are melted; the sheet's descriptive first
        # column is excluded because it is not in the question's AOI list.
        aoi_cols_for_melt_in_sheet = [
            col for col in df_reconstructed_sheet.columns
            if col in current_aoi_columns_from_config and col not in id_vars_for_melt
        ]

        if not aoi_cols_for_melt_in_sheet:
            # A metric without AOI columns contributes nothing for this question.
            print(f"  Warning: No valid AOI columns for melting in sheet '{sheet_name}' for {q_name}.")
            continue

        try:
            df_long_sheet = df_reconstructed_sheet.melt(
                id_vars=id_vars_for_melt,
                value_vars=aoi_cols_for_melt_in_sheet,
                var_name='AOI',
                value_name=sheet_name  # Use the metric sheet name as the value column name
            )
            list_of_long_dfs_for_qN.append(df_long_sheet)
        except Exception as e:
            print(f"  Error melting sheet '{sheet_name}' for {q_name}: {e}")
            print(f"    Columns in df_reconstructed_sheet: {df_reconstructed_sheet.columns.tolist()}")
            print(f"    id_vars_for_melt: {id_vars_for_melt}")
            print(f"    aoi_cols_for_melt_in_sheet: {aoi_cols_for_melt_in_sheet}")
            traceback.print_exc()

    if list_of_long_dfs_for_qN:
        try:
            # Merge all long dataframes for the current question using 'Participant_ID', 'Gender', and 'AOI' as keys
            merged_long_df_qN = reduce(
                lambda left, right: pd.merge(left, right, on=['Participant_ID', 'Gender', 'AOI'], how='outer'),
                list_of_long_dfs_for_qN
            )

            merged_long_df_qN['Image_Type'] = merged_long_df_qN['AOI'].apply(get_image_type)

            # Keys first, then whichever metric columns were actually produced.
            final_columns_order = ['Participant_ID', 'Gender', 'AOI', 'Image_Type'] + selected_metric_sheets
            existing_final_columns = [col for col in final_columns_order if col in merged_long_df_qN.columns]
            merged_long_df_qN = merged_long_df_qN[existing_final_columns]

            all_merged_long_dfs[q_name] = merged_long_df_qN # Store the final merged df for this question

        except Exception as e:
            print(f"Error during final merge or Image_Type addition for {q_name}: {e}")
            traceback.print_exc()
    else:
        print(f"No long format DataFrames were created to merge for {q_name}.")

In [10]:


# Collect every non-empty per-question long table, tagged with its question.
all_questions_long_data = []

for q_name, long_df in all_merged_long_dfs.items():
    if long_df is None or long_df.empty:
        print(f"Warning: Long-format DataFrame for {q_name} is empty or None. Skipping.")
        continue
    # assign() returns a new frame, so the stored per-question table is untouched.
    all_questions_long_data.append(long_df.assign(Question=q_name))

if not all_questions_long_data:
    print("No data was available to combine.")
    final_combined_long_df = pd.DataFrame() # Create an empty DataFrame if nothing to combine
else:
    # Stack all questions into one table.
    final_combined_long_df = pd.concat(all_questions_long_data, ignore_index=True)

    # 'Question' leads, then the keys, then the metric columns that exist.
    key_cols = ['Question', 'Participant_ID', 'Gender', 'AOI', 'Image_Type']
    metric_cols_present = [col for col in selected_metric_sheets if col in final_combined_long_df.columns]
    ordered_cols = [col for col in key_cols + metric_cols_present if col in final_combined_long_df.columns]
    final_combined_long_df = final_combined_long_df[ordered_cols]

In [11]:
import pandas as pd
from scipy.stats import shapiro, ttest_ind, mannwhitneyu
# Normality screen per (question, metric): Shapiro-Wilk on the male and female
# values pooled over all AOIs, used to decide between a t-test and a
# Mann-Whitney U test.
# NOTE(review): pooling over AOIs puts several values from the same participant
# into one sample - confirm this is acceptable for the tests applied.
normality_info = []

for q in questions_config:
    aoi_cols = all_actual_aoi_cols_in_master.get(q, [])

    for metric in selected_metric_sheets:
        all_male = []
        all_female = []

        # .get(q, {}) so that a question with no cleaned data is reported as
        # 'Insufficient data' instead of raising KeyError.
        df = all_cleaned_metrics_dfs.get(q, {}).get(metric)

        if df is not None and not df.empty:
            for aoi in aoi_cols:
                if aoi in df.columns:
                    # Coerce to numeric so stray text cells become NaN and are
                    # dropped rather than reaching the statistical test.
                    male_vals = pd.to_numeric(df.loc[df['Gender'] == 'Male', aoi], errors='coerce').dropna().tolist()
                    female_vals = pd.to_numeric(df.loc[df['Gender'] == 'Female', aoi], errors='coerce').dropna().tolist()
                    all_male.extend(male_vals)
                    all_female.extend(female_vals)

        if len(all_male) >= 3 and len(all_female) >= 3:
            # A constant sample is not passed to Shapiro; it is treated as
            # passing the normality screen (p = 1.0).
            p_male = shapiro(all_male).pvalue if len(set(all_male)) > 1 else 1.0
            p_female = shapiro(all_female).pvalue if len(set(all_female)) > 1 else 1.0

            is_normal_m = p_male > 0.05
            is_normal_f = p_female > 0.05

            test_used = 't-test' if is_normal_m and is_normal_f else 'Mann–Whitney U'
        else:
            is_normal_m = is_normal_f = None
            test_used = 'Insufficient data'

        normality_info.append({
            'Question': q,
            'Metric': metric,
            'Male Normal?': 'Yes' if is_normal_m else 'No' if is_normal_m is not None else 'N/A',
            'Female Normal?': 'Yes' if is_normal_f else 'No' if is_normal_f is not None else 'N/A',
            'Test Used': test_used
        })

normality_df = pd.DataFrame(normality_info)

In [12]:
# Gender comparison per (question, metric): male vs female values pooled over
# all AOIs. Both groups passing Shapiro-Wilk -> independent t-test (Student or
# Welch, chosen by Levene's test); otherwise Mann-Whitney U.
# Produces summary_df (p-values) and summary_df2 (significance flags).
from scipy.stats import levene, mannwhitneyu, shapiro, ttest_ind

pval_rows_data = []

for q in questions_config:
    p_values_for_row = {}
    significance_flags_for_row = {}

    aoi_cols = all_actual_aoi_cols_in_master.get(q, [])

    for metric in selected_metric_sheets:
        all_male = []
        all_female = []
        df = all_cleaned_metrics_dfs.get(q, {}).get(metric)

        if df is not None and not df.empty:
            for aoi in aoi_cols:
                if aoi in df.columns:
                    # Coerce to numeric so stray text cells become NaN and are
                    # dropped rather than reaching the statistical test.
                    male_vals = pd.to_numeric(df.loc[df['Gender'] == 'Male', aoi], errors='coerce').dropna().tolist()
                    female_vals = pd.to_numeric(df.loc[df['Gender'] == 'Female', aoi], errors='coerce').dropna().tolist()
                    all_male.extend(male_vals)
                    all_female.extend(female_vals)

        p_val = None  # stays None when either group has fewer than 3 values
        if len(all_male) >= 3 and len(all_female) >= 3:
            # A constant sample is treated as passing the normality screen.
            p_m = shapiro(all_male).pvalue if len(set(all_male)) > 1 else 1.0
            p_f = shapiro(all_female).pvalue if len(set(all_female)) > 1 else 1.0
            if p_m > 0.05 and p_f > 0.05:
                # Levene's test decides between the pooled-variance t-test
                # (variances look equal) and Welch's t-test (they do not).
                _, p_lev = levene(all_male, all_female)
                _, p_val = ttest_ind(all_male, all_female, equal_var=(p_lev > 0.05))
            else:
                _, p_val = mannwhitneyu(all_male, all_female, alternative='two-sided', nan_policy='propagate')

        p_values_for_row[metric] = p_val
        significance_flags_for_row[metric] = p_val < 0.05 if p_val is not None else 'N/A'

    # Construct the final row dictionary
    current_row_output = {'Question': q}
    for metric_key in selected_metric_sheets:
        current_row_output[f'{metric_key} (p)'] = p_values_for_row.get(metric_key)
    for metric_key in selected_metric_sheets:
        current_row_output[f'{metric_key} (Sig)'] = significance_flags_for_row.get(metric_key)

    # Overall flag: True if any metric is significant, False if at least one
    # metric was testable and none is significant, 'N/A' if none was testable.
    current_row_significances = list(significance_flags_for_row.values())

    if True in current_row_significances:
        overall_sig_value = True
    elif False in current_row_significances:
        overall_sig_value = False
    else:
        overall_sig_value = 'N/A'

    current_row_output['Overall Significant'] = overall_sig_value

    pval_rows_data.append(current_row_output)


# Two views of the same rows: p-values and significance flags.
column_order = ['Question']
for metric_col_name in selected_metric_sheets:
    column_order.append(f'{metric_col_name} (p)')
column_order.append('Overall Significant')

column_order2=['Question']
for metric_col_name in selected_metric_sheets:
    column_order2.append(f'{metric_col_name} (Sig)')
column_order2.append('Overall Significant')

summary_df = pd.DataFrame(pval_rows_data)
summary_df = summary_df.reindex(columns=column_order) # Use reindex for robust column ordering

summary_df2 = pd.DataFrame(pval_rows_data)
summary_df2 = summary_df2.reindex(columns=column_order2) # Use reindex for robust column ordering

In [13]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, clear_output
import numpy as np
import re
from functools import reduce

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, HBox, VBox, Layout, AppLayout, Button, GridspecLayout


# Consistent color palettes, keyed by the category labels used in the data.
# NOTE(review): the un-suffixed / `_viz` palettes are presumably meant for the
# matplotlib/seaborn plots and the `_plotly` ones for Plotly figures - confirm.
gender_palette = {'Male': '#1f77b4', 'Female': '#ff69b4', 'Other': 'grey'}
image_type_palette_viz = {'Real': 'skyblue', 'AI': 'salmon', 'Unknown': 'lightgrey'}
gender_palette_plotly = {'Male': 'blue', 'Female': 'magenta', 'Other': 'grey'}
image_type_palette_plotly = {'Real': 'cornflowerblue', 'AI': 'tomato', 'Unknown': 'lightgrey'}


# Helper for x-tick labels (if still needed for bar plots per AOI)
def format_aoi_tick_labels(ax, aoi_to_imagetype_map):
    """Relabel the x ticks of ``ax`` with the AOI name plus its image type.

    Each existing tick label is read as an AOI name and replaced by a two-line
    label: the AOI name, then the image type in parentheses.

    Args:
        ax: Axes whose current x tick labels hold AOI names.
        aoi_to_imagetype_map: Mapping from AOI name to image-type label. AOIs
            missing from the mapping are labelled ``'Unknown'``.

    Returns:
        None. ``ax`` and the figure that owns it are modified in place.
    """
    relabelled = []
    for tick in ax.get_xticklabels():
        aoi_name = tick.get_text()
        image_type = aoi_to_imagetype_map.get(aoi_name, 'Unknown')
        relabelled.append(f"{aoi_name}\n({image_type})")
    ax.set_xticklabels(relabelled, rotation=45, ha='right', fontsize=9)
    # Make room for the rotated two-line labels on the figure that owns `ax`.
    # Going through pyplot's "current figure" would adjust the wrong figure
    # whenever `ax` does not belong to the most recently active one.
    ax.figure.subplots_adjust(bottom=0.3)

In [14]:
# --- Summary Statistics Tables ---

metric_cols_for_summary = ['Tot Fixation dur', 'Fixation count', 'Time to first Fixation', 'Tot Visit dur']

# Per-question results, kept so that every question's tables remain available
# after the loop (the loop variables themselves only hold the last question).
summary_detailed_by_question = {}
summary_simple_by_question = {}

for q_name in questions_config.keys():
    merged_long_df_qN = all_merged_long_dfs.get(q_name)

    if merged_long_df_qN is None or merged_long_df_qN.empty:
        print(f"Skipping summary table for {q_name} as data is empty.")
        continue

    valid_metric_cols = [col for col in metric_cols_for_summary if col in merged_long_df_qN.columns]
    if not valid_metric_cols:
        print(f"No valid metric columns for summary table in {q_name}. Skipping.")
        continue

    # Detailed summary: mean/std/count of every metric per Gender x Image_Type x AOI.
    # The group keys are deliberately kept in the index while the (metric, statistic)
    # column tuples are flattened; combining as_index=False with a list of aggregations
    # would put the keys into the column MultiIndex, so flattening would rename them to
    # 'Gender_' etc. and the reset_index() below would add a stray 'index' column.
    summary_df_qN_detailed = merged_long_df_qN.groupby(['Gender', 'Image_Type', 'AOI'])[valid_metric_cols].agg(['mean', 'std', 'count'])

    # Flatten ('Tot Fixation dur', 'mean') -> 'Tot Fixation dur_mean'
    summary_df_qN_detailed.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in summary_df_qN_detailed.columns.values]
    summary_df_qN_detailed = summary_df_qN_detailed.reset_index()  # Bring Gender, Image_Type, AOI back as columns
    summary_detailed_by_question[q_name] = summary_df_qN_detailed

    # Simpler summary (mean only) by Gender and Image_Type (across AOIs for that question)
    summary_df_qN_simple = merged_long_df_qN.groupby(['Gender', 'Image_Type'], as_index=False)[valid_metric_cols].mean()
    rename_dict_simple = {col: f'Avg_{col}' for col in valid_metric_cols}
    summary_df_qN_simple = summary_df_qN_simple.rename(columns=rename_dict_simple)
    summary_simple_by_question[q_name] = summary_df_qN_simple

In [18]:
# Eye-Tracking Data Dashboard - Optimized for Voila
# Modern, responsive design with enhanced UI/UX

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from ipywidgets import interact, interactive, HBox, VBox, Layout, HTML, Tab
from IPython.display import display, clear_output
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# deprecation notices included - consider narrowing the filter.
warnings.filterwarnings('ignore')

# --- STYLING AND CONFIG ---
# Global matplotlib style and seaborn palette for plots drawn after this point.
plt.style.use('dark_background')
sns.set_palette("husl")
# Hex colors reused by the HTML helpers and the Plotly figures in this cell.
MODERN_COLORS = {
    'primary': '#00D4FF', 'secondary': '#FF6B6B', 'accent': '#4ECDC4', 'dark': '#1A1A2E',
    'light': '#16213E', 'success': '#00F5A0', 'warning': '#FFD93D', 'text': '#FFFFFF'
}
# Rebinds the `gender_palette` defined in an earlier cell. The long and the
# one-letter gender labels map to the same two colors; 'Other' has no entry here.
gender_palette = {'Male': MODERN_COLORS['primary'], 'Female': MODERN_COLORS['secondary'], 
                 'M': MODERN_COLORS['primary'], 'F': MODERN_COLORS['secondary']}
# widget_style is passed to the metric dropdowns built further down.
widget_style = {'description_width': 'auto'}
widget_layout = Layout(width='auto')

# Stylesheet for the dropdown widgets. initialize_dashboard() displays it once
# via HTML(dropdown_style); the `.main-question-dropdown` class is attached to
# the question selector there.
dropdown_style = """
<style>
/* Style for the visible, un-clicked dropdown box */
.widget-dropdown select {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white; /* Color of the selected item's text */
    border: 2px solid #00D4FF;
    border-radius: 8px;
    padding: 8px 12px;
    font-weight: 500;
    min-height: 40px; /* Ensure consistent height */
}
/* Style for the dropdown's label (e.g., "Select Metric:") */
.widget-dropdown .widget-label {
    color: #00D4FF;
    font-weight: 600;
    font-size: 14px;
    margin-right: 10px;
}
/* FIX: Style for the OPTIONS that appear when you click the dropdown */
.widget-dropdown select option {
    background-color: white;
    color: #1A1A2E; /* A dark color for visible text */
    padding: 5px;
}

/* Fix to bring the main dropdown to the front of other elements */
.main-question-dropdown {
    position: relative; 
    z-index: 999;
}
</style>
"""

# --- HELPER FUNCTIONS for UI elements ---
# Each helper returns an HTML widget built from one inline-styled markup string.
def create_dashboard_header():
    """Build the gradient title banner shown at the top of the dashboard.

    Returns:
        An ``HTML`` object wrapping the banner markup (title, subtitle and a
        status pill whose dot uses ``MODERN_COLORS["success"]``).
    """
    banner_markup = f"""<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-radius: 15px; margin-bottom: 25px; box-shadow: 0 10px 30px rgba(0, 0, 0, 0.3); text-align: center;'><h1 style='color: white; font-size: 2.5em; margin: 0 0 10px 0; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);'>🧠 Eye-Tracking Analytics Dashboard</h1><p style='color: rgba(255,255,255,0.9); font-size: 1.2em; margin: 0; font-weight: 300;'>Advanced Visual Analytics & Data Exploration Platform</p><div style='margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.1); border-radius: 25px; display: inline-block;'><span style='color: {MODERN_COLORS["success"]}; font-weight: 600;'>●</span><span style='color: white; margin-left: 8px;'>Real-time Interactive Visualization</span></div></div>"""
    return HTML(banner_markup)
def create_section_divider(title, icon="📊"):
    """Build the titled divider bar that introduces a dashboard section.

    Args:
        title: Heading text; inserted into the markup as-is.
        icon: Emoji or short text placed in front of the title.

    Returns:
        An ``HTML`` object wrapping the divider markup.
    """
    divider_markup = f"""<div style='margin: 30px 0 20px 0; padding: 15px; background: linear-gradient(90deg, rgba(0,212,255,0.1) 0%, rgba(102,126,234,0.1) 100%); border-left: 4px solid {MODERN_COLORS["primary"]}; border-radius: 8px;'><h3 style='color: {MODERN_COLORS["primary"]}; margin: 0; font-size: 1.4em; font-weight: 600;'>{icon} {title}</h3></div>"""
    return HTML(divider_markup)
def create_loading_widget():
    """Build the spinner placeholder shown while the sections are generated.

    Returns:
        An ``HTML`` object containing the spinner, its caption and the CSS
        keyframes that drive the rotation.
    """
    spinner_markup = f"""<div style='text-align: center; padding: 40px; background: linear-gradient(135deg, rgba(0,212,255,0.1) 0%, rgba(255,107,107,0.1) 100%); border-radius: 12px; margin: 20px 0;'><div style='display: inline-block; width: 40px; height: 40px; border: 4px solid rgba(0,212,255,0.3); border-radius: 50%; border-top-color: {MODERN_COLORS["primary"]}; animation: spin 1s ease-in-out infinite;'></div><p style='color: {MODERN_COLORS["primary"]}; margin-top: 15px; font-weight: 500;'>Generating Visualizations...</p></div><style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>"""
    return HTML(spinner_markup)
def create_error_display(message):
    """Build the red "No Data Available" notice box.

    Args:
        message: Explanation shown under the heading; inserted into the
            markup as-is.

    Returns:
        An ``HTML`` object wrapping the notice markup.
    """
    notice_markup = f"""<div style='background: linear-gradient(135deg, rgba(255,107,107,0.1) 0%, rgba(255,107,107,0.2) 100%); border: 2px solid {MODERN_COLORS["secondary"]}; border-radius: 12px; padding: 20px; margin: 20px 0; text-align: center;'><h4 style='color: {MODERN_COLORS["secondary"]}; margin: 0 0 10px 0;'>⚠️ No Data Available</h4><p style='color: white; margin: 0;'>{message}</p></div>"""
    return HTML(notice_markup)

# Enhanced plotting functions with modern styling
def create_modern_bar_plot(data, metric, agg_func, plot_title_suffix):
    """Grouped bar chart of one metric per AOI, colored by gender.

    Used for a single question (as opposed to the "All Combined" view).

    Args:
        data: Long-format frame with 'Gender', 'AOI', 'Image_Type' and ``metric``.
        metric: Name of the column to aggregate and plot.
        agg_func: Aggregation name given as a string (e.g. 'mean', 'sum'); it
            is also capitalized into the title.
        plot_title_suffix: Text appended to the chart title.

    Returns:
        The styled Plotly figure.
    """
    grouped = data.groupby(['Gender', 'AOI', 'Image_Type'], as_index=False)
    per_aoi = grouped.agg({metric: agg_func})
    per_aoi = per_aoi.reset_index(drop=True)
    per_aoi = per_aoi.sort_values(by=['AOI', 'Gender'])

    chart_title = f'{metric} ({agg_func.capitalize()}) per AOI by Gender {plot_title_suffix}'
    bar_fig = px.bar(
        per_aoi,
        x='AOI',
        y=metric,
        color='Gender',
        color_discrete_map=gender_palette,
        title=chart_title,
        height=500,
        barmode='group',
    )

    # Dark theme shared by the dashboard's Plotly charts.
    bar_fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0.8)',
        paper_bgcolor='rgba(0,0,0,0.8)',
        font_color='white',
        title_font_size=18,
        title_x=0.5,
        xaxis=dict(gridcolor='rgba(255,255,255,0.1)'),
        yaxis=dict(gridcolor='rgba(255,255,255,0.1)'),
        legend=dict(bgcolor='rgba(0,0,0,0.5)', bordercolor='rgba(255,255,255,0.2)', borderwidth=1),
    )
    return bar_fig

def create_modern_scatter_plot(data, fixation_dur_col, fixation_count_col, plot_title_suffix):
    """Scatter one metric against another with a single overall OLS trendline.

    Points are colored by 'Gender' and use a marker symbol per 'Image_Type'.

    Args:
        data: Frame containing both metric columns plus 'Gender' and 'Image_Type'.
        fixation_dur_col: Column plotted on the x axis.
        fixation_count_col: Column plotted on the y axis.
        plot_title_suffix: Text appended to the chart title.

    Returns:
        The styled Plotly figure, or ``None`` when no row has both metrics.
    """
    valid_data = data.dropna(subset=[fixation_dur_col, fixation_count_col])
    if valid_data.empty:
        return None

    # Only ask for hover columns that exist, so a frame without participant
    # or AOI information can still be plotted.
    hover_columns = [col for col in ('Participant_ID', 'AOI') if col in valid_data.columns]

    fig = px.scatter(
        valid_data, x=fixation_dur_col, y=fixation_count_col,
        color='Gender', symbol='Image_Type',
        title=f'Interactive Scatter: {fixation_count_col} vs {fixation_dur_col} {plot_title_suffix}',
        hover_data=hover_columns or None,
        color_discrete_map=gender_palette, height=600,
    )

    # The trendline figure holds a marker trace AND the fitted line. Adding all
    # of its traces would draw every point a second time as a translucent white
    # overlay on top of the gender colors, so only the line trace is kept.
    trend_fig = px.scatter(
        valid_data, x=fixation_dur_col, y=fixation_count_col,
        trendline="ols", color_discrete_sequence=['rgba(255,255,255,0.5)'],
    )
    fig.add_traces([trace for trace in trend_fig.data if trace.mode == 'lines'])

    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0.8)', paper_bgcolor='rgba(0,0,0,0.8)',
        font_color='white', title_font_size=18, title_x=0.5,
        xaxis=dict(gridcolor='rgba(255,255,255,0.1)'),
        yaxis=dict(gridcolor='rgba(255,255,255,0.1)'),
        legend=dict(bgcolor='rgba(0,0,0,0.5)', bordercolor='rgba(255,255,255,0.2)', borderwidth=1),
    )
    return fig

def create_combined_bar_plot(data, metric, agg_func, plot_title_suffix):
    """Grouped bar chart of one metric per image type, colored by gender.

    Used for the "All Combined" view, where bars are grouped by image type
    rather than by AOI.

    Args:
        data: Long-format frame with 'Image_Type', 'Gender' and ``metric``.
        metric: Name of the column to aggregate and plot.
        agg_func: Aggregation name given as a string; also capitalized into
            the title.
        plot_title_suffix: Text appended to the chart title.

    Returns:
        The styled Plotly figure.
    """
    by_type_and_gender = data.groupby(['Image_Type', 'Gender'], as_index=False)
    aggregated = by_type_and_gender.agg({metric: agg_func}).reset_index(drop=True)

    chart_title = f'{metric} ({agg_func.capitalize()}) by Image Type {plot_title_suffix}'
    bar_fig = px.bar(
        aggregated,
        x='Image_Type',
        y=metric,
        color='Gender',
        color_discrete_map=gender_palette,
        title=chart_title,
        height=500,
        barmode='group',
    )

    bar_fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0.8)',
        paper_bgcolor='rgba(0,0,0,0.8)',
        font_color='white',
        title_font_size=18,
        title_x=0.5,
        xaxis=dict(title_text='Image Type', gridcolor='rgba(255,255,255,0.1)'),
        yaxis=dict(gridcolor='rgba(255,255,255,0.1)'),
        legend=dict(bgcolor='rgba(0,0,0,0.5)', bordercolor='rgba(255,255,255,0.2)', borderwidth=1),
    )
    return bar_fig

def _create_4_panel_dashboard(data, selected_metric, plot_title_suffix):
    """(Helper) Build the 2x2 overview figure for one metric.

    Panels: box plot by image type and gender (top-left), split violin plot
    (top-right), per-gender histogram (bottom-left) and a summary-statistics
    table (bottom-right). Each panel falls back to an ungrouped version when
    the 'Gender' / 'Image_Type' columns it needs are missing.

    Args:
        data: Frame containing ``selected_metric``; 'Gender' and 'Image_Type'
            are optional.
        selected_metric: Name of the column to visualize.
        plot_title_suffix: Text appended to the figure title.

    Returns:
        The assembled Plotly figure.
    """
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(f'{selected_metric} by Image Type & Gender', f'{selected_metric} Violin Plot', f'"{selected_metric}" Distribution by Gender', 'Summary Statistics'),
        specs=[[{"type": "box"}, {"type": "violin"}], [{"type": "histogram"}, {"type": "table"}]]
    )

    has_gender = 'Gender' in data.columns
    has_groups = has_gender and 'Image_Type' in data.columns
    # One list of gender labels drives all three plot panels, so they always
    # show the same groups.
    genders = sorted([g for g in data['Gender'].unique() if pd.notna(g)]) if has_gender else []

    # --- Panel 1: box plot ---
    if has_groups:
        # Grouped by Image Type, colored by Gender
        for gender in genders:
            subset = data[data['Gender'] == gender]
            fig.add_trace(go.Box(
                y=subset[selected_metric], x=subset['Image_Type'], name=gender,
                marker_color=gender_palette.get(gender), legendgroup=gender, showlegend=True, boxpoints='outliers'
            ), row=1, col=1)
        fig.update_layout(boxmode='group', xaxis1_title='Image Type')
    else:
        # Fallback: a single box plot for the entire metric
        fig.add_trace(go.Box(y=data[selected_metric], name=selected_metric, showlegend=False), row=1, col=1)

    # --- Panel 2: violin plot ---
    if has_groups:
        # Half-violins: male labels on the left, female labels on the right.
        # The one-letter codes are included because gender_palette supports
        # them too; any other label is drawn as a full violin so that no group
        # present in the other panels silently disappears here.
        violin_sides = {'Male': 'negative', 'M': 'negative', 'Female': 'positive', 'F': 'positive'}
        violin_genders = [g for g in violin_sides if g in genders] + [g for g in genders if g not in violin_sides]
        image_types = sorted([it for it in data['Image_Type'].unique() if pd.notna(it)])
        for img_type in image_types:
            for gender in violin_genders:
                subset = data[(data['Image_Type'] == img_type) & (data['Gender'] == gender)]
                if not subset.empty:
                    fig.add_trace(go.Violin(
                        y=subset[selected_metric], name=gender, x0=str(img_type),
                        side=violin_sides.get(gender, 'both'),
                        marker_color=gender_palette.get(gender), points=False,
                        legendgroup=gender, showlegend=False
                    ), row=1, col=2)
        fig.update_traces(meanline_visible=True, row=1, col=2)
        fig.update_layout(violinmode='overlay', xaxis2_title='Image Type')
    else:
        # Fallback: a single violin plot
        fig.add_trace(go.Violin(y=data[selected_metric], name=selected_metric, showlegend=False), row=1, col=2)

    # --- Panel 3: histogram ---
    if has_gender:
        # Overlapping histograms by gender
        for gender in genders:
            subset = data[data['Gender'] == gender]
            fig.add_trace(go.Histogram(
                x=subset[selected_metric], name=gender, marker_color=gender_palette.get(gender),
                legendgroup=gender, showlegend=False, opacity=0.7, nbinsx=25
            ), row=2, col=1)
        fig.update_layout(barmode='overlay')
    else:
        # Fallback: a single histogram
        fig.add_trace(go.Histogram(x=data[selected_metric], name='Distribution', showlegend=False), row=2, col=1)

    # --- Panel 4: summary table ---
    # Deliberately best-effort: if the grouped statistics cannot be computed
    # (e.g. a grouping column is missing), show plain describe() output instead.
    try:
        summary_stats = data.groupby(['Image_Type', 'Gender'])[selected_metric].agg(['count', 'mean', 'std', 'min', 'max']).round(2).reset_index()
        fig.add_trace(go.Table(
            header=dict(
                values=[f'<b>{c.upper()}</b>' for c in summary_stats.columns],
                fill_color=MODERN_COLORS['primary'], font_color='white', align='center',
                font=dict(size=6)  # Header font size
            ),
            cells=dict(
                values=[summary_stats[c] for c in summary_stats.columns],
                fill_color='rgba(0,0,0,0.8)', font_color='white', align='center',
                font=dict(size=10)  # Cell font size
            )
        ), row=2, col=2)
    except Exception:
        simple_stats = data[selected_metric].describe().round(2).reset_index()
        fig.add_trace(go.Table(
            header=dict(values=['Statistic', 'Value'], fill_color=MODERN_COLORS['primary'], font_color='white'),
            cells=dict(values=[simple_stats['index'], simple_stats[selected_metric]], fill_color='rgba(0,0,0,0.8)', font_color='white')
        ), row=2, col=2)

    # --- Final layout updates ---
    fig.update_layout(
        height=850,
        plot_bgcolor='rgba(0,0,0,0.8)',
        paper_bgcolor='rgba(0,0,0,0.8)',
        font_color='white',
        title_text=f'"{selected_metric}" - Analysis Dashboard {plot_title_suffix}',
        title_x=0.5,
        title_font_size=22,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )
    fig.update_xaxes(title_text=selected_metric, row=2, col=1)
    fig.update_yaxes(title_text="Frequency", row=2, col=1)
    return fig

def _create_correlation_heatmap(data, numeric_metrics, plot_title_suffix):
    """(Helper) Build one metric-correlation heatmap per gender.

    Args:
        data: Frame with a 'Gender' column and the ``numeric_metrics`` columns.
        numeric_metrics: Columns to correlate; fewer than two yields no figure.
        plot_title_suffix: Text appended to the figure title.

    Returns:
        A Plotly figure with up to two heatmaps per row, or ``None`` when
        there is no non-null gender or fewer than two metrics.
    """
    gender_labels = sorted(data['Gender'].dropna().unique())
    if len(gender_labels) == 0 or len(numeric_metrics) < 2:
        return None

    # Grid of subplots: at most two columns, as many rows as needed.
    ncols = min(2, len(gender_labels))
    nrows = -(-len(gender_labels) // ncols)
    titles = [f"Metric Correlation ({gender})" for gender in gender_labels]
    fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=titles, vertical_spacing=0.2)

    for position, gender in enumerate(gender_labels):
        row_offset, col_offset = divmod(position, ncols)
        gender_rows = data[data['Gender'] == gender][numeric_metrics]
        # A correlation needs at least two observations; otherwise the
        # subplot is left empty.
        if len(gender_rows) <= 1:
            continue
        corr_matrix = gender_rows.corr()
        heatmap = go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.columns,
            colorscale='RdBu_r',
            zmin=-1,
            zmax=1,
            text=corr_matrix.values,
            texttemplate="%{text:.2f}",
            textfont={"size":9},
            # Custom hover text; the empty <extra> tag hides the secondary box.
            hovertemplate='Metric 1: %{y}<br>Metric 2: %{x}<br>Correlation: %{z:.2f}<extra></extra>',
        )
        fig.add_trace(heatmap, row=row_offset + 1, col=col_offset + 1)

    fig.update_layout(
        height=450 * nrows,
        plot_bgcolor='rgba(0,0,0,0.8)',
        paper_bgcolor='rgba(0,0,0,0.8)',
        font_color='white',
        title_text=f"Correlation Heatmaps by Gender {plot_title_suffix}",
        title_x=0.5,
        title_font_size=20,
    )
    return fig

def create_comparison_dashboard(data, selected_metric, numeric_metrics, plot_title_suffix):
    """Create the 4-panel overview and the correlation heatmaps for a metric.

    Args:
        data: Frame holding the metric columns.
        selected_metric: Column shown in the 4-panel overview.
        numeric_metrics: Columns passed on to the correlation heatmap helper.
        plot_title_suffix: Text appended to both figure titles.

    Returns:
        ``(dashboard_fig, heatmap_fig)``. Both are ``None`` when the metric
        column is missing or has no non-null rows. Any exception is printed
        rather than raised, and whatever was built before it is returned.
    """
    figures = {'dashboard': None, 'heatmap': None}
    try:
        metric_present = selected_metric in data.columns
        rows_with_metric = data.dropna(subset=[selected_metric]) if metric_present else None
        if metric_present and not rows_with_metric.empty:
            figures['dashboard'] = _create_4_panel_dashboard(rows_with_metric, selected_metric, plot_title_suffix)
            figures['heatmap'] = _create_correlation_heatmap(rows_with_metric, numeric_metrics, plot_title_suffix)
    except Exception as err:
        print(f"Error in dashboard creation: {err}")
    return figures['dashboard'], figures['heatmap']

# --- MODULAR UI BUILDER FUNCTIONS ---

def build_bar_chart_section(df, metric_options, question_choice):
    """Build the bar-chart section: a metric dropdown plus the chart output.

    Args:
        df: Data for the chosen question (or the combined data).
        metric_options: Metric column names offered in the dropdown.
        question_choice: Selected question label; 'All Combined' switches to
            the per-image-type chart.

    Returns:
        A ``VBox`` holding the section divider and either the interactive
        chart or an error notice when the data cannot be plotted.
    """
    divider = create_section_divider("Interactive Bar Chart Analysis", "📊")
    full_width = Layout(width='100%')

    # The combined view is grouped by image type, a single question by AOI.
    required_column = 'Image_Type' if question_choice == 'All Combined' else 'AOI'
    if required_column not in df.columns:
        return VBox([divider, create_error_display("Required columns not found for this plot.")], layout=full_width)
    # With no metric to choose from, the dropdown value would be None and the
    # first plot update would fail inside the output area.
    if not metric_options:
        return VBox([divider, create_error_display("No metrics available for this plot.")], layout=full_width)

    dd_metric = widgets.Dropdown(options=metric_options, description='Select Metric:', style=widget_style, layout=Layout(min_width='350px'))
    output_area = widgets.Output()

    def update_plot(metric_to_plot):
        with output_area:
            clear_output(wait=True)
            # Time to first fixation is averaged; every other metric is summed.
            agg_func = 'mean' if 'Time to first Fixation' in metric_to_plot else 'sum'
            plot_title_suffix = f"({question_choice})"
            if question_choice != 'All Combined':
                fig = create_modern_bar_plot(df, metric_to_plot, agg_func, plot_title_suffix)
            else:
                fig = create_combined_bar_plot(df, metric_to_plot, agg_func, plot_title_suffix)
            display(fig)

    def handle_change(change):
        update_plot(change.new)

    dd_metric.observe(handle_change, names='value')
    update_plot(dd_metric.value)  # Draw the chart for the initially selected metric
    return VBox([divider, HBox([dd_metric]), output_area], layout=full_width)

def build_scatter_section(df, plot_title_suffix):
    """Build the scatter section (fixation count vs. total fixation duration).

    Args:
        df: Data for the chosen question (or the combined data).
        plot_title_suffix: Text appended to the chart title.

    Returns:
        A ``VBox`` holding the section divider and an output area with either
        the scatter plot or an error notice.
    """
    divider = create_section_divider("Correlation Scatter Analysis", "🔍")
    fix_dur_col, fix_count_col = 'Tot Fixation dur', 'Fixation count'
    output_area = widgets.Output()

    with output_area:
        if not (fix_dur_col in df.columns and fix_count_col in df.columns):
            display(create_error_display("Required columns for scatter plot not found."))
        else:
            fig = create_modern_scatter_plot(df, fix_dur_col, fix_count_col, plot_title_suffix)
            # The plot helper returns None when no row has both metrics.
            display(fig if fig else create_error_display("No valid data points for scatter plot."))

    return VBox([divider, output_area], layout=Layout(width='100%'))

def build_comparison_section(df, metric_options, question_choice):
    """Build the multi-panel analysis section with a metric dropdown.

    Args:
        df: Data for the chosen question (or the combined data).
        metric_options: Metric column names offered in the dropdown.
        question_choice: Selected question label, used in the figure titles.

    Returns:
        A ``VBox`` holding the section divider and either the dropdown plus
        figures or an error notice when nothing can be shown.
    """
    divider = create_section_divider("Multi-Dimensional Analysis & Heatmaps", "📈")
    full_width = Layout(width='100%')
    # Without metrics the dropdown would be empty and the section would render blank.
    if not metric_options:
        return VBox([divider, create_error_display("No metrics available for this analysis.")], layout=full_width)

    dd_metric = widgets.Dropdown(options=metric_options, description='Select Metric:', style=widget_style, layout=Layout(min_width='350px'))
    output_area = widgets.Output()

    def update_plot(metric_to_plot):
        with output_area:
            clear_output(wait=True)
            numeric_metrics_corr = [m for m in metric_options if pd.api.types.is_numeric_dtype(df[m])]
            plot_title_suffix = f"({question_choice})"
            dashboard_fig, heatmap_fig = create_comparison_dashboard(df, metric_to_plot, numeric_metrics_corr, plot_title_suffix)
            # Tell the user when neither figure could be built instead of
            # leaving the area empty.
            if dashboard_fig is None and heatmap_fig is None:
                display(create_error_display(f"No charts could be generated for {metric_to_plot}."))
                return
            if dashboard_fig is not None:
                display(dashboard_fig)
            if heatmap_fig is not None:
                display(heatmap_fig)

    def handle_change(change):
        update_plot(change.new)

    dd_metric.observe(handle_change, names='value')
    update_plot(dd_metric.value)  # Render once for the initially selected metric
    return VBox([divider, HBox([dd_metric]), output_area], layout=full_width)

# --- MAIN DASHBOARD ORCHESTRATOR ---
# Single container for the dashboard sections. initialize_dashboard() displays
# it once; update_dashboard_layout() then swaps its `children` in place.
dashboard_body = VBox()

def update_dashboard_layout(change):
    """Rebuild the dashboard sections for the newly selected question.

    Works both as an ``observe`` callback and when called manually with a
    plain dict, since only ``change['new']`` is read.

    Args:
        change: Mapping whose 'new' entry is the selected question label
            ('All Combined' or a key of ``all_merged_long_dfs``).

    Returns:
        None. ``dashboard_body.children`` is replaced with the new sections,
        or with an error notice when loading or building fails.
    """
    question_choice = change['new']
    dashboard_body.children = [create_loading_widget()]

    df_to_plot, metric_options = None, []
    try:
        if question_choice == 'All Combined':
            df_to_plot = final_combined_long_df
        else:
            df_to_plot = all_merged_long_dfs.get(question_choice)
        if df_to_plot is not None:
            metric_options = [m for m in selected_metric_sheets if m in df_to_plot.columns]
    except (NameError, KeyError):
        df_to_plot = None

    # .get() yields None for a question without data; reading `.columns` on it
    # would raise an uncaught AttributeError and leave the spinner on screen.
    if df_to_plot is None:
        dashboard_body.children = [create_error_display(f"Could not load data for {question_choice}")]
        return

    try:
        sections = [
            build_bar_chart_section(df_to_plot, metric_options, question_choice),
            build_scatter_section(df_to_plot, f"({question_choice})"),
            build_comparison_section(df_to_plot, metric_options, question_choice),
        ]
    except Exception as exc:
        # A failing builder must not leave the loading spinner up forever;
        # surface the problem in the dashboard itself.
        dashboard_body.children = [create_error_display(f"Could not build dashboard for {question_choice}: {exc}")]
        return

    # Update the children of the *already displayed* VBox
    dashboard_body.children = sections

# --- DASHBOARD INITIALIZATION ---
def initialize_dashboard():
    """Display the dashboard chrome and render the initial question view.

    Shows the dropdown stylesheet, the title banner, a control-panel heading,
    the question selector and the shared ``dashboard_body`` container, then
    triggers the first layout update for the selector's initial value.

    Returns:
        None. Everything is emitted through ``display``.
    """
    display(HTML(dropdown_style))
    display(create_dashboard_header())
    # Control panel heading. This used to display a literal "..." placeholder.
    display(create_section_divider("Control Panel", "⚙️"))

    try:
        question_options = ['All Combined'] + list(questions_config.keys())
    except NameError:
        # The configuration cell has not been run; offer a default set.
        question_options = ['All Combined', 'Q1', 'Q2', 'Q3']

    dd_question = widgets.Dropdown(options=question_options, description='📋 Select Question Set:', style={'description_width': 'initial'}, layout=Layout(width='50%', margin='0 auto 20px auto'))
    # CSS hook used by the stylesheet to keep this dropdown above other elements.
    dd_question.add_class("main-question-dropdown")
    display(dd_question)

    dashboard_body.layout.width = '100%'

    dd_question.observe(update_dashboard_layout, names='value')

    # Display the main container just once. It will be updated by the handler.
    display(dashboard_body)

    # Manually trigger the first update
    update_dashboard_layout({'new': dd_question.value})

# --- RUN THE DASHBOARD ---
# Run the notebook from a fresh kernel ("Restart & Run All") before this call:
# the dashboard reads globals created by earlier cells (questions_config,
# all_merged_long_dfs, final_combined_long_df, selected_metric_sheets).
initialize_dashboard()

HTML(value='\n<style>\n/* Style for the visible, un-clicked dropdown box */\n.widget-dropdown select {\n    ba…

HTML(value="<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; border-r…

HTML(value='...')

Dropdown(_dom_classes=('main-question-dropdown',), description='📋 Select Question Set:', layout=Layout(margin=…

VBox(layout=Layout(width='100%'))