In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from ipywidgets import VBox, Checkbox, Button, Output, Dropdown
from IPython.display import display
import numpy as np

# Load the score table from CSV
file_path = 'updated_scores_inverted_performance.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# Convert 'Maze score' from percentage text (e.g. " 85% ") to a float column.
# The values are NOT rescaled: only the '%' sign and surrounding whitespace are
# removed, and anything that still cannot be parsed becomes NaN.
# The conversion is skipped when pandas already parsed the column as numeric,
# because the `.str` accessor raises AttributeError on non-string data.
if 'Maze score' in data.columns and not pd.api.types.is_numeric_dtype(data['Maze score']):
    # astype(str) keeps stray non-string cells in a mixed column from being
    # silently turned into NaN by the `.str` methods.
    maze_text = data['Maze score'].astype(str).str.replace('%', '', regex=False).str.strip()
    data['Maze score'] = pd.to_numeric(maze_text, errors='coerce')

# Check the data types to ensure the conversion was successful
print("Data types after conversion:\n", data.dtypes)

# Every column of the loaded frame is offered as a selectable variable
columns = data.columns

# Dependent variables for the regression: one unticked checkbox per column
checkboxes = [
    Checkbox(description=column_name, value=False)
    for column_name in columns
]

# Independent variable: single-choice dropdown over the same columns
independent_dropdown = Dropdown(
    description='Independent Var:',
    options=columns,
    disabled=False,
)

# Variable shown on the box plot's y-axis; all columns are listed,
# including 'Maze score'
dependent_dropdown = Dropdown(
    description='Select Box Plot Dependent Var:',
    options=list(columns),
    disabled=False,
)

# Button that triggers the analysis, and the output area that receives
# everything the click handler prints or plots
run_button = Button(description="Run Analysis")
output = Output()

# Define what happens when the button is clicked
# Column order of the consolidated regression table (also keeps the table
# well-formed when no model could be fitted).
_REGRESSION_COLUMNS = [
    "Dependent Variable",
    "Coefficient",
    "Standard Error",
    "t-Value",
    "P>|t|",
    "95% Confidence Interval",
]


def _is_regression_numeric(series):
    """Return True when ``series`` has a numeric, non-boolean dtype."""
    dtype_checks = pd.api.types
    return dtype_checks.is_numeric_dtype(series) and not dtype_checks.is_bool_dtype(series)


def _fit_simple_ols(frame, independent_var, dependent_var):
    """Regress ``dependent_var`` on ``independent_var`` plus an intercept.

    Rows with a missing value in either column are dropped before fitting.

    Returns:
        A dict keyed by ``_REGRESSION_COLUMNS`` describing the slope of
        ``independent_var``, or ``None`` when fewer than three complete rows
        remain after dropping missing values.
    """
    pair = frame[[independent_var, dependent_var]].dropna().astype(float)
    if len(pair) < 3:
        return None

    X = sm.add_constant(pair[[independent_var]])  # Intercept + independent variable
    model = sm.OLS(pair[dependent_var], X).fit()

    lower, upper = model.conf_int().loc[independent_var]  # Bounds of the default interval
    return {
        "Dependent Variable": dependent_var,
        "Coefficient": model.params[independent_var],
        "Standard Error": model.bse[independent_var],
        "t-Value": model.tvalues[independent_var],
        "P>|t|": model.pvalues[independent_var],
        "95% Confidence Interval": f"[{lower:.3f}, {upper:.3f}]"
    }


def _plot_correlation_heatmap(frame, variables):
    """Draw, save ('correlation_heatmap.png') and show a correlation heatmap."""
    corr = frame[variables].corr()
    # Hide the upper triangle (and the diagonal) so each pair appears once
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',  # Format for correlation coefficients
        linewidths=.5,
        cbar_kws={"shrink": .8},  # Shrink color bar
        annot_kws={"size": 10},  # Font size for annotations
        ax=ax
    )
    ax.set_title('Correlation Heatmap of Selected Variables', fontsize=20)
    ax.set_xlabel('Variables', fontsize=14)
    ax.set_ylabel('Variables', fontsize=14)
    ax.tick_params(axis='both', labelsize=12)  # Increase tick label size

    fig.savefig('correlation_heatmap.png', bbox_inches='tight')
    plt.show()
    plt.close(fig)  # Release the figure so repeated clicks do not accumulate figures


def _plot_box(frame, independent_var, box_var):
    """Draw, save ('box_plot_<box_var>.png') and show a box plot of ``box_var``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=independent_var, y=box_var, data=frame, ax=ax)
    ax.set_title(f'Box Plot of {box_var} by {independent_var}', fontsize=18)
    ax.set_xlabel(independent_var, fontsize=14)
    ax.set_ylabel(box_var, fontsize=14)
    ax.grid(True)  # Add grid for better readability
    ax.tick_params(axis='x', labelrotation=45)  # Rotate x-axis labels

    fig.savefig(f'box_plot_{box_var}.png', bbox_inches='tight')
    plt.show()
    plt.close(fig)


def on_button_clicked(b):
    """Run the analysis for the current widget selection.

    Reads the independent variable from ``independent_dropdown``, the dependent
    variables from the ticked ``checkboxes`` and the box-plot variable from
    ``dependent_dropdown``, then, inside the ``output`` widget:

    1. fits one OLS model (with intercept) per numeric dependent variable and
       shows/saves the consolidated table ('consolidated_regression_results.csv');
    2. shows/saves summary statistics ('consolidated_summary_statistics.csv');
    3. shows/saves a correlation heatmap ('correlation_heatmap.png');
    4. shows/saves a box plot ('box_plot_<variable>.png') when the box-plot
       variable is one of the analysed dependent variables.

    Non-numeric selections, and the independent variable itself, are reported
    and left out of every step so they cannot break the later tables or plots.

    Args:
        b: The object passed by the button's click callback; not used.
    """
    with output:
        output.clear_output()  # Clear previous outputs

        independent_var = independent_dropdown.value
        dependent_vars = [checkbox.description for checkbox in checkboxes if checkbox.value]

        # The regression needs a numeric independent variable
        if independent_var not in data.columns or not _is_regression_numeric(data[independent_var]):
            print(f"The selected independent variable '{independent_var}' is not numeric.")
            return

        # Check if any dependent variables are selected
        if not dependent_vars:
            print("No dependent variables selected. Please select at least one variable.")
            return

        # Keep only the selections that every later step can handle
        numeric_vars = []
        for dependent_var in dependent_vars:
            if dependent_var == independent_var:
                print(f"'{dependent_var}' is the independent variable and cannot also be "
                      "a dependent variable; skipping it.")
                continue
            if dependent_var not in data.columns or not _is_regression_numeric(data[dependent_var]):
                print(f"The selected dependent variable '{dependent_var}' is not numeric.")
                continue
            numeric_vars.append(dependent_var)

        if not numeric_vars:
            print("None of the selected dependent variables can be analysed.")
            return

        # Fit one OLS model per remaining dependent variable
        regression_results = []
        for dependent_var in numeric_vars:
            result_row = _fit_simple_ols(data, independent_var, dependent_var)
            if result_row is None:
                print(f"Not enough complete observations to regress '{dependent_var}' "
                      f"on '{independent_var}'.")
                continue
            regression_results.append(result_row)

        # Explicit columns keep the table (and the formatter below) valid even
        # when no model could be fitted
        regression_summary_df = pd.DataFrame(regression_results, columns=_REGRESSION_COLUMNS)

        # Display the consolidated OLS regression results
        print("\nConsolidated OLS Regression Results:")
        print(f"Independent Variable: {independent_var}")
        display(regression_summary_df.style.format({
            "Coefficient": "{:.4f}",
            "Standard Error": "{:.4f}",
            "t-Value": "{:.4f}",
            "P>|t|": "{:.4f}"
        }))

        # Save the consolidated OLS regression results to a CSV file
        regression_summary_df.to_csv('consolidated_regression_results.csv', index=False)

        # Summary statistics: one row per dependent variable
        summary_df = data[numeric_vars].describe().T.rename(columns={
            'count': 'Count',
            'mean': 'Mean',
            'std': 'Std',
            'min': 'Min',
            'max': 'Max'
        })
        summary_df.index.name = 'Variable'

        # Display the consolidated summary statistics
        print("\nSummary Statistics of Selected Dependent Variables:")
        display(summary_df.style.format(na_rep='N/A', decimal='.', precision=2))

        # The index holds the variable names, so it must be written to the CSV;
        # without it the rows cannot be told apart
        summary_df.to_csv('consolidated_summary_statistics.csv')

        # Correlation heatmap of the independent variable and the dependent variables
        _plot_correlation_heatmap(data, [independent_var] + numeric_vars)

        # Box plot for the variable chosen in the second dropdown
        box_var = dependent_dropdown.value
        if box_var in numeric_vars:
            _plot_box(data, independent_var, box_var)
        else:
            print(f"Box plot skipped: '{box_var}' is not one of the analysed dependent variables.")

# Register the click handler on the "Run Analysis" button
run_button.on_click(on_button_clicked)

# Stack the controls vertically (independent-variable dropdown, one checkbox
# per column, box-plot dropdown, button) and show the output area after them
controls = VBox([independent_dropdown, *checkboxes, dependent_dropdown, run_button])
display(controls, output)


Data types after conversion:
 name                    int64
age                     int64
gender                 object
profession             object
gaming experience      object
Participant Number      int64
Mental Demand           int64
Physical Demand         int64
Temporal Demand         int64
Performance             int64
Effort                  int64
Frustration             int64
Overall Score         float64
Delays                  int64
Time Scores           float64
Maze score            float64
dtype: object


VBox(children=(Dropdown(description='Independent Var:', options=('name', 'age', 'gender', 'profession', 'gamin…

Output()