In [19]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

def read_user_csv_files(root_folder):
    """
    Read stage1_results.csv and stage2_results.csv from each user's folder.

    Every immediate sub-directory of ``root_folder`` is treated as one user.
    A user is loaded only when both result files are present; entries that
    are not directories, or that lack either file, are skipped silently.

    Args:
        root_folder (str): The root folder containing user folders.

    Returns:
        dict: Maps each user ID (the sub-directory name) to a dict with the
              keys "stage1" and "stage2", each holding the DataFrame read
              from the corresponding CSV file. Users are inserted in sorted
              order of their IDs.
    """
    user_data = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order (and therefore the printed output) reproducible.
    for user_id in sorted(os.listdir(root_folder)):
        user_folder = os.path.join(root_folder, user_id)
        if not os.path.isdir(user_folder):
            continue

        stage_files = {
            "stage1": os.path.join(user_folder, "stage1_results.csv"),
            "stage2": os.path.join(user_folder, "stage2_results.csv"),
        }
        # isfile rather than exists: a directory that happens to carry one of
        # these names must not be handed to read_csv.
        if not all(os.path.isfile(path) for path in stage_files.values()):
            continue

        user_data[user_id] = {
            stage: pd.read_csv(path) for stage, path in stage_files.items()
        }
    return user_data

def filter_below_quartile(df):
    """
    Keep only the rows whose Similarity is at or above the first quartile.

    Args:
        df (pd.DataFrame): Data with a "Similarity" column.

    Returns:
        pd.DataFrame: The rows of ``df`` whose Similarity is greater than or
                      equal to the 25th percentile of that column.
    """
    similarity = df["Similarity"]
    keep_mask = similarity.ge(similarity.quantile(0.25))
    return df.loc[keep_mask]

def filter_above_50(df):
    """
    Keep only the rows whose Similarity is strictly greater than 50.

    Args:
        df (pd.DataFrame): Data with a "Similarity" column.

    Returns:
        pd.DataFrame: The rows of ``df`` with Similarity > 50; a value of
                      exactly 50 is dropped.
    """
    above_threshold = df["Similarity"].gt(50)
    return df.loc[above_threshold]

def calculate_similarity_distribution(df):
    """
    Calculate the similarity distribution for 5% bins covering 50%-100%.

    Bins are left-closed: "50-55" holds values in [50, 55), "55-60" holds
    [55, 60), and so on. The last bin "95-100" is closed on both sides so
    that a similarity of exactly 100 is counted. Values below 50 or above 100
    are ignored, and the percentages are relative to the values that fall
    inside the 50-100 range.

    Args:
        df (pd.DataFrame): The DataFrame containing a "Similarity" column.

    Returns:
        dict: Maps each bin label ("50-55", ..., "95-100") to the percentage
              of in-range values in that bin, rounded to 2 decimals. All
              bins are 0.0 when no value falls inside the range.
    """
    edges = list(range(50, 105, 5))  # 50, 55, ..., 100
    labels = [f"{low}-{high}" for low, high in zip(edges, edges[1:])]

    # With right=False the top edge itself lies outside the last bin, which
    # would silently drop a perfect 100% score. Nudging only the final edge
    # upward turns the last bin into [95, 100] without moving other edges.
    cut_edges = edges[:-1] + [edges[-1] + 1e-9]
    binned = pd.cut(df["Similarity"], bins=cut_edges, labels=labels, right=False)

    counts = binned.value_counts()
    total = float(counts.sum())
    if total == 0:
        # Nothing inside 50-100: report zeros instead of the NaN a 0/0
        # normalisation would produce.
        return {label: 0.0 for label in labels}

    return {
        label: round(float(counts.get(label, 0)) / total * 100, 2)
        for label in labels
    }

def calculate_average_similarity(df):
    """
    Return the mean of the "Similarity" column.

    Args:
        df (pd.DataFrame): Data with a "Similarity" column.

    Returns:
        float: Arithmetic mean of the Similarity values.
    """
    similarity_values = df["Similarity"]
    return similarity_values.mean()

def compare_stages(stage1_df, stage2_df):
    """
    Summarise both stages and report how stage2 differs from stage1.

    Each stage is first reduced to the rows with Similarity > 50. On that
    subset the function computes the share of rows above 80, the mean, the
    median and the 5%-bin distribution, then the stage2-minus-stage1
    difference of the means and of the medians.

    Args:
        stage1_df (pd.DataFrame): DataFrame for stage1.
        stage2_df (pd.DataFrame): DataFrame for stage2.

    Returns:
        dict: Per-stage metrics ("stageN_above_80", "stageN_avg",
              "stageN_median", "stageN_distribution") plus "avg_difference"
              and "median_difference".
    """
    summaries = {}
    for stage_name, raw_df in (("stage1", stage1_df), ("stage2", stage2_df)):
        kept = filter_above_50(raw_df)
        scores = kept["Similarity"]
        summaries[stage_name] = {
            # mean of a boolean mask is the fraction of True entries
            "above_80": (scores > 80).mean() * 100,
            "avg": scores.mean(),
            "median": scores.median(),
            "distribution": calculate_similarity_distribution(kept),
        }

    first = summaries["stage1"]
    second = summaries["stage2"]

    return {
        "stage1_above_80": first["above_80"],
        "stage2_above_80": second["above_80"],
        "stage1_avg": first["avg"],
        "stage2_avg": second["avg"],
        "stage1_median": first["median"],
        "stage2_median": second["median"],
        "stage1_distribution": first["distribution"],
        "stage2_distribution": second["distribution"],
        "avg_difference": second["avg"] - first["avg"],
        "median_difference": second["median"] - first["median"],
    }

def perform_significance_test(stage1_df, stage2_df):
    """
    Run an independent two-sample t-test on the Similarity of both stages.

    The test is called with ``equal_var=False``, i.e. the two samples are not
    assumed to share a variance. The result is labelled "Significant" when
    the p-value is below 0.05 and "Not Significant" otherwise.

    Args:
        stage1_df (pd.DataFrame): DataFrame for stage1.
        stage2_df (pd.DataFrame): DataFrame for stage2.

    Returns:
        dict: "t_statistic", "p_value" and the "significance" label.
    """
    t_stat, p_value = ttest_ind(
        stage1_df["Similarity"],
        stage2_df["Similarity"],
        equal_var=False,
    )

    if p_value < 0.05:
        verdict = "Significant"
    else:
        verdict = "Not Significant"

    return {
        "t_statistic": t_stat,
        "p_value": p_value,
        "significance": verdict,
    }

def calculate_variance(stage1_df, stage2_df):
    """
    Compute the variance and standard deviation of Similarity for each stage.

    Both statistics come from the pandas Series methods ``var()`` and
    ``std()`` with their default arguments.

    Args:
        stage1_df (pd.DataFrame): DataFrame for stage1.
        stage2_df (pd.DataFrame): DataFrame for stage2.

    Returns:
        dict: "stage1_variance", "stage2_variance", "stage1_std" and
              "stage2_std".
    """
    first_scores = stage1_df["Similarity"]
    second_scores = stage2_df["Similarity"]

    spread = {}
    spread["stage1_variance"] = first_scores.var()
    spread["stage2_variance"] = second_scores.var()
    spread["stage1_std"] = first_scores.std()
    spread["stage2_std"] = second_scores.std()
    return spread

def plot_similarity_distribution(user_id, stage1_distribution, stage2_distribution, output_folder):
    """
    Draw a grouped bar chart of both stages' distributions and save it as PNG.

    The bin labels and their order are taken from ``stage1_distribution``;
    ``stage2_distribution`` is looked up with the same labels. The image is
    written to ``<output_folder>/<user_id>_similarity_distribution.png`` and
    a confirmation line is printed.

    Args:
        user_id (str): User ID for labeling.
        stage1_distribution (dict): Similarity distribution for stage1.
        stage2_distribution (dict): Similarity distribution for stage2.
        output_folder (str): Folder to save the output chart.
    """
    bin_labels = list(stage1_distribution)  # e.g. "50-55", "55-60", ...
    stage1_heights = [stage1_distribution[label] for label in bin_labels]
    stage2_heights = [stage2_distribution[label] for label in bin_labels]

    bar_width = 0.35
    # Stage 1 bars sit on the integer positions, stage 2 bars directly to
    # their right, and each tick is centred between the two.
    stage1_positions = list(range(len(bin_labels)))
    stage2_positions = [pos + bar_width for pos in stage1_positions]
    tick_positions = [pos + bar_width / 2 for pos in stage1_positions]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(stage1_positions, stage1_heights, width=bar_width, label='Stage 1', color='blue', alpha=0.7)
    ax.bar(stage2_positions, stage2_heights, width=bar_width, label='Stage 2', color='orange', alpha=0.7)

    ax.set_xlabel("Similarity Range (%)", fontsize=12)
    ax.set_ylabel("Percentage (%)", fontsize=12)
    ax.set_title(f"Similarity Distribution Comparison for User: {user_id}", fontsize=14)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(bin_labels, rotation=45)
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    output_path = os.path.join(output_folder, f"{user_id}_similarity_distribution.png")
    fig.tight_layout()
    fig.savefig(output_path)
    # Close explicitly: this runs once per user and figures otherwise pile up.
    plt.close(fig)

    print(f"Similarity distribution plot saved for user {user_id} at {output_path}")

def generate_user_report(user_id, comparison_metrics, significance_results=None, variance_results=None):
    """
    Generate a summary report for a user, including detailed stage comparisons.

    The comparison section is always written. The variance and significance
    sections are optional: each is included only when its results dict is
    supplied, so the function can also be called with just the comparison
    metrics.

    Args:
        user_id (str): The user ID.
        comparison_metrics (dict): Comparison metrics from compare_stages.
        significance_results (dict, optional): Results of significance testing
            with the keys "t_statistic", "p_value" and "significance".
            Defaults to None, which omits the "Significance Testing" section.
        variance_results (dict, optional): Variance and standard deviation for
            each stage ("stageN_variance", "stageN_std"). Defaults to None,
            which omits the "Variance and Standard Deviation" section.

    Returns:
        str: The summary report as a newline-terminated string.
    """
    # An empty entry becomes the blank line that separates two sections.
    lines = [
        f"User: {user_id}",
        f"Stage 1 - Average Similarity: {comparison_metrics['stage1_avg']:.2f}%",
        f"Stage 2 - Average Similarity: {comparison_metrics['stage2_avg']:.2f}%",
        f"Stage 1 - Median Similarity: {comparison_metrics['stage1_median']:.2f}%",
        f"Stage 2 - Median Similarity: {comparison_metrics['stage2_median']:.2f}%",
        "",
        "Percentage of Time Similarity > 80%:",
        f"- Stage 1: {comparison_metrics['stage1_above_80']:.2f}%",
        f"- Stage 2: {comparison_metrics['stage2_above_80']:.2f}%",
        "",
        "Similarity Distribution (in 5% bins):",
        f"Stage 1: {comparison_metrics['stage1_distribution']}",
        f"Stage 2: {comparison_metrics['stage2_distribution']}",
        "",
        "Performance Improvements:",
        f"- Average Similarity Difference (Stage 2 - Stage 1): {comparison_metrics['avg_difference']:.2f}%",
        f"- Median Similarity Difference (Stage 2 - Stage 1): {comparison_metrics['median_difference']:.2f}%",
    ]

    if variance_results is not None:
        lines += [
            "",
            "Variance and Standard Deviation:",
            f"- Stage 1 Variance: {variance_results['stage1_variance']:.2f}",
            f"- Stage 2 Variance: {variance_results['stage2_variance']:.2f}",
            f"- Stage 1 Standard Deviation: {variance_results['stage1_std']:.2f}",
            f"- Stage 2 Standard Deviation: {variance_results['stage2_std']:.2f}",
        ]

    if significance_results is not None:
        lines += [
            "",
            "Significance Testing:",
            f"- T-Statistic: {significance_results['t_statistic']:.2f}",
            f"- P-Value: {significance_results['p_value']:.6f}",
            f"- Result: {significance_results['significance']}",
        ]

    return "\n".join(lines) + "\n"

def analyze_users(root_folder):
    """
    Main function to analyze all users and generate reports.

    For every user found under ``root_folder`` this computes the stage
    comparison metrics, the significance test and the variance statistics,
    builds the text report, prints it and collects it. Nothing is written
    to disk.

    Args:
        root_folder (str): The root folder containing user folders.

    Returns:
        dict: A dictionary of user reports, keyed by user ID.
    """
    # Read all user data
    user_data = read_user_csv_files(root_folder)

    # Analyze each user
    user_reports = {}
    for user_id, stages in user_data.items():
        print(f"Analyzing user: {user_id}...")

        # compare_stages applies the >50 filter internally; the statistics
        # below use the same filter so that every section of the report
        # describes the same rows.
        stage1_filtered = filter_above_50(stages["stage1"])
        stage2_filtered = filter_above_50(stages["stage2"])

        # Compare stage1 and stage2
        comparison_metrics = compare_stages(stages["stage1"], stages["stage2"])

        # Statistics the report generator requires in addition to the metrics
        significance_results = perform_significance_test(stage1_filtered, stage2_filtered)
        variance_results = calculate_variance(stage1_filtered, stage2_filtered)

        # Generate report
        report = generate_user_report(user_id, comparison_metrics, significance_results, variance_results)
        user_reports[user_id] = report

        # Print report to console
        print(report)

    return user_reports

def analyze_users_with_plots(root_folder):
    """
    Run the full analysis for every user and write the results to disk.

    For each user this computes the stage comparison, the significance test
    and the variance statistics, saves the distribution bar chart and writes
    the text report to ``<root_folder>/<user_id>_analysis_report.txt``.
    Progress lines are printed along the way.

    Args:
        root_folder (str): The root folder containing user folders; also the
            folder the plots and reports are written to.
    """
    all_users = read_user_csv_files(root_folder)

    for user_id, stages in all_users.items():
        print(f"\nAnalyzing user: {user_id}...")

        raw_stage1 = stages["stage1"]
        raw_stage2 = stages["stage2"]

        # compare_stages works on the raw frames and filters internally
        comparison_metrics = compare_stages(raw_stage1, raw_stage2)

        # The statistical tests use only the rows with Similarity > 50
        kept_stage1 = filter_above_50(raw_stage1)
        kept_stage2 = filter_above_50(raw_stage2)
        significance_results = perform_significance_test(kept_stage1, kept_stage2)
        variance_results = calculate_variance(kept_stage1, kept_stage2)

        # Bar chart of both distributions, saved next to the user folders
        plot_similarity_distribution(
            user_id,
            comparison_metrics["stage1_distribution"],
            comparison_metrics["stage2_distribution"],
            root_folder,
        )

        # Text report, saved next to the user folders
        report = generate_user_report(user_id, comparison_metrics, significance_results, variance_results)
        report_path = os.path.join(root_folder, f"{user_id}_analysis_report.txt")
        with open(report_path, "w") as report_file:
            report_file.write(report)
        print(f"Report saved for user {user_id} at {report_path}")


In [21]:
# Example usage: run the per-user analysis that saves a plot and a text
# report for every user folder found under root_folder.
if __name__ == "__main__":
    # Folder that holds one sub-folder per user (relative to the working directory).
    root_folder = "oculus2"
    # Alternative data set:
    # root_folder = "user_test_oculus3/user_test_oculus3/files"  # Root folder containing user folders
    analyze_users_with_plots(root_folder)

    # Alternative entry point, kept for reference: analyze_users() prints the
    # reports and returns them instead of writing files.
    # user_reports = analyze_users(root_folder)

    # # Optionally save each report to a text file
    # for user_id, report in user_reports.items():
    #     report_file = os.path.join(root_folder, f"{user_id}_analysis_report.txt")
    #     with open(report_file, "w") as f:
    #         f.write(report)
    #     print(f"Report saved for user {user_id} at {report_file}")


Analyzing user: 10086yly...
Similarity distribution plot saved for user 10086yly at oculus2\10086yly_similarity_distribution.png
Report saved for user 10086yly at oculus2\10086yly_analysis_report.txt

Analyzing user: 101...
Similarity distribution plot saved for user 101 at oculus2\101_similarity_distribution.png
Report saved for user 101 at oculus2\101_analysis_report.txt

Analyzing user: 102...
Similarity distribution plot saved for user 102 at oculus2\102_similarity_distribution.png
Report saved for user 102 at oculus2\102_analysis_report.txt

Analyzing user: 723lyz...
Similarity distribution plot saved for user 723lyz at oculus2\723lyz_similarity_distribution.png
Report saved for user 723lyz at oculus2\723lyz_analysis_report.txt
