In [6]:
import os
import pandas as pd

def parse_report(report_path):
    """
    Parse a single user's analysis report to extract key metrics.

    The file is scanned line by line. A recognised line has the form
    ``<label>: <value>``; the value is everything after the FIRST colon, so
    values that themselves contain a colon are kept intact.

    The ``- Stage 1:`` / ``- Stage 2:`` bullets are only read while the scan is
    inside the "Percentage of Time Similarity > 80%" section, i.e. after a line
    containing that title and before the next non-bullet heading line.
    NOTE(review): this assumes the section title is written on a line that
    precedes its bullets -- confirm against the script that writes the reports.

    Args:
        report_path (str): Path to the analysis report file.

    Returns:
        dict: Parsed data for the user. Fields that do not appear in the
        report keep their defaults (None for 'User', 0 for the metrics,
        1 for 'P-Value' and 'Error' for 'Significance').

    Raises:
        ValueError: If a recognised numeric line does not hold a valid number.
    """
    section_title = "Percentage of Time Similarity > 80%"

    # (line prefix, output key, whether a '%' sign is removed before conversion)
    numeric_fields = (
        ("Stage 1 - Average Similarity:", 'Stage 1 - Average Similarity', True),
        ("Stage 2 - Average Similarity:", 'Stage 2 - Average Similarity', True),
        ("Stage 1 - Median Similarity:", 'Stage 1 - Median Similarity', True),
        ("Stage 2 - Median Similarity:", 'Stage 2 - Median Similarity', True),
        ("- Average Similarity Difference", 'Average Similarity Difference', True),
        ("- Median Similarity Difference", 'Median Similarity Difference', True),
        ("- Stage 1 Variance:", 'Stage 1 - Variance', False),
        ("- Stage 2 Variance:", 'Stage 2 - Variance', False),
        ("- Stage 1 Standard Deviation:", 'Stage 1 - Standard Deviation', False),
        ("- Stage 2 Standard Deviation:", 'Stage 2 - Standard Deviation', False),
        ("- T-Statistic:", 'T-Statistic', False),
        ("- P-Value:", 'P-Value', False),
    )
    # Bullets that are only meaningful inside the percentage section
    section_bullets = (
        ("- Stage 1:", 'Stage 1 - % Time Similarity > 80%'),
        ("- Stage 2:", 'Stage 2 - % Time Similarity > 80%'),
    )

    # Defaults; the insertion order here is the column order of the final CSV
    data = {
        'User': None,
        'Stage 1 - Average Similarity': 0,
        'Stage 2 - Average Similarity': 0,
        'Stage 1 - Median Similarity': 0,
        'Stage 2 - Median Similarity': 0,
        'Stage 1 - % Time Similarity > 80%': 0,
        'Stage 2 - % Time Similarity > 80%': 0,
        'Average Similarity Difference': 0,
        'Median Similarity Difference': 0,
        'Stage 1 - Variance': 0,
        'Stage 2 - Variance': 0,
        'Stage 1 - Standard Deviation': 0,
        'Stage 2 - Standard Deviation': 0,
        'T-Statistic': 0,
        'P-Value': 1,  # Default to no significance
        'Significance': 'Error',
    }

    def text_after_colon(line):
        # partition() splits on the first colon only, so "a: b: c" yields "b: c"
        return line.partition(":")[2].strip()

    def number_after_colon(line, strip_percent):
        text = text_after_colon(line)
        if strip_percent:
            text = text.replace('%', '')
        try:
            return float(text)
        except ValueError as exc:
            # Re-raise with the file and line so a bad report is easy to locate
            raise ValueError(
                f"{report_path}: cannot read a number from line {line!r}"
            ) from exc

    in_section = False
    with open(report_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            # Track whether we are inside the percentage section. Bullets and
            # pure separator lines (no letters/digits) do not end a section.
            if section_title in line:
                in_section = True
            elif not line.startswith("-") and any(ch.isalnum() for ch in line):
                in_section = False

            bullet_key = next(
                (key for prefix, key in section_bullets if line.startswith(prefix)),
                None,
            )
            if bullet_key is not None:
                if in_section:
                    data[bullet_key] = number_after_colon(line, True)
                continue

            if line.startswith("User:"):
                data['User'] = text_after_colon(line)
            elif line.startswith("- Result:"):
                data['Significance'] = text_after_colon(line)
            else:
                for prefix, key, strip_percent in numeric_fields:
                    if line.startswith(prefix):
                        data[key] = number_after_colon(line, strip_percent)
                        break

    return data



def aggregate_reports(report_folder, output_csv):
    """
    Aggregate all analysis reports into a single CSV file.

    Every regular file in ``report_folder`` whose name ends with ``.txt`` is
    parsed with ``parse_report`` and becomes one row of the CSV. Files are
    processed in sorted name order so the row order is the same on every run.

    Args:
        report_folder (str): Path to the folder containing all report files.
        output_csv (str): Path to save the aggregated CSV file.
    """
    all_data = []

    # os.listdir() returns names in arbitrary order; sort for a reproducible CSV
    for report_file in sorted(os.listdir(report_folder)):
        report_path = os.path.join(report_folder, report_file)
        # Skip non-report entries, including sub-folders that happen to end in .txt
        if not report_file.endswith(".txt") or not os.path.isfile(report_path):
            continue
        print(f"Processing {report_path}...")
        all_data.append(parse_report(report_path))

    # Build the frame once from the list of per-user dicts
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file (no index column)
    df.to_csv(output_csv, index=False)
    print(f"Aggregated data saved to {output_csv}.")

In [7]:
# Example usage: aggregate every report in the "reports" folder into one CSV
if __name__ == "__main__":
    # Folder containing all user analysis reports, and the output CSV file
    report_folder, output_csv = "reports", "aggregated_reports.csv"
    aggregate_reports(report_folder=report_folder, output_csv=output_csv)

Processing reports\10086yly_analysis_report.txt...
Processing reports\101_analysis_report.txt...
Processing reports\102_analysis_report.txt...
Processing reports\104_analysis_report.txt...
Processing reports\107_analysis_report.txt...
Processing reports\124_analysis_report.txt...
Processing reports\196_analysis_report.txt...
Processing reports\723lyz_analysis_report.txt...
Aggregated data saved to aggregated_reports.csv.
