<a href="https://colab.research.google.com/github/RJAbuNasser/Final-Project/blob/main/Latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit_posthocs

Collecting scikit_posthocs
  Downloading scikit_posthocs-0.11.2-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.11.2-py3-none-any.whl (33 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.11.2


In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, wilcoxon, friedmanchisquare
import scikit_posthocs as sp
import zipfile
import os
from scipy.stats import ttest_rel

def plot_data(data, title_prefix):
    fig, axes = plt.subplots(1, 4, figsize=(24, 6))
    sns.histplot(data, bins=20, kde=True, ax=axes[0])
    axes[0].set_title(f'{title_prefix} Histogram - KDE')
    sns.boxplot(data=data, ax=axes[1])
    axes[1].set_title(f'{title_prefix} Boxplot')
    sns.violinplot(data=data, ax=axes[2])
    axes[2].set_title(f'{title_prefix} Violin Plot')
    sns.scatterplot(x=range(len(data)), y=data, ax=axes[3])
    axes[3].set_title(f'{title_prefix} Scatter Plot')
    plt.tight_layout()
    plt.show()
    return fig, axes

def test_normality(data):
    normality_results = {}
    for group, values in data.items():
        all_values = []
        if type(values) is dict:
            for sublist in values.values():
                if type(sublist) in [list, np.ndarray]:
                    all_values.extend(sublist)
                else:
                    all_values.append(sublist)
        else:
            if type(values) in [list, np.ndarray]:
                all_values = values
            else:
                all_values = [values]

        if len(all_values) > 2:
            all_values = np.array(all_values)
            if np.all(np.isfinite(all_values)):
                stat, p_value = shapiro(all_values)
                normality_results[group] = p_value
            else:
                normality_results[group] = None
        else:
            normality_results[group] = None
    return normality_results

def perform_tests(data):
    test_results = {'shapiro': test_normality(data)}
    groups = list(data.keys())
    reference = np.array(data[groups[0]]) if type(data[groups[0]]) in [list, np.ndarray] else None
    if reference is not None and len(reference) > 0:
        wilcoxon_results = {}
        ttest_results = {}
        for group, values in data.items():
            group_values = np.array(values) if type(values) in [list, np.ndarray] else None
            if group_values is not None and len(group_values) == len(reference):
                wilcoxon_results[group] = wilcoxon(reference, group_values)[1]
                ttest_results[group] = ttest_rel(reference, group_values)[1]
            else:
                wilcoxon_results[group] = None
                ttest_results[group] = None
        test_results['wilcoxon'] = wilcoxon_results
        test_results['ttest'] = ttest_results
    friedman_values = [np.array(values) for values in data.values() if type(values) in [list, np.ndarray] and len(values) > 2]
    if len(friedman_values) > 1:
        friedman_stat, friedman_p_value = friedmanchisquare(*friedman_values)
        test_results['friedman'] = friedman_p_value if not np.isnan(friedman_stat) else None
    return test_results

def post_hoc_analysis(summary_df, alpha=0.05):
    data_array = summary_df[['Mean Error', 'Std Dev']].values
    post_hoc_results = {
        'Bonferroni': sp.posthoc_dunn(data_array, p_adjust='bonferroni'),
        'Holm': sp.posthoc_dunn(data_array, p_adjust='holm'),
        'Holland': sp.posthoc_dunn(data_array, p_adjust='holland'),
        'Hochberg': sp.posthoc_dunn(data_array, p_adjust='hochberg'),
        'Hommel': sp.posthoc_dunn(data_array, p_adjust='hommel')
    }
    return post_hoc_results

def bad_sheet_name(name):
    return "".join("_" if char in r'\/?*[]' else char for char in name)

def save_summary(mean_errors, std_devs, shapiro_results, folder_name, output_file_prefix):
    summary = [{
        "Group": group,
        "Mean Error": mean_errors[group],
        "Std Dev": std_devs[group],
        "Shapiro p-value": shapiro_results[group]
    } for group in mean_errors.keys()]

    summary_df = pd.DataFrame(summary)
    summary_file = f"{output_file_prefix}_{folder_name}.xlsx"

    with pd.ExcelWriter(summary_file, engine='openpyxl') as writer:
        for function, data in mean_errors.items():
            corrected_name = bad_sheet_name(function)
            df = pd.DataFrame({
                "Mean Error": [mean_errors[function]],
                "Std Dev": [std_devs[function]],
                "Shapiro p-value": [shapiro_results[function]]
            })
            df.to_excel(writer, sheet_name=corrected_name, index=False)

    return summary_df, summary_file

def process_xlsx(data_zip, file):
    with data_zip.open(file) as f:
        df = pd.read_excel(f, sheet_name=None)
    return df

def calculate_errors(full_data, actual_data):
    errors, mean_errors, std_devs = {}, {}, {}
    for sheet_name, sheet_df in full_data.items():
        actual_sheet = actual_data[sheet_name]
        sheet_errors = (sheet_df.values.flatten() - actual_sheet.values.flatten()).tolist()
        errors[sheet_name] = sheet_errors
        mean_errors[sheet_name] = np.mean(sheet_errors)
        std_devs[sheet_name] = np.std(sheet_df.values)
    return errors, mean_errors, std_devs

def generate_plots(data_10d, data_20d, output_file_prefix):
    for group, data in data_10d.items():
        for sheet_name, values in data.items():
            title_prefix = f"{group} {sheet_name} 10-D"
            plot_data(values, title_prefix)
    for group, data in data_20d.items():
        for sheet_name, values in data.items():
            title_prefix = f"{group} {sheet_name} 20-D"
            plot_data(values, title_prefix)

def process_zip(data_zip_path, output_file_prefix):
    with zipfile.ZipFile(data_zip_path, 'r') as data_zip:
        data_10d, data_20d = {}, {}
        mean_errors_10d, std_devs_10d = {}, {}
        mean_errors_20d, std_devs_20d = {}, {}

        for file in data_zip.namelist():
            if '_full' in file and '10-D' in file:
                full_data = process_xlsx(data_zip, file)
                actual_data = process_xlsx(data_zip, file.replace('_full', '_actual'))
                errors, mean_error, std_dev = calculate_errors(full_data, actual_data)
                data_10d[file] = errors
                mean_errors_10d[file] = mean_error
                std_devs_10d[file] = std_dev
            elif '_full' in file and '20-D' in file:
                full_data = process_xlsx(data_zip, file)
                actual_data = process_xlsx(data_zip, file.replace('_full', '_actual'))
                errors, mean_error, std_dev = calculate_errors(full_data, actual_data)
                data_20d[file] = errors
                mean_errors_20d[file] = mean_error
                std_devs_20d[file] = std_dev

        test_results_10d = perform_tests(mean_errors_10d)
        test_results_20d = perform_tests(mean_errors_20d)

        summary_df_10d, summary_file_10d = save_summary(mean_errors_10d, std_devs_10d, test_results_10d['shapiro'], '10-D', output_file_prefix)
        summary_df_20d, summary_file_20d = save_summary(mean_errors_20d, std_devs_20d, test_results_20d['shapiro'], '20-D', output_file_prefix)

        posthoc_10d = post_hoc_analysis(summary_df_10d)
        posthoc_20d = post_hoc_analysis(summary_df_20d)

        generate_plots(data_10d, data_20d, output_file_prefix)

process_zip("Test and Actual.zip", "output")

  res = hypotest_fun_out(*samples, **kwds)


TypeError: '<' not supported between instances of 'dict' and 'dict'