In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns

# Load data 
file_path = 'NG.csv'
sheet3_data = pd.read_csv(file_path)

# Preprocess the data
# Replace sample size < 2 with "Other"
type_counts = sheet3_data['Type'].value_counts()
sheet3_data['Adjusted_Type'] = sheet3_data['Type'].apply(
    lambda x: x if type_counts[x] >= 2 else 'Other'
)

# Calculate means for each group
group_means = sheet3_data.groupby('Adjusted_Type')[['Input', 'output']].mean()

from scipy.stats import f_oneway, t

def newman_keuls(data, group_column, value_column):
    """
    Objective of the Newman-Keuls Test:
    1. Determine whether there are statistically significant differences between multiple groups.
    2. If the p-value is very small, it indicates that some group means differ significantly, meaning the factor (e.g., Adjusted_Type) has a notable impact on the dependent variable (e.g., Input or Output).
    """
    
    grouped = data.groupby(group_column)[value_column].agg(['mean', 'count', 'var']).sort_values('mean', ascending=False)
    groups = grouped.index.tolist()
    samples = [data[data[group_column] == group][value_column].dropna() for group in groups]
    
    anova_result = f_oneway(*samples)
    p_value = anova_result.pvalue
    
    print(f"ANOVA test p-value: {p_value:.5f}")
    
    if p_value >= 0.05:
        print("No significant differences between groups.")
        return grouped, None  # No significant differences
    
    print("Significant differences detected between groups.")
    
    comparisons = []
    
    for i, group_i in enumerate(groups[:-1]):
        for j, group_j in enumerate(groups[i+1:], start=i+1):
            mean_diff = abs(grouped.loc[group_i, 'mean'] - grouped.loc[group_j, 'mean'])
            n_i, n_j = grouped.loc[group_i, 'count'], grouped.loc[group_j, 'count']
            var_i, var_j = grouped.loc[group_i, 'var'], grouped.loc[group_j, 'var']
            
            pooled_var = ((n_i - 1) * var_i + (n_j - 1) * var_j) / (n_i + n_j - 2)
            se_diff = np.sqrt(pooled_var * (1 / n_i + 1 / n_j))
            q_value = mean_diff / se_diff
            
            df = n_i + n_j - 2
            pairwise_p_value = 1 - t.cdf(q_value, df)
            
            comparisons.append((group_i, group_j, mean_diff, q_value, pairwise_p_value))
    
    # Displaying the results as a table
    comparison_df = pd.DataFrame(comparisons, columns=["Group 1", "Group 2", "Mean Difference", "Q Value", "p-value"])
    print(comparison_df.to_string(index=False))
    
    return grouped, comparison_df

# Perform Newman-Keuls for Input
input_grouped, input_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'Input')

# Perform Newman-Keuls for Output
output_grouped, output_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'output')

ANOVA test p-value: 0.00000
Significant differences detected between groups.
            Group 1             Group 2  Mean Difference  Q Value      p-value
       Bubble Chart        Venn diagram         0.000000      NaN          NaN
       Bubble Chart        Venn Diagram         0.208333 0.347140 3.657559e-01
       Bubble Chart          Upset Plot         0.333333 0.774597 2.475127e-01
       Bubble Chart       Stacked chart         0.333333 0.556009 2.906903e-01
       Bubble Chart         Violin plot         0.355072 0.848862 1.987142e-01
       Bubble Chart           Box chart         0.373626 0.686673 2.465815e-01
       Bubble Chart        Volcano Plot         0.384615 0.548567 2.962996e-01
       Bubble Chart               Other         0.466667 1.242625 1.165408e-01
       Bubble Chart       Survival plot         0.500000 0.801784 2.266267e-01
       Bubble Chart         Radar Chart         0.500000 1.000000 2.113249e-01
       Bubble Chart      Composite Plot         0.5000

  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff


In [1]:
# Load data 
file_path = 'JSSC.csv'
sheet3_data = pd.read_csv(file_path)

# Preprocess the data
# Replace sample size < 2 with "Other"
type_counts = sheet3_data['Type'].value_counts()
sheet3_data['Adjusted_Type'] = sheet3_data['Type'].apply(
    lambda x: x if type_counts[x] >= 2 else 'Other'
)

# Calculate means for each group
group_means = sheet3_data.groupby('Adjusted_Type')[['Input', 'output']].mean()

from scipy.stats import f_oneway, t

def newman_keuls(data, group_column, value_column):
    """
    Objective of the Newman-Keuls Test:
    1. Determine whether there are statistically significant differences between multiple groups.
    2. If the p-value is very small, it indicates that some group means differ significantly, meaning the factor (e.g., Adjusted_Type) has a notable impact on the dependent variable (e.g., Input or Output).
    """
    
    grouped = data.groupby(group_column)[value_column].agg(['mean', 'count', 'var']).sort_values('mean', ascending=False)
    groups = grouped.index.tolist()
    samples = [data[data[group_column] == group][value_column].dropna() for group in groups]
    
    anova_result = f_oneway(*samples)
    p_value = anova_result.pvalue
    
    print(f"ANOVA test p-value: {p_value:.5f}")
    
    if p_value >= 0.05:
        print("No significant differences between groups.")
        return grouped, None  # No significant differences
    
    print("Significant differences detected between groups.")
    
    comparisons = []
    
    for i, group_i in enumerate(groups[:-1]):
        for j, group_j in enumerate(groups[i+1:], start=i+1):
            mean_diff = abs(grouped.loc[group_i, 'mean'] - grouped.loc[group_j, 'mean'])
            n_i, n_j = grouped.loc[group_i, 'count'], grouped.loc[group_j, 'count']
            var_i, var_j = grouped.loc[group_i, 'var'], grouped.loc[group_j, 'var']
            
            pooled_var = ((n_i - 1) * var_i + (n_j - 1) * var_j) / (n_i + n_j - 2)
            se_diff = np.sqrt(pooled_var * (1 / n_i + 1 / n_j))
            q_value = mean_diff / se_diff
            
            df = n_i + n_j - 2
            pairwise_p_value = 1 - t.cdf(q_value, df)
            
            comparisons.append((group_i, group_j, mean_diff, q_value, pairwise_p_value))
    
    # Displaying the results as a table
    comparison_df = pd.DataFrame(comparisons, columns=["Group 1", "Group 2", "Mean Difference", "Q Value", "p-value"])
    print(comparison_df.to_string(index=False))
    
    return grouped, comparison_df

# Perform Newman-Keuls for Input
input_grouped, input_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'Input')

# Perform Newman-Keuls for Output
output_grouped, output_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'output')

ANOVA test p-value: 0.00000
Significant differences detected between groups.
                 Group 1                  Group 2  Mean Difference  Q Value      p-value
       Stacked Bars Plot             3D Line Plot         0.333333 0.500000 3.216650e-01
       Stacked Bars Plot               Trace Plot         0.666667 0.894427 2.108241e-01
       Stacked Bars Plot                  Heatmap         1.133333 2.756713 4.171704e-03
       Stacked Bars Plot             Scatter Plot         1.142857 3.574100 8.469600e-04
       Stacked Bars Plot                 Box Plot         1.256410 3.873790 8.432951e-04
       Stacked Bars Plot                Histogram         1.293333 3.908241 2.968190e-04
       Stacked Bars Plot                Line Plot         1.299145 7.437881 8.980039e-12
       Stacked Bars Plot              Binary Plot         1.333333 1.549193 1.095510e-01
       Stacked Bars Plot                Bar Chart         1.333333 4.113767 8.593842e-04
       Stacked Bars Plot         

  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff


In [2]:
# Load data 
file_path = 'AE.csv'
sheet3_data = pd.read_csv(file_path)

# Preprocess the data
# Replace sample size < 2 with "Other"
type_counts = sheet3_data['Type'].value_counts()
sheet3_data['Adjusted_Type'] = sheet3_data['Type'].apply(
    lambda x: x if type_counts[x] >= 2 else 'Other'
)

# Calculate means for each group
group_means = sheet3_data.groupby('Adjusted_Type')[['Input', 'output']].mean()

from scipy.stats import f_oneway, t

def newman_keuls(data, group_column, value_column):
    """
    Objective of the Newman-Keuls Test:
    1. Determine whether there are statistically significant differences between multiple groups.
    2. If the p-value is very small, it indicates that some group means differ significantly, meaning the factor (e.g., Adjusted_Type) has a notable impact on the dependent variable (e.g., Input or Output).
    """
    
    grouped = data.groupby(group_column)[value_column].agg(['mean', 'count', 'var']).sort_values('mean', ascending=False)
    groups = grouped.index.tolist()
    samples = [data[data[group_column] == group][value_column].dropna() for group in groups]
    
    anova_result = f_oneway(*samples)
    p_value = anova_result.pvalue
    
    print(f"ANOVA test p-value: {p_value:.5f}")
    
    if p_value >= 0.05:
        print("No significant differences between groups.")
        return grouped, None  # No significant differences
    
    print("Significant differences detected between groups.")
    
    comparisons = []
    
    for i, group_i in enumerate(groups[:-1]):
        for j, group_j in enumerate(groups[i+1:], start=i+1):
            mean_diff = abs(grouped.loc[group_i, 'mean'] - grouped.loc[group_j, 'mean'])
            n_i, n_j = grouped.loc[group_i, 'count'], grouped.loc[group_j, 'count']
            var_i, var_j = grouped.loc[group_i, 'var'], grouped.loc[group_j, 'var']
            
            pooled_var = ((n_i - 1) * var_i + (n_j - 1) * var_j) / (n_i + n_j - 2)
            se_diff = np.sqrt(pooled_var * (1 / n_i + 1 / n_j))
            q_value = mean_diff / se_diff
            
            df = n_i + n_j - 2
            pairwise_p_value = 1 - t.cdf(q_value, df)
            
            comparisons.append((group_i, group_j, mean_diff, q_value, pairwise_p_value))
    
    # Displaying the results as a table
    comparison_df = pd.DataFrame(comparisons, columns=["Group 1", "Group 2", "Mean Difference", "Q Value", "p-value"])
    print(comparison_df.to_string(index=False))
    
    return grouped, comparison_df

# Perform Newman-Keuls for Input
input_grouped, input_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'Input')

# Perform Newman-Keuls for Output
output_grouped, output_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'output')

ANOVA test p-value: 0.00169
Significant differences detected between groups.
                Group 1                 Group 2  Mean Difference  Q Value  p-value
              Pie Chart        Time Series Plot         0.300000 0.159086 0.438023
              Pie Chart         3D Surface Plot         0.566667 0.280545 0.392133
              Pie Chart        Stacked Bar Plot         0.587500 0.492298 0.313491
              Pie Chart                Box Plot         0.850000 0.856486 0.199500
              Pie Chart              Polar Plot         0.900000 0.365781 0.361074
              Pie Chart               Wind Rose         0.900000 0.365781 0.361074
              Pie Chart   Verticle Profile Plot         0.900000 0.369461 0.359744
              Pie Chart             Vector Plot         1.400000 0.573270 0.289562
              Pie Chart                Bar Plot         1.435714 2.360844 0.010645
              Pie Chart Geographic Scatter Plot         1.650000 0.967836 0.176114
          

  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff
  q_value = mean_diff / se_diff


In [8]:
# Load data 
file_path = 'JABE.csv'
sheet3_data = pd.read_csv(file_path)

# Replace sample size < 2 with "Other", handling NaN values
type_counts = sheet3_data['Type'].dropna().value_counts()

# Use `.get()` to safely access values and default to "Other" for NaN values
sheet3_data['Adjusted_Type'] = sheet3_data['Type'].apply(
    lambda x: x if pd.notna(x) and type_counts.get(x, 0) >= 2 else 'Other'
)

sheet3_data['Input'] = pd.to_numeric(sheet3_data['Input'], errors='coerce')
sheet3_data['output'] = pd.to_numeric(sheet3_data['output'], errors='coerce')

# Calculate means for each group
group_means = sheet3_data.groupby('Adjusted_Type')[['Input', 'output']].mean()

from scipy.stats import f_oneway, t

def newman_keuls(data, group_column, value_column):
    """
    Objective of the Newman-Keuls Test:
    1. Determine whether there are statistically significant differences between multiple groups.
    2. If the p-value is very small, it indicates that some group means differ significantly, meaning the factor (e.g., Adjusted_Type) has a notable impact on the dependent variable (e.g., Input or Output).
    """
    
    grouped = data.groupby(group_column)[value_column].agg(['mean', 'count', 'var']).sort_values('mean', ascending=False)
    groups = grouped.index.tolist()
    samples = [data[data[group_column] == group][value_column].dropna() for group in groups]
    
    anova_result = f_oneway(*samples)
    p_value = anova_result.pvalue
    
    print(f"ANOVA test p-value: {p_value:.5f}")
    
    if p_value >= 0.05:
        print("No significant differences between groups.")
        return grouped, None  # No significant differences
    
    print("Significant differences detected between groups.")
    
    comparisons = []
    
    for i, group_i in enumerate(groups[:-1]):
        for j, group_j in enumerate(groups[i+1:], start=i+1):
            mean_diff = abs(grouped.loc[group_i, 'mean'] - grouped.loc[group_j, 'mean'])
            n_i, n_j = grouped.loc[group_i, 'count'], grouped.loc[group_j, 'count']
            var_i, var_j = grouped.loc[group_i, 'var'], grouped.loc[group_j, 'var']
            
            pooled_var = ((n_i - 1) * var_i + (n_j - 1) * var_j) / (n_i + n_j - 2)
            se_diff = np.sqrt(pooled_var * (1 / n_i + 1 / n_j))
            q_value = mean_diff / se_diff
            
            df = n_i + n_j - 2
            pairwise_p_value = 1 - t.cdf(q_value, df)
            
            comparisons.append((group_i, group_j, mean_diff, q_value, pairwise_p_value))
    
    # Displaying the results as a table
    comparison_df = pd.DataFrame(comparisons, columns=["Group 1", "Group 2", "Mean Difference", "Q Value", "p-value"])
    print(comparison_df.to_string(index=False))
    
    return grouped, comparison_df

# Perform Newman-Keuls for Input
input_grouped, input_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'Input')

# Perform Newman-Keuls for Output
output_grouped, output_comparisons = newman_keuls(sheet3_data, 'Adjusted_Type', 'output')

ANOVA test p-value: 0.87542
No significant differences between groups.
ANOVA test p-value: 0.43940
No significant differences between groups.
