In [2]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols


# Balanced 2x2 layout: every Factor1 x Factor2 combination is observed twice.
data = pd.DataFrame({
    'Factor1': ['A', 'A', 'B', 'B'] * 2,
    'Factor2': ['X', 'Y'] * 4,
    'Value': [10, 12, 15, 14, 18, 20, 22, 24],
})

# 'Factor1 * Factor2' requests both main effects plus their interaction term.
model = ols('Value ~ Factor1 * Factor2', data=data).fit()

# Type II ANOVA table; its 'F' column carries the F-statistic of each term.
anova_table = sm.stats.anova_lm(model, typ=2)
f_statistics = anova_table['F']

main_effect_Factor1 = f_statistics['Factor1']
main_effect_Factor2 = f_statistics['Factor2']
interaction_effect = f_statistics['Factor1:Factor2']

# Report the F-statistic of every model term, one per line.
for label, f_value in (
    ("Main Effect of Factor1:", main_effect_Factor1),
    ("Main Effect of Factor2:", main_effect_Factor2),
    ("Interaction Effect:", interaction_effect),
):
    print(label, f_value)


Main Effect of Factor1: 0.81227436823105
Main Effect of Factor2: 0.09025270758122757
Interaction Effect: 0.03249097472924198


In [None]:
Q6. Suppose you conducted a one-way ANOVA and obtained an F-statistic of 5.23 and a p-value of 0.02.
What can you conclude about the differences between the groups, and how would you interpret these
results?

In [None]:
Q7. In a repeated measures ANOVA, how would you handle missing data, and what are the potential
consequences of using different methods to handle missing data?

In [None]:
Q8. What are some common post-hoc tests used after ANOVA, and when would you use each one? Provide
an example of a situation where a post-hoc test might be necessary.

In [None]:
Q9. A researcher wants to compare the mean weight loss of three diets: A, B, and C. They collect data from
50 participants who were randomly assigned to one of the diets. Conduct a one-way ANOVA using Python
to determine if there are any significant differences between the mean weight loss of the three diets.
Report the F-statistic and p-value, and interpret the results.

In [3]:
import numpy as np
import scipy.stats as stats


# Weight-loss observations for each diet (25 values per group).
diet_A = np.array([
    1.5, 2.0, 2.2, 1.8, 2.5, 2.3, 2.7, 1.9, 2.1, 2.0,
    1.6, 1.8, 2.2, 2.4, 2.1, 1.7, 2.3, 1.6, 1.9, 2.0,
    2.4, 2.5, 2.1, 1.7, 1.8,
])
diet_B = np.array([
    1.0, 1.2, 1.5, 1.3, 1.4, 1.0, 1.2, 1.6, 1.1, 1.3,
    1.4, 1.2, 1.5, 1.6, 1.1, 1.2, 1.0, 1.3, 1.4, 1.1,
    1.3, 1.5, 1.4, 1.2, 1.0,
])
diet_C = np.array([
    3.5, 3.8, 4.0, 3.7, 4.2, 3.9, 4.1, 3.6, 4.3, 4.2,
    4.0, 3.8, 4.1, 3.7, 4.2, 3.6, 4.0, 3.9, 3.8, 4.1,
    3.7, 4.2, 3.6, 4.1, 3.8,
])

alpha = 0.05  # significance threshold for the test

# One-way ANOVA comparing the three diet groups.
f_statistic, p_value = stats.f_oneway(diet_A, diet_B, diet_C)

# Only the opening and closing sentences depend on the outcome; the
# statistics themselves are reported the same way in both cases.
if p_value < alpha:
    headline = "The one-way ANOVA is statistically significant."
    conclusion = "There are significant differences between the mean weight loss of the three diets."
else:
    headline = "The one-way ANOVA is not statistically significant."
    conclusion = "There are no significant differences between the mean weight loss of the three diets."

print(
    headline,
    f"F-statistic: {f_statistic:.4f}",
    f"P-value: {p_value:.4f}",
    conclusion,
    sep="\n",
)


The one-way ANOVA is statistically significant.
F-statistic: 725.6043
P-value: 0.0000
There are significant differences between the mean weight loss of the three diets.


In [None]:
Q10. A company wants to know if there are any significant differences in the average time it takes to
complete a task using three different software programs: Program A, Program B, and Program C. They
randomly assign 30 employees to one of the programs and record the time it takes each employee to
complete the task. Conduct a two-way ANOVA using Python to determine if there are any main effects or
interaction effects between the software programs and employee experience level (novice vs.
experienced). Report the F-statistics and p-values, and interpret the results.

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 30 observations: the repeating Software (period 3) and Experience (period 2)
# patterns yield each of the six combinations five times.
data = pd.DataFrame({
    'Software': ['A', 'B', 'C'] * 10,
    'Experience': ['Novice', 'Experienced'] * 15,
    'Time': [
        12, 14, 15, 13, 16, 14, 11, 12, 15, 13,
        14, 15, 12, 13, 16, 11, 12, 14, 14, 15,
        15, 13, 12, 16, 13, 14, 14, 15, 13, 16,
    ],
})

alpha = 0.05  # significance threshold applied to every effect

# Two-way model with both main effects and the Software x Experience interaction.
model = ols('Time ~ Software * Experience', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print("Two-Way ANOVA Results:")
print(anova_table)

# The 'PR(>F)' column holds the p-value of each term in the table.
p_column = anova_table['PR(>F)']
p_interaction = p_column['Software:Experience']
p_software = p_column['Software']
p_experience = p_column['Experience']

# Interpret each effect against alpha, interaction first.
print(
    "There is a significant interaction effect between software and experience level."
    if p_interaction < alpha
    else "There is no significant interaction effect between software and experience level."
)
print(
    "There is a significant main effect of software."
    if p_software < alpha
    else "There is no significant main effect of software."
)
print(
    "There is a significant main effect of experience level."
    if p_experience < alpha
    else "There is no significant main effect of experience level."
)


In [None]:
Q11. An educational researcher is interested in whether a new teaching method improves student test
scores. They randomly assign 100 students to either the control group (traditional teaching method) or the
experimental group (new teaching method) and administer a test at the end of the semester. Conduct a
two-sample t-test using Python to determine if there are any significant differences in test scores
between the two groups. If the results are significant, follow up with a post-hoc test to determine which
group(s) differ significantly from each other.

In [4]:
import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd


# Test scores for the traditional-method (control) and new-method (experimental) groups.
control_group = np.array([
    85, 78, 90, 79, 88, 92, 76, 81, 84, 86,
    80, 83, 79, 89, 75, 82, 87, 91, 77, 85,
    80, 76, 83, 81, 88, 90, 79, 82, 84, 86,
    87, 78, 89, 77, 92, 85, 88, 83, 80, 86,
    90, 82, 79, 81, 84, 85, 87, 88, 91,
])
experimental_group = np.array([
    92, 89, 94, 91, 93, 95, 87, 90, 92, 93,
    90, 91, 88, 94, 86, 91, 93, 95, 88, 92,
    89, 87, 90, 91, 94, 95, 89, 90, 91, 93,
    92, 87, 93, 88, 95, 92, 93, 91, 89, 94,
    94, 90, 88, 90, 92, 93, 93, 95,
])

alpha = 0.05  # significance threshold for the test

# Independent two-sample t-test with scipy's default settings.
t_statistic, p_value = stats.ttest_ind(control_group, experimental_group)

# Only the opening and closing sentences depend on the outcome; the
# statistics themselves are reported the same way in both cases.
if p_value < alpha:
    headline = "The two-sample t-test is statistically significant."
    conclusion = "There is a significant difference in test scores between the two groups."
else:
    headline = "The two-sample t-test is not statistically significant."
    conclusion = "There is no significant difference in test scores between the two groups."

print(
    headline,
    f"T-statistic: {t_statistic:.4f}",
    f"P-value: {p_value:.4f}",
    conclusion,
    sep="\n",
)

# Post-hoc comparison, run only when the t-test result is significant.
if p_value < alpha:
    samples = {'Control': control_group, 'Experimental': experimental_group}

    # Pool all scores and tag each one with the name of the group it came from.
    data = np.concatenate(list(samples.values()))
    group_labels = [name for name, scores in samples.items() for _ in scores]

    tukey_results = pairwise_tukeyhsd(data, group_labels)
    print(tukey_results)


The two-sample t-test is statistically significant.
T-statistic: -9.7431
P-value: 0.0000
There is a significant difference in test scores between the two groups.
  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1    group2    meandiff p-adj lower  upper reject
-------------------------------------------------------
Control Experimental   7.4549   0.0 5.9359 8.974   True
-------------------------------------------------------


In [None]:
Q12. A researcher wants to know if there are any significant differences in the average daily sales of three
retail stores: Store A, Store B, and Store C. They randomly select 30 days and record the sales for each store
on those days. Conduct a repeated measures ANOVA using Python to determine if there are any
significant differences in sales between the three stores. If the results are significant, follow up with a
post-hoc test to determine which store(s) differ significantly from each other.