In [4]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

def gauge_rr_variance_components(ms_part, ms_operator, ms_interaction, ms_error,
                                 n_parts, n_operators, n_reps):
    """Estimate gauge R&R variance components from two-way ANOVA mean squares.

    A mean square is not itself a variance component: for a balanced crossed
    study each one is a weighted mix of several components (for example
    E[MS_part] = var_error + n_reps * var_interaction
                 + n_operators * n_reps * var_part).
    The components are recovered by differencing the mean squares and
    dividing by the matching multiplier.

    Args:
        ms_part: Mean square of the Part main effect.
        ms_operator: Mean square of the Operator main effect.
        ms_interaction: Mean square of the Part x Operator interaction.
        ms_error: Residual (within-cell) mean square.
        n_parts: Number of distinct parts in the study.
        n_operators: Number of distinct operators in the study.
        n_reps: Number of repeated measurements per part/operator cell.

    Returns:
        dict with keys 'Repeatability', 'Reproducibility' (operator plus
        interaction component) and 'Part-to-Part', in that order.
    """
    # A negative difference means the estimate is indistinguishable from
    # zero; a variance cannot be negative, so clamp it.
    interaction = max((ms_interaction - ms_error) / n_reps, 0.0)
    operator = max((ms_operator - ms_interaction) / (n_parts * n_reps), 0.0)
    part = max((ms_part - ms_interaction) / (n_operators * n_reps), 0.0)
    return {
        'Repeatability': ms_error,
        'Reproducibility': operator + interaction,
        'Part-to-Part': part,
    }


# Run the analysis only when executed as a script or notebook cell, so the
# helper above can be imported without touching the data file. The names
# assigned below are still module-level globals.
if __name__ == "__main__":
    # Load the dataset (assuming it's in tab-separated format)
    file_path = 'height.dat'  # Update with your file path
    data = pd.read_csv(file_path, sep='\t')

    # Print the data to verify
    print(data.head())

    # Perform the ANOVA: both main effects plus their interaction.
    # NOTE(review): anova_lm divides every term's mean square by the residual
    # mean square. A random-effects gauge study conventionally tests Part and
    # Operator against the interaction mean square instead -- confirm which
    # convention Question 9 expects.
    model = ols('Value ~ C(Part) + C(Operator) + C(Part):C(Operator)', data=data).fit()
    anova_results = anova_lm(model)

    # Print ANOVA table
    print(anova_results)

    # Question 6: Degrees of freedom for the factor Part
    df_part = anova_results.loc['C(Part)', 'df']
    print(f"Degrees of freedom for Part: {df_part}")

    # Question 7: Degrees of freedom for the factor Operator
    df_operator = anova_results.loc['C(Operator)', 'df']
    print(f"Degrees of freedom for Operator: {df_operator}")

    # Question 8: Degrees of freedom for the factor Part:Operator interaction
    df_part_operator = anova_results.loc['C(Part):C(Operator)', 'df']
    print(f"Degrees of freedom for Part:Operator interaction: {df_part_operator}")

    # Question 9: Statistically significant factors
    # The Residual row has a NaN p-value, so the comparison drops it.
    significant_factors = anova_results[anova_results['PR(>F)'] < 0.05].index.tolist()
    print(f"Statistically significant factors: {significant_factors}")

    # Question 10: Variance due to repeatability (Mean Square Within)
    mean_square_within = anova_results.loc['Residual', 'mean_sq']
    print(f"Variance due to repeatability (Mean Square Within): {mean_square_within}")

    # Study dimensions needed to turn mean squares into variance components.
    # Assumes a balanced design (every part/operator cell has the same number
    # of repetitions).
    n_parts = data['Part'].nunique()
    n_operators = data['Operator'].nunique()
    n_reps = len(data) // (n_parts * n_operators)

    # Question 11: Largest component of variance
    variance_components = gauge_rr_variance_components(
        ms_part=anova_results.loc['C(Part)', 'mean_sq'],
        ms_operator=anova_results.loc['C(Operator)', 'mean_sq'],
        ms_interaction=anova_results.loc['C(Part):C(Operator)', 'mean_sq'],
        ms_error=mean_square_within,
        n_parts=n_parts,
        n_operators=n_operators,
        n_reps=n_reps,
    )
    largest_component = max(variance_components, key=variance_components.get)
    print(f"Largest component of variance: {largest_component}")

    # Question 12: Smallest component of variance
    smallest_component = min(variance_components, key=variance_components.get)
    print(f"Smallest component of variance: {smallest_component}")

    # Question 13: Higher percent contribution, Repeatability or Reproducibility?
    total_variance = sum(variance_components.values())
    percent_repeatability = (variance_components['Repeatability'] / total_variance) * 100
    percent_reproducibility = (variance_components['Reproducibility'] / total_variance) * 100

    higher_contribution = 'Repeatability' if percent_repeatability > percent_reproducibility else 'Reproducibility'
    print(f"Higher percent contribution: {higher_contribution}")

# Additional calculations for other questions can follow similar steps.


   Part  Operator  Repetition    Value
0     1         1           1  488.418
1     2         1           1  484.798
2     3         1           1  489.445
3     4         1           1  485.758
4     5         1           1  489.043
                       df      sum_sq    mean_sq           F        PR(>F)
C(Part)               9.0  276.387954  30.709773  117.041911  2.599400e-15
C(Operator)           1.0    5.511578   5.511578   21.005873  1.802444e-04
C(Part):C(Operator)   9.0    3.058264   0.339807    1.295082  2.992650e-01
Residual             20.0    5.247654   0.262383         NaN           NaN
Degrees of freedom for Part: 9.0
Degrees of freedom for Operator: 1.0
Degrees of freedom for Part:Operator interaction: 9.0
Statistically significant factors: ['C(Part)', 'C(Operator)']
Variance due to repeatability (Mean Square Within): 0.26238269999999475
Largest component of variance: Part-to-Part
Smallest component of variance: Repeatability
Higher percent contribution: Reproducibilit