In [8]:
import numpy as np              #import numpy with the alias np
import matplotlib.pyplot as plt #import matplotlib.pyplot with the alias plt
import pandas as pd             #import pandas with the alias pd
import seaborn as sns           #import seaborn with the alias sns
import scipy.stats as ss

In [9]:
# Load the dataset from "iris.csv" (path is relative to the working directory).
# The cells below select rows through its 'species' column.
iris = pd.read_csv("iris.csv")

Computing confidence intervals manually: compute the sample mean, the sample standard deviation and the t-critical value, then combine them into the interval.
1. Choose a feature.
2. Iterate over classes
3. Compute mean for that class and feature
4. Compute standard deviation for that class and feature
5. Iterate over alpha values.
6. Compute the t-critical value using the `ppf` function (percent-point function, i.e. the inverse CDF) of the t-distribution.
7. Compute standard error for the mean.
8. Compute the confidence interval as mean ± t-critical value × standard error.

In [10]:
# Manual t-based confidence intervals for the mean of one feature, per species.
feature = "petal_length"
ROUND = 4  # decimal places shown for the interval bounds
print(f"Chosen feature: {feature}")

for class_ in iris["species"].unique():
    # Values of the chosen feature restricted to the current species.
    data = iris.loc[iris["species"] == class_, feature]
    n = data.shape[0]           # sample size
    mean = np.mean(data)        # sample mean
    std = np.std(data, ddof=1)  # sample standard deviation (n - 1 denominator)

    rule = "_" * 5
    print(rule + class_ + rule)
    print(f"Mean: {mean}")
    print(f"Standard Deviation: {std}")

    # Neither quantity depends on alpha, so compute them once per species.
    df = n - 1              # degrees of freedom of the t-distribution
    sem = std / np.sqrt(n)  # standard error of the mean

    for alpha in (0.05, 0.01, 0.001):
        # Two-sided interval: alpha / 2 of the probability mass goes in each tail,
        # so the critical value is the (1 - alpha / 2) quantile.
        t_critical = ss.t.ppf(1 - alpha / 2, df)

        # Half-width of the interval around the sample mean.
        margin = t_critical * sem
        ci_lower = mean - margin
        ci_upper = mean + margin

        print(f"Confidence interval α={alpha} -> ({round(ci_lower, ROUND)}, {round(ci_upper, ROUND)})")


Chosen feature: petal_length
_____Iris-setosa_____
Mean: 1.464
Standard Deviation: 0.17351115943644546
Confidence interval α=0.05 -> (1.4147, 1.5133)
Confidence interval α=0.01 -> (1.3982, 1.5298)
Confidence interval α=0.001 -> (1.3781, 1.5499)
_____Iris-versicolor_____
Mean: 4.26
Standard Deviation: 0.46991097723995795
Confidence interval α=0.05 -> (4.1265, 4.3935)
Confidence interval α=0.01 -> (4.0819, 4.4381)
Confidence interval α=0.001 -> (4.0274, 4.4926)
_____Iris-virginica_____
Mean: 5.5520000000000005
Standard Deviation: 0.5518946956639834
Confidence interval α=0.05 -> (5.3952, 5.7088)
Confidence interval α=0.01 -> (5.3428, 5.7612)
Confidence interval α=0.001 -> (5.2788, 5.8252)


Computing confidence intervals using the SciPy function `scipy.stats.t.interval()`
1. Choose a feature.
2. Iterate over classes
3. Compute mean for that class and feature
4. Iterate over alpha values
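5. Compute confidence intervals, passing the sample mean as the location and the standard error of the mean (`ss.sem`) as the scale.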

In [11]:
# Confidence intervals for the per-species mean of one feature, obtained from
# SciPy's t-distribution instead of being assembled by hand.
feature = "petal_length"
ROUND = 4  # decimal places shown for the interval bounds
print(f"Chosen feature: {feature}")

for class_ in iris["species"].unique():
    # Values of the chosen feature restricted to the current species.
    data = iris.loc[iris["species"] == class_, feature]
    mean = np.mean(data)

    rule = "_" * 5
    print(rule + class_ + rule)
    print(f"Mean: {mean}")

    # The standard error and the degrees of freedom do not depend on alpha.
    sem = ss.sem(data)
    dof = len(data) - 1

    for alpha in (0.05, 0.01, 0.001):
        # The interval is centred on the sample mean and scaled by the standard error.
        ci = ss.t.interval(confidence=1 - alpha, df=dof, loc=mean, scale=sem)
        lower, upper = (round(bound, ROUND) for bound in ci)
        print(f"Confidence interval α={alpha} -> ({lower}, {upper})")


Chosen feature: petal_length
_____Iris-setosa_____
Mean: 1.464
Confidence interval α=0.05 -> (1.4147, 1.5133)
Confidence interval α=0.01 -> (1.3982, 1.5298)
Confidence interval α=0.001 -> (1.3781, 1.5499)
_____Iris-versicolor_____
Mean: 4.26
Confidence interval α=0.05 -> (4.1265, 4.3935)
Confidence interval α=0.01 -> (4.0819, 4.4381)
Confidence interval α=0.001 -> (4.0274, 4.4926)
_____Iris-virginica_____
Mean: 5.5520000000000005
Confidence interval α=0.05 -> (5.3952, 5.7088)
Confidence interval α=0.01 -> (5.3428, 5.7612)
Confidence interval α=0.001 -> (5.2788, 5.8252)


In [22]:
# Normality checks (Shapiro-Wilk and Anderson-Darling) for every feature within
# every species, reported as one decision table per test.
from scipy.stats import shapiro, anderson

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Every column except the class label is treated as a feature. Dropping the
# label by name avoids depending on it being the last column.
features = iris.columns.drop('species')
classes = iris['species'].unique()  # Unique species classes

# Significance levels at which both tests are evaluated.
alphas = [0.05, 0.01]

# One dictionary per (feature, class, alpha); converted to DataFrames at the end.
results_shapiro = []
results_anderson = []

for feature in features:
    for class_ in classes:
        data = iris.loc[iris['species'] == class_, feature].values

        # Shapiro-Wilk returns a p-value, which is compared with alpha directly.
        shapiro_stat, shapiro_p = shapiro(data)

        # Anderson-Darling (against the normal distribution) returns no p-value
        # here; the statistic is compared with tabulated critical values.
        anderson_result = anderson(data, dist='norm')
        anderson_stat = anderson_result.statistic
        anderson_critical_values = anderson_result.critical_values

        # The critical values are ordered like `significance_level`, which is
        # expressed in percent (15, 10, 5, 2.5, 1 for dist='norm'). Index them
        # by alpha so each decision uses the critical value of its own level;
        # taking the first two entries would use the 15% and 10% values.
        critical_by_alpha = {
            round(level / 100, 6): critical
            for level, critical in zip(
                anderson_result.significance_level, anderson_critical_values
            )
        }

        # Record results for Shapiro-Wilk test.
        # "Accept" means the normality hypothesis is not rejected at this alpha.
        for alpha in alphas:
            decision_shapiro = "Reject" if shapiro_p < alpha else "Accept"
            results_shapiro.append({
                "Test": "Shapiro-Wilk",
                "Feature": feature,
                "Class": class_,
                "Alpha": alpha,
                "Statistic": shapiro_stat,
                "P-value": shapiro_p,
                "Decision": decision_shapiro
            })

        # Record results for Anderson-Darling test.
        for alpha in alphas:
            try:
                critical_value = critical_by_alpha[round(alpha, 6)]
            except KeyError:
                raise ValueError(
                    f"Anderson-Darling has no tabulated critical value for alpha={alpha}; "
                    f"available levels: {sorted(critical_by_alpha)}"
                ) from None
            # Normality is rejected when the statistic exceeds the critical value.
            decision_anderson = "Reject" if anderson_stat > critical_value else "Accept"
            results_anderson.append({
                "Test": "Anderson-Darling",
                "Feature": feature,
                "Class": class_,
                "Alpha": alpha,
                "Statistic": anderson_stat,
                "Critical Value": critical_value,
                "Decision": decision_anderson
            })

# Display the results as DataFrame
print(pd.DataFrame(results_shapiro))
print(pd.DataFrame(results_anderson))


            Test       Feature            Class  Alpha  Statistic   P-value Decision
0   Shapiro-Wilk  sepal_length      Iris-setosa   0.05   0.977699  0.459513   Accept
1   Shapiro-Wilk  sepal_length      Iris-setosa   0.01   0.977699  0.459513   Accept
2   Shapiro-Wilk  sepal_length  Iris-versicolor   0.05   0.977836  0.464737   Accept
3   Shapiro-Wilk  sepal_length  Iris-versicolor   0.01   0.977836  0.464737   Accept
4   Shapiro-Wilk  sepal_length   Iris-virginica   0.05   0.971179  0.258315   Accept
5   Shapiro-Wilk  sepal_length   Iris-virginica   0.01   0.971179  0.258315   Accept
6   Shapiro-Wilk   sepal_width      Iris-setosa   0.05   0.968692  0.204657   Accept
7   Shapiro-Wilk   sepal_width      Iris-setosa   0.01   0.968692  0.204657   Accept
8   Shapiro-Wilk   sepal_width  Iris-versicolor   0.05   0.974133  0.337995   Accept
9   Shapiro-Wilk   sepal_width  Iris-versicolor   0.01   0.974133  0.337995   Accept
10  Shapiro-Wilk   sepal_width   Iris-virginica   0.05   0.967391