In [1]:
from sklearn.datasets import load_iris
import pandas as pd
from scipy.stats import ttest_ind

# Build a DataFrame from the iris bunch: one column per feature, plus the
# numeric species code and the matching species name.
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target
code_to_name = dict(enumerate(iris.target_names))
iris_df['species_name'] = iris_df['species'].map(code_to_name)

# Pick out the petal lengths of the two species being compared
species_1 = 'setosa'
species_2 = 'versicolor'
petal_length = iris_df['petal length (cm)']
species_1_data = petal_length[iris_df['species_name'] == species_1]
species_2_data = petal_length[iris_df['species_name'] == species_2]

# Sample mean of petal length for each species
mean_species_1, mean_species_2 = species_1_data.mean(), species_2_data.mean()

# Two-sample t-test on the two groups; equal_var=False means the test does
# not assume the groups share a common variance.
t_stat, p_value = ttest_ind(species_1_data, species_2_data, equal_var=False)

mean_species_1, mean_species_2, t_stat, p_value


(1.4620000000000002, 4.26, -39.492719391538095, 9.934432957587695e-46)

Interpretation of Results:
t-statistic: -39.49
p-value: 9.934432957587695e-46
Given the extremely small p-value (far below 0.05, effectively 0), we reject the null hypothesis that the two species have the same mean petal length. This is strong evidence that the mean petal lengths of Setosa (about 1.46 cm) and Versicolor (4.26 cm) are significantly different.

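The t-test conducted above is an independent two-sample t-test: it assumes the samples from the two species (Setosa and Versicolor) are independent of each other. Because it was run with `equal_var=False`, it is Welch's variant, which does not assume the two groups have equal variances.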

In [6]:
# One-sample Z-test of mean petal length against 5, run once per species
from statsmodels.stats.weightstats import ztest
import pandas as pd

# Map each species name to the (Z-statistic, p-value) pair returned by ztest
results = {}
for species in iris.target_names:
    species_petal_length = iris_df.loc[
        iris_df['species_name'] == species, 'petal length (cm)'
    ]
    results[species] = ztest(species_petal_length, value=5)

# One row per species, one column per test output
simple_results_df = pd.DataFrame.from_dict(
    results, orient='index', columns=['Z-statistic', 'p-value']
)

# Show the table
print(simple_results_df)


            Z-statistic       p-value
setosa      -144.056560  0.000000e+00
versicolor   -11.135280  8.447936e-29
virginica      7.072417  1.522580e-12


In [8]:
from statsmodels.stats.weightstats import ztest
import pandas as pd

# Sample-size cut-off: the Z-test is run only for species with strictly more
# than this many observations.
threshold = 30

# One-sample Z-test of mean petal length against 5 for each species, skipping
# any species whose sample is not larger than `threshold`.
results = {}
for species in iris.target_names:
    species_data = iris_df.loc[iris_df['species_name'] == species, 'petal length (cm)']
    if len(species_data) > threshold:
        z_stat, p_value = ztest(species_data, value=5)
        results[species] = {'Z-statistic': z_stat, 'p-value': p_value}
    else:
        # This branch also covers n == threshold, so the label says "<=" and
        # is built from `threshold` rather than a hard-coded 30.
        results[species] = {'Z-statistic': None, 'p-value': f'Sample size <= {threshold}'}

# Outer keys (species) become rows after the transpose
filtered_results_df = pd.DataFrame(results).T

# Display the results
print(filtered_results_df)


            Z-statistic       p-value
setosa      -144.056560  0.000000e+00
versicolor   -11.135280  8.447936e-29
virginica      7.072417  1.522580e-12


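Each species' sample size n is checked against the threshold: if n > 30 the Z-test is performed; otherwise it is skipped and that species' row records a note about the insufficient sample size instead of a p-value. In this run all three species pass the check, so every row shows a Z-statistic and p-value.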

In [9]:
from scipy.stats import f_oneway

# Split the petal-width column into one Series per species
petal_width = iris_df['petal width (cm)']
species_label = iris_df['species_name']
setosa_petal_width = petal_width[species_label == 'setosa']
versicolor_petal_width = petal_width[species_label == 'versicolor']
virginica_petal_width = petal_width[species_label == 'virginica']

# One-way ANOVA across the three species groups
anova_stat, anova_p_value = f_oneway(
    setosa_petal_width,
    versicolor_petal_width,
    virginica_petal_width,
)

anova_stat, anova_p_value


(960.007146801809, 4.169445839443116e-85)

There is a statistically significant difference in mean petal width among the three species (Setosa, Versicolor, and Virginica): the ANOVA p-value is extremely small (about 4.17e-85), so the null hypothesis of equal means is rejected.

In [10]:
# Pearson correlation (the pandas default) of sepal length with petal length
sepal_len = iris_df['sepal length (cm)']
petal_len = iris_df['petal length (cm)']
correlation = sepal_len.corr(petal_len, method='pearson')

correlation

0.8717537758865831

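The Pearson correlation coefficient is 0.87, which indicates a strong positive linear correlation between sepal length and petal length.
This means that flowers with longer sepals also tend to have longer petals. Note that a correlation describes association only: it does not by itself show that the two lengths are strictly proportional or that one causes the other.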