In [2]:
# Importing necessary libraries
import pandas as pd
from scipy.stats import ttest_ind, f_oneway, chi2_contingency

# Load the Iris CSV file.
# The path is relative to the notebook's working directory and is read through
# `file_path`, so the path stated here is always the one that is actually loaded
# (previously the variable held an unused absolute path that differed from the
# literal passed to read_csv).
file_path = 'iris.csv'
iris_data = pd.read_csv(file_path)

# Display the first few rows to understand the data structure
iris_data.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# Two-sample t-test: compare sepal length between Iris-setosa and Iris-versicolor.

# Step 1: Pull the sepal-length column for each of the two species
setosa_sepal_length = iris_data.loc[iris_data['Species'] == 'Iris-setosa', 'SepalLengthCm']
versicolor_sepal_length = iris_data.loc[iris_data['Species'] == 'Iris-versicolor', 'SepalLengthCm']

# Step 2: Descriptive statistics for each group
setosa_mean, setosa_std = setosa_sepal_length.mean(), setosa_sepal_length.std()
versicolor_mean, versicolor_std = versicolor_sepal_length.mean(), versicolor_sepal_length.std()

print("Setosa - Mean:", setosa_mean, "Std Dev:", setosa_std)
print("Versicolor - Mean:", versicolor_mean, "Std Dev:", versicolor_std)

# Step 3: Independent two-sample t-test, using SciPy's default settings
t_stat, p_value = ttest_ind(setosa_sepal_length, versicolor_sepal_length)

# Step 4: Report the test statistic and its p-value
print("t-statistic:", t_stat, "p-value:", p_value)

# Step 5: Compare the p-value against the significance level and state the decision
alpha = 0.05
print(
    "Reject the null hypothesis: There is a significant difference in sepal lengths."
    if p_value < alpha
    else "Fail to reject the null hypothesis: No significant difference in sepal lengths."
)


Setosa - Mean: 5.005999999999999 Std Dev: 0.3524896872134512
Versicolor - Mean: 5.936 Std Dev: 0.5161711470638635
t-statistic: -10.52098626754911 p-value: 8.985235037487077e-18
Reject the null hypothesis: There is a significant difference in sepal lengths.


In [9]:
# One-way ANOVA: test whether mean sepal length differs across the three species
# (setosa, versicolor, and virginica).

# Step 1: Extract the sepal lengths of each species
setosa = iris_data.loc[iris_data['Species'] == 'Iris-setosa', 'SepalLengthCm']
versicolor = iris_data.loc[iris_data['Species'] == 'Iris-versicolor', 'SepalLengthCm']
virginica = iris_data.loc[iris_data['Species'] == 'Iris-virginica', 'SepalLengthCm']

# Step 2: Run the one-way ANOVA on the three groups
f_stat, p_value = f_oneway(setosa, versicolor, virginica)

# Report the mean and standard deviation of every group
for species_label, sepal_lengths in (
    ("Setosa", setosa),
    ("Versicolor", versicolor),
    ("Virginica", virginica),
):
    print(f"{species_label} - Mean:", sepal_lengths.mean(), "Std Dev:", sepal_lengths.std())

# Step 3: Report the F-statistic and its p-value
print("F-statistic:", f_stat, "p-value:", p_value)

# Step 4: Compare the p-value against the significance level and state the decision
alpha = 0.05
print(
    "Reject the null hypothesis: At least one species has a different mean sepal length."
    if p_value < alpha
    else "Fail to reject the null hypothesis: No significant difference in sepal lengths."
)


Setosa - Mean: 5.005999999999999 Std Dev: 0.3524896872134512
Versicolor - Mean: 5.936 Std Dev: 0.5161711470638635
Virginica - Mean: 6.587999999999998 Std Dev: 0.635879593274432
F-statistic: 119.26450218450468 p-value: 1.6696691907693826e-31
Reject the null hypothesis: At least one species has a different mean sepal length.


In [7]:
# Inspect the column names to confirm the exact spelling and capitalization of
# the species column before filtering or grouping on it.
print(iris_data.columns)


Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [11]:
# Chi-square test of independence: is there a relationship between species and
# sepal-width category (narrow, medium, wide, very wide)?

# Step 1: Categorize sepal widths.
# pd.cut uses right-closed intervals by default, so the categories are
# (0, 2.5], (2.5, 3.0], (3.0, 3.5] and (3.5, 4.5].
bins = [0, 2.5, 3.0, 3.5, 4.5]  # Bin edges for sepal width (cm)
labels = ['Narrow', 'Medium', 'Wide', 'Very Wide']  # One label per interval: len(bins) - 1

iris_data['SepalWidthCategory'] = pd.cut(iris_data['SepalWidthCm'], bins=bins, labels=labels)

# Widths that are missing or fall outside the bin range get a NaN category, and
# pd.crosstab silently leaves those rows out. Surface that instead of hiding it.
uncategorized_rows = int(iris_data['SepalWidthCategory'].isna().sum())
if uncategorized_rows:
    print(
        f"Warning: {uncategorized_rows} row(s) have no sepal width category "
        f"(missing or outside the {bins[0]}-{bins[-1]} cm bin range) "
        "and are excluded from the test."
    )

# Step 2: Create contingency table (species x sepal-width category counts)
contingency_table = pd.crosstab(iris_data['Species'], iris_data['SepalWidthCategory'])

print("Contingency Table:")
print(contingency_table)

# Step 3: Perform chi-square test
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

# Step 4: Output chi-square statistic and p-value
print("Chi-square Statistic:", chi2_stat)
print("p-value:", p_value)

# The chi-square approximation is commonly considered reliable only when every
# expected cell count is at least 5; warn when that rule of thumb is violated.
if (expected < 5).any():
    print("Warning: some expected counts are below 5; the chi-square approximation may be unreliable.")

# Step 5: Interpret the result
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: There is a relationship between species and sepal width categories.")
else:
    print("Fail to reject the null hypothesis: No relationship between species and sepal width categories.")


Contingency Table:
SepalWidthCategory  Narrow  Medium  Wide  Very Wide
Species                                            
Iris-setosa              1       7    27         15
Iris-versicolor         13      29     8          0
Iris-virginica           5      28    14          3
Chi-square Statistic: 58.8092440923738
p-value: 7.853326672657147e-11
Reject the null hypothesis: There is a relationship between species and sepal width categories.
