# In [None]:
# iris_analysis.py
# GitHub Copilot
# Standalone script: loads an Iris dataset (CSV), explores, analyzes, visualizes, and prints findings.

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Global plot styling: seaborn's whitegrid theme plus matplotlib's automatic layout.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.autolayout"] = True

# --- Task 1: Load and Explore the Dataset ---
csv_path = "iris.csv"

# When no CSV is on disk yet, write one from scikit-learn's bundled Iris data
# first, so the DataFrame is always produced by pd.read_csv (CSV workflow).
csv_already_present = os.path.exists(csv_path)
if not csv_already_present:
    bunch = load_iris()
    seed_frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    seed_frame["species"] = pd.Categorical(bunch.target_names[bunch.target])
    seed_frame.to_csv(csv_path, index=False)

df = pd.read_csv(csv_path)
if csv_already_present:
    print(f"Loaded dataset from {csv_path}")
else:
    print(f"No existing '{csv_path}' found — created and loaded {csv_path} from sklearn dataset")

# Quick inspection: a preview, the column dtypes, and the per-column NaN counts.
for heading, report in (
    ("\nFirst 5 rows:", df.head()),
    ("\nData types:", df.dtypes),
    ("\nMissing values per column:", df.isnull().sum()),
):
    print(heading)
    print(report)

# Basic cleaning: fill missing numeric values with the column median and
# missing non-numeric values with the column mode.
#
# The filled Series is assigned back to the frame instead of calling
# ``df[col].fillna(..., inplace=True)``. That form operates on a temporary
# column selection: pandas 2.x warns about it, and with Copy-on-Write enabled
# it leaves ``df`` untouched.
columns_with_gaps = [col for col in df.columns if df[col].isnull().any()]
if columns_with_gaps:
    for col in columns_with_gaps:
        if pd.api.types.is_numeric_dtype(df[col]):
            med = df[col].median()
            df[col] = df[col].fillna(med)
            print(f"Filled missing numeric values in '{col}' with median={med}")
        else:
            modes = df[col].mode()
            if modes.empty:
                # Every value in the column is missing, so there is no mode
                # to impute with; leave the column as it is.
                print(f"Column '{col}' has no non-missing values; left unchanged")
                continue
            mode = modes.iloc[0]
            df[col] = df[col].fillna(mode)
            print(f"Filled missing categorical values in '{col}' with mode='{mode}'")
else:
    print("No missing values detected. No cleaning needed.")

# --- Task 2: Basic Data Analysis ---
print("\nBasic statistics (numerical columns):")
print(df.describe())

# Ensure a species column exists so the data can be grouped.
if "species" not in df.columns:
    # Best effort: borrow the labels from scikit-learn's Iris data. This does
    # not decode anything in the CSV; it is only meaningful if the CSV holds
    # the same samples in the same order as sklearn's dataset — TODO confirm.
    iris = load_iris()
    try:
        df["species"] = pd.Categorical(iris.target_names[iris.target])
    except ValueError:
        # Row count differs from sklearn's dataset, so labels cannot be aligned.
        df["species"] = "unknown"
        print("Could not derive species labels; using 'unknown' for every row")

df["species"] = df["species"].astype("category")

# Group by species and compute the mean of every numerical column.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# `observed` is passed explicitly: the key is categorical and pandas >= 2.1
# warns when the default is relied on. Every category here comes from the data
# itself, so restricting to observed categories does not change the result.
group_means = df.groupby("species", observed=True)[numerical_cols].mean()
print("\nMean values by species:")
print(group_means)

# Additional simple patterns / findings
print("\nInteresting findings (automated summary):")

# Which species has the largest average sepal length?
sepal_len = "sepal length (cm)"
if sepal_len in group_means.columns:
    print(f"- Species with largest mean sepal length: {group_means[sepal_len].idxmax()}")

# How strongly do the two petal measurements move together?
petal_pair = ["petal length (cm)", "petal width (cm)"]
if all(name in numerical_cols for name in petal_pair):
    pairwise = df[petal_pair].corr()
    petal_corr = pairwise.loc[petal_pair[0], petal_pair[1]]
    print(f"- Correlation between petal length and petal width: {petal_corr:.2f}")

# Fixed remark; it is printed regardless of what the data shows.
print("- Petal measurements are typically most discriminative across species (observed in means).")

# --- Task 3: Data Visualization ---
# 1) Line chart of the length measurements against the row index. The index is
#    only a stand-in for time; the samples have no real temporal order.
# Only columns that are actually present are drawn (the same kind of check
# Task 2 performs), so a CSV with different headers does not abort the script
# with a KeyError.
length_series = [
    (column, label)
    for column, label in (
        ("sepal length (cm)", "Sepal Length"),
        ("petal length (cm)", "Petal Length"),
    )
    if column in df.columns
]
if length_series:
    plt.figure(figsize=(10, 4))
    for column, label in length_series:
        plt.plot(df.index, df[column], label=label, alpha=0.8)
    plt.title("Trend of Sepal and Petal Length across Samples (index as time proxy)")
    plt.xlabel("Sample Index")
    plt.ylabel("Length (cm)")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()
else:
    print("Skipping line chart: no sepal/petal length columns found.")

# 2) Bar chart: average petal length per species
if "petal length (cm)" in group_means.columns:
    plt.figure(figsize=(8, 5))
    order = group_means.index.tolist()
    # seaborn 0.13 deprecates `palette` without `hue`; the documented
    # replacement is to map the x variable to `hue` as well. `dodge=False`
    # keeps one full-width bar per species.
    ax = sns.barplot(
        x=group_means.index,
        y=group_means["petal length (cm)"],
        hue=group_means.index,
        order=order,
        hue_order=order,
        palette="husl",
        dodge=False,
    )
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title("Average Petal Length by Species")
    plt.xlabel("Species")
    plt.ylabel("Avg Petal Length (cm)")
    plt.show()
else:
    # Same defensive column check Task 2 uses, instead of failing with KeyError.
    print("Skipping bar chart: 'petal length (cm)' column not found.")

# 3) Histogram: distribution of sepal length
# The column is checked first (as Task 2 does) so a CSV with different headers
# skips this figure instead of raising KeyError.
if "sepal length (cm)" in df.columns:
    plt.figure(figsize=(8, 5))
    plt.hist(df["sepal length (cm)"], bins=15, color="#4c72b0", edgecolor="black", alpha=0.8)
    plt.title("Distribution of Sepal Length")
    plt.xlabel("Sepal Length (cm)")
    plt.ylabel("Frequency")
    plt.grid(alpha=0.2)
    plt.show()
else:
    print("Skipping histogram: 'sepal length (cm)' column not found.")

# 4) Scatter plot: sepal length vs petal length colored by species
# Both measurement columns must exist; otherwise the figure is skipped rather
# than failing with KeyError (same defensive check as in Task 2).
scatter_x, scatter_y = "sepal length (cm)", "petal length (cm)"
if scatter_x in df.columns and scatter_y in df.columns:
    plt.figure(figsize=(8, 6))
    # One scatter call per species so each gets its own colour and legend entry.
    species_list = df["species"].cat.categories.tolist()
    colors = sns.color_palette("husl", n_colors=len(species_list))
    for spec, col in zip(species_list, colors):
        sub = df[df["species"] == spec]
        plt.scatter(sub[scatter_x], sub[scatter_y], label=spec, alpha=0.8, color=col)
    plt.title("Sepal Length vs Petal Length by Species")
    plt.xlabel("Sepal Length (cm)")
    plt.ylabel("Petal Length (cm)")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()
else:
    print("Skipping scatter plot: sepal/petal length columns not found.")

# Extra: annotated heatmap of the pairwise correlations between the numeric
# columns, on a diverging colour map centred at zero.
_, heatmap_ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    df[numerical_cols].corr(),
    annot=True,
    cmap="coolwarm",
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

# --- Findings / Observations ---
# Fixed closing summary, emitted as one write; the text does not depend on the
# values computed above.
closing_lines = (
    "\nSummary of findings:",
    "- Dataset loaded from CSV and inspected.",
    "- No missing values were present (or they were filled if found).",
    "- Descriptive statistics show clear differences between species, especially in petal measurements.",
    "- Strong positive correlation observed between petal length and petal width.",
    "- Scatter plots demonstrate species clustering; petal features are most discriminative.",
    "\nScript complete. Visualizations displayed interactively.",
)
print("\n".join(closing_lines))