
# Analyzing Data with Pandas and Visualizing Results with Matplotlib

**Objective:**  
- Load and analyze a dataset using **pandas**.  
- Create simple plots with **matplotlib** to visualize findings.  

This notebook is self-contained and uses the classic **Iris** dataset. It demonstrates:
- Data loading & error handling  
- Exploration & cleaning of missing values  
- Descriptive statistics and group-by analysis  
- Four distinct matplotlib charts (line, bar, histogram, scatter)


In [None]:

# Imports
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Output location for the CSV and the saved figures.
# Defaults to /mnt/data; set the IRIS_DATA_DIR environment variable to redirect
# the notebook to another writable directory (an empty value falls back to the default).
DATA_DIR = os.environ.get("IRIS_DATA_DIR") or "/mnt/data"
CSV_PATH = os.path.join(DATA_DIR, "iris.csv")
# Plotting convention used below: one chart per figure, no explicit colors set.


## Task 1: Load and Explore the Dataset

In [None]:

# Materialise sklearn's bundled Iris data as a local CSV (for reproducibility),
# so the following cells can read the dataset back from disk.
iris_bunch = load_iris(as_frame=True)

# Normalise the headers: drop the " (cm)" unit suffix, then turn spaces into
# underscores, giving the names used later in the notebook such as "sepal_length".
# rename() returns a new frame, so the bunch's own frame is left as loaded.
df_raw = iris_bunch.frame.rename(
    columns=lambda label: label.replace(" (cm)", "").replace(" ", "_")
)

os.makedirs(DATA_DIR, exist_ok=True)  # no error if the directory already exists
df_raw.to_csv(CSV_PATH, index=False)
print(f"Saved dataset to: {CSV_PATH}")


In [None]:

# Robust CSV reader with error handling
def load_dataset(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame.

    Any exception raised while reading is reported with a one-line message on
    stderr and then re-raised unchanged, so callers still see the original error.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    pd.errors.EmptyDataError
        If the file contains nothing to parse.
    Exception
        Any other error raised by ``pd.read_csv``.
    """
    try:
        return pd.read_csv(path)
    except Exception as exc:
        # Choose the message in the same precedence order: missing file,
        # empty file, then anything else.
        if isinstance(exc, FileNotFoundError):
            message = f"[ERROR] File not found: {path}"
        elif isinstance(exc, pd.errors.EmptyDataError):
            message = "[ERROR] The CSV appears to be empty."
        else:
            message = f"[ERROR] Unexpected error while reading CSV: {exc}"
        print(message, file=sys.stderr)
        raise

# Read the dataset from CSV_PATH; load_dataset logs to stderr and re-raises on failure.
df = load_dataset(CSV_PATH)
# Trailing expression so the notebook renders the first rows of the loaded frame.
df.head()


In [None]:

# Per-column overview: the dtype name next to the count of missing entries.
dtype_names = df.dtypes.astype(str).rename("dtype")
missing_per_column = df.isnull().sum().rename("null_count")

# Both Series are indexed by column name, so they line up side by side.
structure = pd.concat([dtype_names, missing_per_column], axis=1)
structure



### Cleaning: Handle Missing Values  
We fill numeric NaNs with the column median and non-numeric NaNs with the column mode, falling back to `"Unknown"` when a column has no non-missing values to take a mode from. The Iris dataset has no missing values, but the cleaning pipeline is implemented for robustness.


In [None]:

def clean_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with missing values filled in.

    Numeric columns that contain missing values are filled with that column's
    median. Every other column that contains missing values is filled with its
    first mode, or with the string "Unknown" when the column has no mode
    (that is, no non-missing values). Columns without missing values are left
    as they are, and the input frame itself is never modified.

    A numeric column with no non-missing values has a NaN median and is
    therefore left unchanged.

    Parameters
    ----------
    frame : pd.DataFrame
        The data to clean.

    Returns
    -------
    pd.DataFrame
        A new frame with the fills applied.
    """
    result = frame.copy()

    # Pass 1: numeric columns -> median.
    for name in result.select_dtypes(include=[np.number]).columns:
        series = result[name]
        if not series.isnull().any():
            continue
        result[name] = series.fillna(series.median())

    # Pass 2: everything else -> most frequent value, or 'Unknown' if there is none.
    for name in result.select_dtypes(exclude=[np.number]).columns:
        series = result[name]
        if not series.isnull().any():
            continue
        modes = series.mode(dropna=True)
        replacement = modes.iloc[0] if len(modes) else "Unknown"
        result[name] = series.fillna(replacement)

    return result

# Apply the cleaning rules; clean_frame works on a copy, so df stays as loaded.
df_clean = clean_frame(df)
# Trailing expression so the notebook renders the first rows of the cleaned frame.
df_clean.head()


## Task 2: Basic Data Analysis

In [None]:

# Summary statistics for the numeric columns, transposed so that each
# column of the data becomes one row of the table.
desc = df_clean.describe().transpose()
desc


In [None]:

# Group by species and compute the mean of each measurement column.
# The raw CSV stores species as 'target' (0,1,2); map those codes to names for
# readability. If 'target' already holds labels, reuse them unchanged.
# The test is "is numeric" rather than "is not object" so that label columns held
# as string or categorical dtypes are not pushed through the integer mapping,
# where every lookup would miss and produce NaN.
if pd.api.types.is_numeric_dtype(df_clean["target"]):
    mapping = dict(enumerate(iris_bunch.target_names))
    df_clean["species"] = df_clean["target"].map(mapping)
else:
    df_clean["species"] = df_clean["target"]

grouped_means = (
    df_clean.groupby("species")[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
    .mean()
    .sort_index()
    .round(2)
)
grouped_means


In [None]:

# Headline observations, pulled from the tables computed in the cells above.
length_columns = ["sepal_length", "petal_length"]
corr = df_clean[length_columns].corr().loc["sepal_length", "petal_length"]

# Species whose mean petal length is largest, and that mean.
longest_petal_species = grouped_means["petal_length"].idxmax()
longest_petal_mean = grouped_means.loc[longest_petal_species, "petal_length"]

# Standard deviations taken from the describe() table.
sepal_std = desc.loc["sepal_length", "std"]
petal_std = desc.loc["petal_length", "std"]

print("Findings:")
print(f"- Positive correlation between sepal_length and petal_length (~ {corr:.2f}).")
print(f"- Species with longest average petals: {longest_petal_species} ({longest_petal_mean:.2f} cm).")
print(f"- Variability snapshot (std): sepal_length={sepal_std:.2f}, petal_length={petal_std:.2f}.")



## Task 3: Data Visualization  
We create four distinct plots using **matplotlib**. Each plot has a title and axis labels.


In [None]:

# 1) Line chart: sepal length against the row index of each sample
fig, ax = plt.subplots()
ax.plot(df_clean.index, df_clean["sepal_length"])
ax.set(
    title="Sepal Length over Sample Index",
    xlabel="Sample Index",
    ylabel="Sepal Length (cm)",
)
plt.show()


In [None]:

# 2) Bar chart: one bar per species, height = mean petal length
fig, ax = plt.subplots()
ax.bar(grouped_means.index, grouped_means["petal_length"])
ax.set(
    title="Average Petal Length by Species",
    xlabel="Species",
    ylabel="Average Petal Length (cm)",
)
plt.show()


In [None]:

# 3) Histogram: how sepal length is distributed, using 15 bins
fig, ax = plt.subplots()
ax.hist(df_clean["sepal_length"], bins=15)
ax.set(
    title="Distribution of Sepal Length",
    xlabel="Sepal Length (cm)",
    ylabel="Frequency",
)
plt.show()


In [None]:

# 4) Scatter plot: sepal length (x) against petal length (y), one point per sample
fig, ax = plt.subplots()
ax.scatter(df_clean["sepal_length"], df_clean["petal_length"])
ax.set(
    title="Sepal Length vs Petal Length",
    xlabel="Sepal Length (cm)",
    ylabel="Petal Length (cm)",
)
plt.show()


### (Optional) Save figures to files

In [None]:

# Save all four figures to DATA_DIR for submission (re-running regenerates the files).
# Each chart is redrawn on a fresh figure, written to disk, and then closed, so
# repeated runs do not accumulate open figures in the kernel.
def _save_chart(draw, title, xlabel, ylabel, filename):
    """Draw one chart on a new figure, save it under DATA_DIR, and close the figure.

    Parameters
    ----------
    draw : callable
        Receives the new Axes and plots the data on it.
    title, xlabel, ylabel : str
        Text applied to the Axes.
    filename : str
        PNG file name, joined onto DATA_DIR.
    """
    fig, ax = plt.subplots()
    try:
        draw(ax)
        ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
        fig.savefig(os.path.join(DATA_DIR, filename), bbox_inches="tight")
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)


# Make sure the target directory exists even if this cell is run on its own.
os.makedirs(DATA_DIR, exist_ok=True)

# Line
_save_chart(
    lambda ax: ax.plot(df_clean.index, df_clean["sepal_length"]),
    "Sepal Length over Sample Index",
    "Sample Index",
    "Sepal Length (cm)",
    "plot_line_sepal_length.png",
)

# Bar
_save_chart(
    lambda ax: ax.bar(grouped_means.index, grouped_means["petal_length"]),
    "Average Petal Length by Species",
    "Species",
    "Average Petal Length (cm)",
    "plot_bar_avg_petal_length.png",
)

# Hist
_save_chart(
    lambda ax: ax.hist(df_clean["sepal_length"], bins=15),
    "Distribution of Sepal Length",
    "Sepal Length (cm)",
    "Frequency",
    "plot_hist_sepal_length.png",
)

# Scatter
_save_chart(
    lambda ax: ax.scatter(df_clean["sepal_length"], df_clean["petal_length"]),
    "Sepal Length vs Petal Length",
    "Sepal Length (cm)",
    "Petal Length (cm)",
    "plot_scatter_sepal_vs_petal.png",
)

print("Saved figures to:", DATA_DIR)



## Submission Notes
- Dataset: `iris.csv` (auto-saved to `/mnt/data/iris.csv`)
- Include this notebook and the generated figures in your submission.
- All plots include titles and axis labels; analysis demonstrates `describe()`, grouping, and key observations.
