# COVID-19 Global Data Tracker — Analysis with Pandas & Matplotlib

This notebook demonstrates data loading, exploration, basic analysis, and visualizations using **pandas** and **matplotlib**.

It follows the assignment requirements:
- **Task 1:** Load & Explore (head, dtypes, missing values, cleaning)
- **Task 2:** Basic Analysis (describe + groupby)
- **Task 3:** Visualizations (line, bar, histogram, scatter)

> Set `CSV_PATH` to your CSV file; otherwise, it falls back to the Iris dataset (or a synthetic iris-like dataset if `sklearn` is unavailable).

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

## Task 1 — Load & Explore the Dataset
Set `CSV_PATH` to your CSV file (e.g., a COVID-19 time series). If `CSV_PATH` is `None` or the file cannot be read, we load **Iris** from scikit-learn, or generate a small synthetic iris-like dataset if scikit-learn is unavailable.

In [None]:
# Path of the CSV file to analyse; leave as None to use the Iris fallback.
CSV_PATH = None  # e.g., Path('data/owid.csv')

def load_dataset(csv_path=None):
    """Load the working dataset, falling back to Iris when no CSV is usable.

    Args:
        csv_path: Location of a CSV file to read. Any falsy value
            (``None``, ``''``) skips the CSV step entirely.

    Returns:
        A ``(df, source)`` tuple: the loaded DataFrame and a short label
        saying where it came from (``"CSV: <path>"``, ``'iris (sklearn)'``
        or ``'synthetic iris-like'``).

    The function never raises for an unreadable CSV or a missing
    scikit-learn; it prints the reason and moves on to the next fallback.
    """
    if csv_path:
        try:
            df = pd.read_csv(csv_path)
        except FileNotFoundError:
            print("File not found; falling back to iris dataset.")
        except pd.errors.EmptyDataError:
            print("Empty CSV; falling back to iris dataset.")
        except Exception as e:
            print(f"Error reading CSV ({e}); falling back to iris dataset.")
        else:
            return df, f"CSV: {csv_path}"

    # Try Iris from sklearn
    try:
        from sklearn.datasets import load_iris
        iris = load_iris(as_frame=True)
        df = iris.frame.copy()
        if 'target' in df.columns:
            # Replace the integer class codes with their readable names.
            df['species'] = df['target'].map(dict(enumerate(iris.target_names)))
            df.drop(columns=['target'], inplace=True)
        return df, 'iris (sklearn)'
    except Exception as e:
        # Best-effort fallback is deliberate, but say why it happened so a
        # broken scikit-learn install is not silently masked.
        print(f"scikit-learn iris unavailable ({e}); using synthetic iris-like data.")

    # Synthetic iris-like fallback (fixed seed keeps the notebook reproducible)
    rng = np.random.default_rng(42)
    n = 150
    df = pd.DataFrame({
        'sepal length (cm)': rng.normal(5.8, 0.8, n),
        'sepal width (cm)':  rng.normal(3.0, 0.4, n),
        'petal length (cm)': rng.normal(3.7, 1.5, n),
        'petal width (cm)':  rng.normal(1.1, 0.5, n),
        'species': rng.choice(['setosa','versicolor','virginica'], n)
    })
    return df, 'synthetic iris-like'

# Load the data once and report which source was actually used.
df, DATA_SOURCE = load_dataset(CSV_PATH)
print(f'Data source: {DATA_SOURCE}')
display(df.head(5))  # first five rows as a quick sanity check

### Explore structure and missing values

In [None]:
# Column dtypes
display(df.dtypes)
# Number of missing values per column
display(df.isnull().sum())

### Clean the dataset
Fill missing numeric values with the column **median** and missing categorical values with the column **mode** (or `'Unknown'` when a column has no mode). Drop rows that are completely NA.

In [None]:
# Drop rows that are completely NA *before* imputing. Once the fills below
# have run, such rows are no longer empty, so dropping afterwards would keep
# them as fabricated median/mode records.
# .copy() gives an independent frame so the column assignments below never
# touch `df`.
df_clean = df.dropna(how='all').copy()

num_cols = df_clean.select_dtypes(include=[np.number]).columns
cat_cols = df_clean.select_dtypes(include=['object','category']).columns

# Numeric columns: fill gaps with the column median.
for c in num_cols:
    if df_clean[c].isna().any():
        df_clean[c] = df_clean[c].fillna(df_clean[c].median())

# Categorical columns: fill gaps with the most frequent value, or 'Unknown'
# when the column has no non-missing value to take a mode from.
for c in cat_cols:
    if df_clean[c].isna().any():
        mode = df_clean[c].mode(dropna=True)
        fill = mode.iloc[0] if not mode.empty else 'Unknown'
        df_clean[c] = df_clean[c].fillna(fill)

display(df_clean.head())

## Task 2 — Basic Data Analysis
Compute descriptive statistics and a grouped mean over a categorical column.

In [None]:
# Every statistic and plot below needs at least one numeric column, so fail
# early with a clear message instead of an IndexError further down.
num_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise ValueError('The dataset has no numeric columns to analyse.')

# Descriptive statistics for numeric columns.
# DataFrame.describe() has no `numeric_only` keyword (passing it raises
# TypeError); `include` is the supported way to restrict the dtypes.
display(df_clean.describe(include=[np.number]))

# Choose a categorical column (prefer 'species'); otherwise pick the first
# object/category column, or create bins from the first numeric column.
cat_col = 'species' if 'species' in df_clean.columns else None
if cat_col is None:
    for c in df_clean.columns:
        dtype = df_clean[c].dtype
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
            cat_col = c
            break
if cat_col is None:
    first_num = num_cols[0]
    cat_col = f"{first_num}_bin"
    # Three equal-frequency bins of the first numeric column.
    df_clean[cat_col] = pd.qcut(df_clean[first_num], q=3, labels=['low','mid','high'])

# num_a drives the grouped mean and most plots; num_b is the scatter's second
# axis and falls back to num_a when there is only one numeric column.
num_a = num_cols[0]
num_b = num_cols[1] if len(num_cols) > 1 else num_cols[0]

# observed=False states the current behaviour explicitly for categorical
# groupers (such as the qcut bins) rather than relying on a deprecated default.
group_means = (
    df_clean.groupby(cat_col, observed=False)[num_a]
    .mean()
    .sort_values(ascending=False)
)
display(group_means)
print(f'Grouping column: {cat_col} | Numeric A: {num_a} | Numeric B: {num_b}')
print(f'Top group by mean {num_a}: {group_means.index[0]} -> {group_means.iloc[0]:.3f}')

## Task 3 — Data Visualization (Matplotlib)
Four plots: **line**, **bar**, **histogram**, **scatter**.
> Per instructions, we use matplotlib only; you may add seaborn styling if you like.

In [None]:
def ensure_time_index(df):
    """Return one timestamp per row of ``df`` for use as a plotting index.

    Lookup order:
      1. The first column that already has a datetime64 dtype.
      2. The first text column whose name contains "date" or "time" and whose
         values all parse as datetimes. A plain ``pd.read_csv`` keeps dates
         as strings, so step 1 alone never matches CSV input.
      3. A synthetic daily range starting on 2020-01-01.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        A datetime Series (steps 1 and 2) or a DatetimeIndex (step 3) with
        ``len(df)`` entries.
    """
    for c in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[c]):
            return pd.to_datetime(df[c])

    for c in df.columns:
        name = str(c).lower()
        if 'date' not in name and 'time' not in name:
            continue
        col = df[c]
        if not (pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)):
            continue
        # errors='coerce' turns unparseable entries into NaT; only accept the
        # column when every row parsed, so a misnamed column is ignored.
        parsed = pd.to_datetime(col, errors='coerce')
        if len(parsed) and parsed.notna().all():
            return parsed

    return pd.date_range(start='2020-01-01', periods=len(df), freq='D')

# Line chart: rolling mean of num_a over 7 consecutive rows, drawn against
# the date axis supplied by ensure_time_index.
dates = ensure_time_index(df_clean)
series = (
    pd.Series(df_clean[num_a].values, index=dates)
    .rolling(window=7, min_periods=1)
    .mean()
)

fig, ax = plt.subplots()
series.plot(ax=ax)
ax.set_title(f'Trend of {num_a} over time (7-day rolling mean)')
ax.set_xlabel('Date')
ax.set_ylabel(num_a)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: mean of num_a by category, bars sorted from lowest to highest.
plt.figure()
# observed=False states the current behaviour explicitly when cat_col is a
# categorical column (e.g. qcut bins) rather than relying on a deprecated
# default.
category_means = df_clean.groupby(cat_col, observed=False)[num_a].mean().sort_values()
category_means.plot(kind='bar')
plt.title(f'Average {num_a} by {cat_col}')
plt.xlabel(cat_col)
plt.ylabel(f'Mean {num_a}')
plt.tight_layout()
plt.show()

In [None]:
# Histogram: how the values of num_a are distributed across 20 bins.
fig, ax = plt.subplots()
ax.hist(df_clean[num_a].values, bins=20)
ax.set_title(f'Distribution of {num_a}')
ax.set_xlabel(num_a)
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Scatter: num_a on the x-axis against num_b on the y-axis.
fig, ax = plt.subplots()
x_values = df_clean[num_a].values
y_values = df_clean[num_b].values
ax.scatter(x_values, y_values)
ax.set_title(f'{num_a} vs {num_b}')
ax.set_xlabel(num_a)
ax.set_ylabel(num_b)
fig.tight_layout()
plt.show()

## Findings / Observations
- Add 2–4 short bullet points with your observations.
- Example: The 7-day rolling mean shows clear waves.
- Example: One group has the highest average of the selected metric.