# SM2 Public Dataset – Demo Notebook

This notebook works in **Google Colab** and in **local Jupyter**.

If you run locally, make sure you have these packages installed (Colab already has them):

```bash
pip install gdown pandas matplotlib pyarrow fastparquet
```


In [None]:
# Optional (for local Jupyter): install dependencies
#
# Every module the notebook relies on is probed, not just a subset, so a
# partially provisioned environment is detected here instead of failing in a
# later cell. Only ImportError is treated as "not installed"; any other
# exception raised while importing is left to propagate.
import importlib
import subprocess
import sys

_missing = []
for _module in ("gdown", "pandas", "matplotlib", "pyarrow"):
    try:
        importlib.import_module(_module)
    except ImportError:
        _missing.append(_module)

if _missing:
    print("Missing packages:", ", ".join(_missing), "- installing dependencies")
    # pip skips anything that is already satisfied, so requesting the full
    # list is harmless. check_call raises CalledProcessError if pip fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "gdown", "pandas", "matplotlib", "pyarrow", "fastparquet"])

In [None]:
import gdown
import pandas as pd
import matplotlib.pyplot as plt

# Google Drive ID for the Parquet file
file_id = "1gLPWgUGtRb371Gpv5O8t5j95lthNjELg"
url = f"https://drive.google.com/uc?id={file_id}"
output = "sm2_public_dataset.parquet"

# Download
# Re-running the cell reuses an existing local copy instead of fetching the
# file again; delete the local file to force a fresh download.
import os

if os.path.isfile(output):
    print(f"Using existing local file: {output}")
else:
    downloaded = gdown.download(url, output, quiet=False)
    # NOTE(review): depending on the gdown version a failed download may be
    # reported by returning None rather than raising - confirm against the
    # installed version. Either way, stop here with a clear message instead
    # of letting read_parquet fail on a missing file.
    if downloaded is None or not os.path.isfile(output):
        raise RuntimeError(f"Download failed: {url} was not saved to {output}")

# Load
df = pd.read_parquet(output)

# Types & basic info
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
# Parse the timestamps as timezone-aware (UTC) datetimes.
df["time"] = pd.to_datetime(df["time"], utc=True)
print("Time range:", df["time"].min(), "→", df["time"].max())

# Distinct metrics
print("\nDistinct data_key:")
print(df["data_key"].unique())

# Combo overview
# Row count for every (location, source, data_key) combination present in the
# data, ordered from the most to the least populated combination.
combo_keys = ["location", "source", "data_key"]
combo_sizes = df.groupby(combo_keys).size()
agg_table = combo_sizes.reset_index(name="rows")
agg_table = agg_table.sort_values("rows", ascending=False)

print("\nTop 20 combinations by row count:")
print(agg_table.head(20))

# Monthly trend by data_key
# "time" is timezone-aware (UTC). Periods carry no timezone, so the timezone
# is dropped explicitly first; calling to_period directly on tz-aware values
# emits a "will drop timezone information" UserWarning.
df["month"] = df["time"].dt.tz_localize(None).dt.to_period("M")

# One row per month, one column per data_key, each cell holding the number of
# records. unstack(fill_value=0) fills month/data_key pairs without records
# with 0 while keeping the counts as integers (pivot + fillna would turn them
# into floats).
trend_table = (
    df.groupby(["month", "data_key"]).size()
      .unstack("data_key", fill_value=0)
      .sort_index()
)
print("\nMonthly distribution by data_key:")
print(trend_table.head(12))

# Plot
# One line per data_key; DataFrame.plot returns the Axes it drew on, so the
# labels, title and legend are set on that object directly.
ax = trend_table.plot(kind="line", figsize=(12, 6))
ax.set_title("Number of records per data_key over time")
ax.set_xlabel("Month")
ax.set_ylabel("Row count")
ax.legend(title="data_key")
plt.show()