In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \ud83e\uddea HOMOGEN Pipeline Testing Notebook\n",
    "This notebook helps verify that the HOMOGEN pipeline parses, harmonizes, and validates data correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \ud83d\udcc1 Read the harmonized groundwater table into `df`\n",
    "import pandas as pd\n",
    "\n",
    "# Relative path: it is resolved against the kernel's current working directory.\n",
    "groundwater_path = \"../data/harmonized/groundwater.parquet\"\n",
    "df = pd.read_parquet(path=groundwater_path)\n",
    "\n",
    "# Show the first rows as the cell output for a quick visual check.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \ud83d\udd0d Inspect the final schema: column names as a plain Python list\n",
    "df.columns.to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \ud83d\udcca Visualize nitrate over time\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Select the rows whose measurement_parameter equals \"Nitrat\" and order them\n",
    "# by collection_date so the line is drawn in date order.\n",
    "# NOTE(review): every matching row is joined into one line; if df holds several\n",
    "# sampling sites, confirm that a single combined series is intended.\n",
    "nitrate_df = df[df[\"measurement_parameter\"] == \"Nitrat\"].copy()\n",
    "nitrate_df = nitrate_df.sort_values(\"collection_date\")\n",
    "\n",
    "if nitrate_df.empty:\n",
    "    # Without this guard an empty selection renders a blank chart with no explanation.\n",
    "    print(\"No rows with measurement_parameter == 'Nitrat'; nothing to plot.\")\n",
    "else:\n",
    "    # Explicit Figure/Axes objects instead of the implicit pyplot state machine.\n",
    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
    "    ax.plot(nitrate_df[\"collection_date\"], nitrate_df[\"measurement_value\"])\n",
    "    ax.set_title(\"Nitrate Over Time\")\n",
    "    ax.set_xlabel(\"Date\")\n",
    "    ax.set_ylabel(\"mg/L\")  # NOTE(review): unit is hard-coded; confirm it matches the data\n",
    "    ax.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \u26a0\ufe0f Ten most frequent values of validation_flags\n",
    "# NOTE(review): value_counts() needs hashable values; confirm the flags are\n",
    "# stored as scalars such as strings rather than lists or arrays.\n",
    "flag_counts = df[\"validation_flags\"].value_counts()\n",
    "flag_counts.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \ud83d\udd3b Preview records whose quality_score is below 0.6\n",
    "review_columns = [\"measurement_parameter\", \"measurement_value\", \"quality_score\", \"validation_flags\"]\n",
    "is_low_quality = df[\"quality_score\"] < 0.6\n",
    "\n",
    "# One .loc call selects the rows and the columns together.\n",
    "df.loc[is_low_quality, review_columns].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \ud83d\udccd Count rows that have no geometry\n",
    "geometry_is_missing = df[\"geometry\"].isna()\n",
    "missing_geo = df[geometry_is_missing]\n",
    "print(f\"Missing geometry: {len(missing_geo)} rows\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\u2705 Use this notebook to visually and statistically inspect the outputs of the HOMOGEN harmonization process.\n",
    "You can duplicate these cells for the `metadata.parquet` file too if needed."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}