In [17]:
%%writefile resolver.py
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional
import math
import pandas as pd

@dataclass
class QueryPoint:
    """A latitude/longitude pair used as the reference point for distance lookups."""
    # Both values are handed to haversine(), which converts them with
    # math.radians, so they are expected in decimal degrees.
    lat: float
    lon: float

def geocode(query: str, user_agent: str = "epw-catalog") -> Optional[QueryPoint]:
    """Look up a free-text place description with geopy's Nominatim geocoder.

    Args:
        query: Text handed to the geocoder unchanged.
        user_agent: Identifier passed to the Nominatim constructor.

    Returns:
        A QueryPoint holding the latitude/longitude of the returned location,
        or None when the geocoder gives back a falsy result.
    """
    # Deferred import so the module can be imported before geopy is installed
    # in the running session.
    from geopy.geocoders import Nominatim

    match = Nominatim(user_agent=user_agent, timeout=10).geocode(query)
    return QueryPoint(lat=match.latitude, lon=match.longitude) if match else None

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance along a sphere of radius 6371.0088 km, as a float.
    """
    R = 6371.0088  # sphere radius, km
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    # Every trig call must be qualified: only the `math` module is imported,
    # so a bare `cos` would raise NameError.
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    # Rounding can nudge `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return 2 * R * math.asin(math.sqrt(a))

def nearest_from_catalog(catalog_csv: str, q: QueryPoint, k: int = 8) -> pd.DataFrame:
    """Return the k catalog rows closest to ``q``, nearest first.

    Args:
        catalog_csv: Path of a CSV file loaded with ``pd.read_csv``.
        q: Reference point; its ``lat`` and ``lon`` attributes are read.
        k: Maximum number of rows to keep after sorting by distance.

    Returns:
        A DataFrame with a computed ``distance_km`` column, restricted to
        whichever of name/country/kind/years/epw_url/zip_url/distance_km exist.
        When the catalog has no ``lat``/``lon`` columns every distance is inf.
    """
    catalog = pd.read_csv(catalog_csv)

    if {"lat", "lon"} <= set(catalog.columns):
        # Rows without coordinates cannot be ranked, so they are dropped.
        rows = catalog.dropna(subset=["lat", "lon"]).copy()
    else:
        rows = catalog.copy()
        rows["lat"] = float("nan")
        rows["lon"] = float("nan")

    distances = []
    for la, lo in zip(rows["lat"], rows["lon"]):
        if pd.notna(la) and pd.notna(lo):
            distances.append(haversine(q.lat, q.lon, la, lo))
        else:
            distances.append(float("inf"))
    rows["distance_km"] = distances

    closest = rows.sort_values("distance_km").head(k)
    wanted = ("name", "country", "kind", "years", "epw_url", "zip_url", "distance_km")
    present = [col for col in wanted if col in closest.columns]
    return closest[present]


Overwriting resolver.py


In [5]:
import pandas as pd

# Three hand-written catalog rows; each has either an epw_url or a zip_url.
sample_rows = [
    {
        "name": "CGS.Buffalo.725285",
        "country": "USA",
        "lat": 42.940,
        "lon": -78.730,
        "kind": "TMYx",
        "years": "2009-2023",
        "epw_url": "https://climate.onebuilding.org/North%20and%20Central%20America/USA/NY/USA_NY_CGS.Buffalo.725285_TMYx.2009-2023.epw",
        "zip_url": None,
        "source": "OneBuilding",
    },
    {
        "name": "CT.NewHaven.725090",
        "country": "USA",
        "lat": 41.31,
        "lon": -72.92,
        "kind": "TMY3",
        "years": "1991-2005",
        "epw_url": None,
        "zip_url": "https://climate.onebuilding.org/North%20and%20Central%20America/USA/CT/USA_CT_New.Haven.725090_TMY3.zip",
        "source": "OneBuilding",
    },
    {
        "name": "New.Delhi.421820",
        "country": "IND",
        "lat": 28.61,
        "lon": 77.21,
        "kind": "TMYx",
        "years": "2009-2023",
        "epw_url": None,
        "zip_url": "https://climate.onebuilding.org/Asia/IND/IND_New.Delhi.421820_TMYx.2009-2023.zip",
        "source": "OneBuilding",
    },
]
sample = pd.DataFrame(sample_rows)

# Persist without the index so the CSV holds only the catalog columns.
sample_path = "sample_catalog.csv"
sample.to_csv(sample_path, index=False)
sample_path


'sample_catalog.csv'

In [6]:
from resolver import QueryPoint, nearest_from_catalog

# CSV produced by the sample-catalog cell; nearest_from_catalog loads it with pd.read_csv.
catalog = "sample_catalog.csv"
q = QueryPoint(lat=41.31, lon=-72.92)  # New Haven approx
# Keep at most 5 rows, ordered by their distance_km from q.
nearest = nearest_from_catalog(catalog, q, k=5)
nearest


Unnamed: 0,name,country,kind,years,epw_url,zip_url,distance_km
1,CT.NewHaven.725090,USA,TMY3,1991-2005,,https://climate.onebuilding.org/North%20and%20...,0.0
0,CGS.Buffalo.725285,USA,TMYx,2009-2023,https://climate.onebuilding.org/North%20and%20...,,512.154956
2,New.Delhi.421820,IND,TMYx,2009-2023,,https://climate.onebuilding.org/Asia/IND/IND_N...,11655.12546


In [12]:
# 1) Which Python is this notebook using?
# sys.executable is the path of the interpreter running this kernel; packages
# must be installed into that interpreter to be importable here.
# NOTE(review): subprocess and pkgutil are imported but not used in this cell.
import sys, subprocess, pkgutil
print(sys.executable)


C:\Users\Shyamli\AppData\Local\Programs\Python\Python312\python.exe


In [13]:
# 2) Install geopy into THAT interpreter
import sys
!{sys.executable} -m pip install -q geopy



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
# 3) Verify install succeeded
# NOTE(review): importlib and sys are imported but not used in this cell.
import importlib, sys
try:
    import geopy
    print("geopy version:", geopy.__version__)
except Exception as e:
    # Report instead of raising so the cell completes even when geopy is
    # missing or broken; the message shows what went wrong.
    print("Import failed:", e)


geopy version: 2.4.1


In [18]:
from resolver import geocode, nearest_from_catalog
q = geocode("New Haven, CT", user_agent="ksingh-epw-tool-1")
# geocode() returns None when the lookup finds nothing; fail here with a clear
# message instead of an AttributeError on q.lat inside nearest_from_catalog.
if q is None:
    raise LookupError("Geocoder returned no match for 'New Haven, CT'")
nearest_from_catalog("sample_catalog.csv", q, k=5)


Unnamed: 0,name,country,kind,years,epw_url,zip_url,distance_km
1,CT.NewHaven.725090,USA,TMY3,1991-2005,,https://climate.onebuilding.org/North%20and%20...,0.466361
0,CGS.Buffalo.725285,USA,TMYx,2009-2023,https://climate.onebuilding.org/North%20and%20...,,511.842156
2,New.Delhi.421820,IND,TMYx,2009-2023,,https://climate.onebuilding.org/Asia/IND/IND_N...,11655.493426


In [28]:
%%writefile download_epw.py
from __future__ import annotations
from pathlib import Path
from typing import Optional
import os, zipfile
import requests

HDRS = {"User-Agent": "epw-downloader/1.0"}

def _stream_download(url: str, dest: Path, chunk: int = 1<<14, timeout: int = 60) -> Path:
    """Stream ``url`` to ``dest`` and return ``dest``.

    The body is written to a sibling ``<name>.part`` file and renamed onto
    ``dest`` only after the whole response has been read, so an interrupted
    transfer never leaves a truncated file at ``dest``.

    Args:
        url: Address fetched with ``requests.get`` using the module's HDRS headers.
        dest: Final file path; its parent directory is created if missing.
        chunk: Chunk size in bytes passed to ``iter_content``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Raises:
        requests.HTTPError: For a 404 response, or any other status rejected
            by ``raise_for_status``.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout, headers=HDRS) as r:
            if r.status_code == 404:
                raise requests.HTTPError(f"404 for {url}", response=r)
            r.raise_for_status()
            with open(partial, "wb") as f:
                for b in r.iter_content(chunk_size=chunk):
                    if b:  # skip empty chunks
                        f.write(b)
        # Only reached when the full body was written without error.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the fragment on failure.
        partial.unlink(missing_ok=True)
    return dest

def _pick_epw_from_zip(zip_path: Path) -> Path:
    with zipfile.ZipFile(zip_path, "r") as z:
        epws = [m for m in z.namelist() if m.lower().endswith(".epw")]
        if not epws:
            raise ValueError(f"No .epw found inside {zip_path.name}")
        epws.sort(key=lambda m: z.getinfo(m).file_size, reverse=True)
        member = epws[0]
        out = zip_path.parent / Path(member).name
        z.extract(member, path=zip_path.parent)
        extracted = zip_path.parent / member
        if extracted != out:
            if out.exists(): out.unlink()
            os.replace(extracted, out)
        return out

def _try_zip_variant(url: str) -> Optional[str]:
    if url.lower().endswith(".epw"):
        return url[:-4] + ".zip"
    return None

def download_epw(url: str, out_dir: str | Path = "weather_cache", filename: Optional[str] = None,
                 timeout: int = 60) -> Path:
    """Fetch a weather file and return the local path of the resulting EPW.

    Args:
        url: Address of an ``.epw`` file or of a zip archive containing one.
        out_dir: Directory that receives the download.
        filename: Local file name; by default the last URL path segment with
            any query string removed, or ``download.bin`` if that is empty.
        timeout: Timeout in seconds forwarded to each download attempt.

    Returns:
        The downloaded file itself, or, when the download is a zip archive,
        the ``.epw`` file extracted from it.

    Raises:
        requests.HTTPError: If the download fails and the URL has no
            ``.zip`` variant to fall back to, or the fallback fails as well.
    """
    cache_dir = Path(out_dir)
    if filename is None:
        basename = url.split("/")[-1].split("?")[0]
        filename = basename if basename else "download.bin"
    target = cache_dir / filename

    try:
        _stream_download(url, target, timeout=timeout)
    except requests.HTTPError:
        # Retry against the same address with the .epw suffix replaced by .zip.
        zip_url = _try_zip_variant(url)
        if not zip_url:
            raise
        target = cache_dir / f"{Path(filename).stem}.zip"
        _stream_download(zip_url, target, timeout=timeout)

    # Return EPW: unpack when the file is an archive by name or by content.
    looks_zipped = target.suffix.lower() == ".zip" or zipfile.is_zipfile(target)
    return _pick_epw_from_zip(target) if looks_zipped else target


Overwriting download_epw.py


In [29]:
from resolver import QueryPoint, nearest_from_catalog
from download_epw import download_epw
from epw_loader import read_epw  # the tiny loader we made earlier

# A. Find nearby datasets (using the sample catalog)
catalog = "sample_catalog.csv"    # make sure this file is next to your notebook
q = QueryPoint(lat=41.31, lon=-72.92)  # New Haven approx
# `near` holds at most 5 rows sorted by distance_km; the URL-selection and
# download cells that follow read it.
near = nearest_from_catalog(catalog, q, k=5)
near


Unnamed: 0,name,country,kind,years,epw_url,zip_url,distance_km
1,CT.NewHaven.725090,USA,TMY3,1991-2005,,https://climate.onebuilding.org/North%20and%20...,0.0
0,CGS.Buffalo.725285,USA,TMYx,2009-2023,https://climate.onebuilding.org/North%20and%20...,,512.154956
2,New.Delhi.421820,IND,TMYx,2009-2023,,https://climate.onebuilding.org/Asia/IND/IND_N...,11655.12546


In [30]:
# B. Choose a URL (prefer epw_url; if missing, use zip_url)
# Decide which column to read first, then take its first non-null entry.
has_epw = "epw_url" in near and near["epw_url"].notna().any()
url_column = "epw_url" if has_epw else "zip_url"
url = near[url_column].dropna().iloc[0]

url


'https://climate.onebuilding.org/North%20and%20Central%20America/USA/NY/USA_NY_CGS.Buffalo.725285_TMYx.2009-2023.epw'

In [31]:
# Take the nearest row that has a zip_url (`near` is sorted by distance).
# This rebinds `url`, replacing whatever an earlier cell selected.
row = near.dropna(subset=["zip_url"]).iloc[0]
url = row["zip_url"]
print("Using URL:", url)

# download_epw returns the path of the .epw file, extracted from the archive
# when a zip was fetched. It raises requests.HTTPError if the URL cannot be fetched.
local_epw = download_epw(url, out_dir="weather_cache")
print("Saved to:", local_epw)

# NOTE(review): read_epw is imported from epw_loader, which is not shown here;
# presumably it returns (header mapping, DataFrame) — confirm.
header, df = read_epw(local_epw)
header["location"], df.shape, (df.index.min(), df.index.max())


Using URL: https://climate.onebuilding.org/North%20and%20Central%20America/USA/CT/USA_CT_New.Haven.725090_TMY3.zip


HTTPError: 404 Client Error: Not Found for url: https://climate.onebuilding.org/North%20and%20Central%20America/USA/CT/USA_CT_New.Haven.725090_TMY3.zip

In [32]:
import requests, pandas as pd

def check(url):
    """Probe ``url`` with a GET request and summarise the outcome.

    Returns:
        ``(status_code, final_url, redirect_status_codes)`` when a response
        is received, or ``(error_message, None, None)`` when the request
        raises.
    """
    try:
        # stream=True avoids downloading the body; the `with` block closes the
        # response so the connection is released instead of leaking.
        with requests.get(url, stream=True, timeout=30,
                          headers={"User-Agent":"Mozilla/5.0",
                                   "Referer":"https://climate.onebuilding.org/"}) as r:
            return r.status_code, r.url, [h.status_code for h in r.history]
    except Exception as e:
        # Best-effort diagnostic: report the failure as text rather than raise.
        return str(e), None, None

# near = output from nearest_from_catalog(...)
# Collect every non-null URL, epw_url entries first, then zip_url entries.
candidates = []
if "epw_url" in near:
    candidates += list(near["epw_url"].dropna())
if "zip_url" in near:
    candidates += list(near["zip_url"].dropna())

# Probe each URL exactly once and unpack the result, rather than issuing a
# separate HTTP request for each of the three reported fields.
probe_rows = []
for u in candidates:
    status, final_url, history = check(u)
    probe_rows.append({"url": u, "status": status, "final_url": final_url, "history": history})

pd.DataFrame(probe_rows)


Unnamed: 0,url,status,final_url,history
0,https://climate.onebuilding.org/North%20and%20...,404,https://climate.onebuilding.org/North%20and%20...,[]
1,https://climate.onebuilding.org/North%20and%20...,404,https://climate.onebuilding.org/North%20and%20...,[]
2,https://climate.onebuilding.org/Asia/IND/IND_N...,404,https://climate.onebuilding.org/Asia/IND/IND_N...,[]
