Load and unzip the NYC taxi weather data

In [1]:
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
import pickle
from kaggle.api.kaggle_api_extended import KaggleApi

def load_weather_data():
    dataset_slug = "pschale/nyc-taxi-wunderground-weather"
    zip_name = "nyc-taxi-wunderground-weather.zip"
    csv_name = "weatherdata.csv"

    data_dir = Path("data")
    package_dir = Path("data_packages")

    zip_path = package_dir / zip_name
    csv_path = data_dir / csv_name
    pkl_path = data_dir / "weatherdata.pkl"

    # Schritt 0: Cache verwenden
    if pkl_path.is_file():
        with open(pkl_path, "rb") as f:
            return pickle.load(f)

    # Schritt 1: ZIP herunterladen, wenn sie nicht existiert
    if not zip_path.is_file():
        package_dir.mkdir(parents=True, exist_ok=True)
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(dataset_slug, path=str(package_dir), unzip=False)

    # Schritt 2: Entpacken, falls CSV noch nicht existiert
    if not csv_path.is_file():
        data_dir.mkdir(parents=True, exist_ok=True)
        with ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extract(csv_name, path=data_dir)

    # Schritt 3: CSV laden und Cache schreiben
    if not csv_path.is_file():
        raise FileNotFoundError(f"{csv_path} wurde nicht gefunden – Entpackung fehlgeschlagen.")

    df = pd.read_csv(csv_path)
    with open(pkl_path, "wb") as f:
        pickle.dump(df, f)

    return df

# Usage: obtain the weather DataFrame when this cell runs.
weather_df = load_weather_data()



Dataset URL: https://www.kaggle.com/datasets/pschale/nyc-taxi-wunderground-weather
