# In [None]:
import datetime
import os
import socket
import time
from datetime import date, timedelta
from io import BytesIO
from zipfile import BadZipFile, ZipFile

import numpy as np
import pandas as pd
import requests

# In [None]:
# Directory that receives one "<YYYY_MM_DD>.parquet" file per processed day.
output_dir = "processed"

# Create the directory on a fresh run; os.listdir() below would otherwise
# raise FileNotFoundError before anything has been downloaded.
os.makedirs(output_dir, exist_ok=True)

# Inclusive window of days to backfill.
start = date(2023, 1, 1)
end = date(2023, 12, 31)

# Parquet outputs already present from earlier runs.
existing_files = sorted(
    f for f in os.listdir(output_dir)
    if f.endswith(".parquet")
)
completed = set(existing_files)

# Process every day in the window that has no output file yet.
# Resuming from "the day after the newest file" would permanently skip any
# earlier day whose download failed, and would crash on a parquet file whose
# name is not a date. Checking for the expected file name avoids both.
# NOTE: a day that yields no valid rows writes no file, so it is retried on
# the next run.
date_range = [
    day
    for day in (start + timedelta(days=i) for i in range((end - start).days + 1))
    if f"{day.strftime('%Y_%m_%d')}.parquet" not in completed
]

# Retry policy for the download step (the connection drops intermittently).
MAX_RETRIES = 10
RETRY_WAIT = 60  # seconds


def _download_day(url):
    """Fetch one daily AIS zip and return its first member as a DataFrame.

    Makes up to MAX_RETRIES attempts, waiting RETRY_WAIT seconds between
    them. Non-200 responses, request/socket errors and unreadable archives
    all count as a failed attempt.

    Args:
        url: Address of the zip archive to download.

    Returns:
        The parsed DataFrame, or None if every attempt failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, timeout=60)
            if r.status_code == 200:
                with ZipFile(BytesIO(r.content)) as z:
                    # The CSV is read from the first entry of the archive.
                    csv_name = z.namelist()[0]
                    with z.open(csv_name) as f:
                        return pd.read_csv(f)
            print(f"Bad response ({r.status_code}). Retrying...")
        except (requests.exceptions.RequestException, socket.timeout) as e:
            print(f"Connection failed: {e}. Retrying in {RETRY_WAIT} seconds...")
        except BadZipFile as e:
            # A truncated or corrupt download is worth another attempt
            # rather than dropping the whole day.
            print(f"Corrupt archive: {e}. Retrying in {RETRY_WAIT} seconds...")

        # No point waiting once the final attempt has been used up.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_WAIT)

    return None


def _clean_tracks(df):
    """Filter one day of AIS rows and return them sorted by vessel and time.

    Args:
        df: Raw daily DataFrame; must contain the TransceiverClass, SOG,
            Length, Heading, LAT, LON, MMSI and BaseDateTime columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    df = df[df.TransceiverClass == 'A']                     # large cargo ships, not personal vessels
    df = df[(df.SOG > 1) & (df.SOG < 80)]                   # drops ships in harbor, bad data
    df = df[(df.Length > 30) & (df.Length < 400)]           # drops small ships, bad data
    df = df.replace({'Heading': {511: np.nan}})             # heading 511 means no heading reported; set to NaN
    df = df[(df["LAT"] >= 21.0) & (df["LAT"] <= 31.0) &     # rough bounding box around the gulf (could be smarter)
            (df["LON"] >= -97.0) & (df["LON"] <= -81.0)]

    # Order each vessel's reports chronologically.
    df = df.sort_values(by=['MMSI', 'BaseDateTime']).reset_index(drop=True)

    # Drop weak tracks: vessels with fewer than 5 remaining reports.
    mmsi_counts = df.MMSI.value_counts()
    active = mmsi_counts[mmsi_counts >= 5].index
    return df[df["MMSI"].isin(active)].reset_index(drop=True)


# Download, clean and store each pending day; a failure on one day is
# reported and the loop moves on to the next.
for d in date_range:
    try:
        # Year comes from the date itself so the loop is not tied to 2023.
        fname = f"AIS_{d:%Y_%m_%d}.zip"
        url = f"https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{d.year}/{fname}"
        print(f"Processing {url}")

        df = _download_day(url)
        if df is None:
            print(f"Failed to fetch data for {d} after {MAX_RETRIES} retries. Skipping...")
            continue

        df = _clean_tracks(df)

        # Save (into output_dir, the same directory the resume logic scans)
        if len(df):
            out_path = os.path.join(output_dir, f"{d:%Y_%m_%d}.parquet")
            df.to_parquet(out_path, index=False)
            print(f"Saved {len(df)} rows to {out_path}")
        else:
            print("No valid rows; skipping save.")

    except Exception as e:
        # Deliberate best-effort boundary: one bad day must not abort the backfill.
        print(f"Failed on {d}: {e}")
