## 325 - Adding Fallback Data and Cleanup to a Local Data Cache with Python

Video walkthrough: [YouTube](https://www.youtube.com/watch?v=DpFANOLqvV4)

In [1]:
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import os
import time
from siphon.simplewebservice.iastate import IAStateUpperAir
from siphon.simplewebservice.wyoming import WyomingUpperAir

In [2]:
def get_filename(date: datetime, station: str) -> str:
    """
    Build the cache filename for one sounding.

    The name is the date rendered as YYYYMMDDHH, an underscore, the station
    identifier and a ``.txt`` extension, for example ``2014110812_OUN.txt``.
    """
    timestamp = date.strftime('%Y%m%d%H')
    return '{}_{}.txt'.format(timestamp, station)

In [3]:
def save_to_cache(df: pd.DataFrame, file_path: Path):
    """
    Save the pandas DataFrame to a CSV text file without the index column.

    The data is first written to a sibling ``<name>.tmp`` file and then moved
    onto ``file_path`` with ``os.replace``. An interrupted write therefore
    never leaves a truncated file under the final name, which matters because
    the cache lookup treats any existing file as valid data.

    Parameters
    ----------
    df : pd.DataFrame
        Data to store.
    file_path : Path
        Final location of the cache file; its parent directory must exist.
    """
    target = Path(file_path)
    # The ``.tmp`` suffix keeps the partial file out of the ``*.txt`` pattern
    # used by the cache cleanup.
    partial = target.with_name(target.name + '.tmp')
    try:
        df.to_csv(partial, index=False)
        os.replace(partial, target)
    finally:
        # Only still present when the write or the move failed.
        if partial.exists():
            partial.unlink()
    print(f'Data saved to cache: {file_path}')

In [4]:
def load_from_cache(file_path: Path) -> pd.DataFrame:
    """
    Load a pandas DataFrame from the saved cache CSV file.

    CSV stores timestamps as text, so a ``time`` column read back from disk
    would otherwise be plain strings. When that column is present it is
    converted back to datetimes so cached data has a usable time dtype.
    NOTE(review): this assumes the remote sources return ``time`` as
    datetimes - confirm against the siphon documentation.

    Parameters
    ----------
    file_path : Path
        Location of the cached CSV file.

    Returns
    -------
    pd.DataFrame
        The cached data.
    """
    print(f'Loading from cache: {file_path}')
    df = pd.read_csv(file_path)
    # Files without a ``time`` column are returned exactly as read.
    if 'time' in df.columns:
        df['time'] = pd.to_datetime(df['time'])
    return df

In [5]:
def fetch_sounding(date: datetime, station: str) -> pd.DataFrame:
    """
    Get sounding data from the remote sources, trying Wyoming and then Iowa.

    Parameters
    ----------
    date : datetime
        Date and hour of the requested sounding.
    station : str
        Station identifier passed to each provider.

    Returns
    -------
    pd.DataFrame
        Whatever the first provider that succeeds returns.

    Raises
    ------
    RuntimeError
        If both providers fail. The message names the request and both
        underlying errors, and the Iowa State error is attached as the cause.
    """
    try:
        return WyomingUpperAir.request_data(date, station)
    except Exception as wyoming_err:
        print(f'Wyoming failed: {wyoming_err}')
        # The fallback stays nested so the Wyoming error remains available
        # for the final message and in the exception context.
        try:
            return IAStateUpperAir.request_data(date, station)
        except Exception as iastate_err:
            print(f'Iowa Failed: {iastate_err}')
            raise RuntimeError(
                f'Failed to get sounding for {station} at {date} '
                f'(Wyoming: {wyoming_err}; Iowa State: {iastate_err})'
            ) from iastate_err

In [6]:
def should_run_cleanup(cache_dir: Path) -> bool:
    """
    Decide whether the cache cleanup is due; it is due at most once per day.

    The decision is based on the modification time of the
    ``.cleanup_marker`` file inside ``cache_dir``.

    Parameters
    ----------
    cache_dir : Path
        Directory holding the cache and its marker file.

    Returns
    -------
    bool
        True when no marker exists or the marker is at least 24 hours old.
    """
    marker_file = cache_dir / '.cleanup_marker'
    try:
        # A single stat call avoids the gap between an existence check and
        # reading the timestamp, during which the marker could disappear.
        last_run = marker_file.stat().st_mtime
    except (FileNotFoundError, NotADirectoryError):
        return True
    # Epoch seconds are compared directly so daylight-saving shifts in local
    # wall-clock time cannot stretch or shrink the 24-hour window.
    return time.time() - last_run >= 86400

In [7]:
def cleanup_old_cache(cache_dir: Path, expiration_days: int = 30):
    """
    Delete cached ``*.txt`` files that have not been used recently.

    A file is removed when the later of its access and modification times is
    more than ``expiration_days`` days in the past. Afterwards the
    ``.cleanup_marker`` file in ``cache_dir`` is touched to record the run.

    Parameters
    ----------
    cache_dir : Path
        Existing directory containing the cached files.
    expiration_days : int, optional
        Age in days after which an unused file is deleted (default 30).
    """
    now = time.time()
    expiration_in_seconds = expiration_days * 86400

    for file in cache_dir.glob('*.txt'):
        try:
            # Directories and broken links that match the pattern are not
            # cache entries and cannot be unlinked as files.
            if not file.is_file():
                continue
            stats = file.stat()
            # Access times are not reliably updated on every filesystem
            # configuration, so a recent write also counts as recent use.
            last_used = max(stats.st_atime, stats.st_mtime)
            if now - last_used > expiration_in_seconds:
                file.unlink()
        except FileNotFoundError:
            # The entry vanished while scanning; nothing left to clean up.
            continue

    marker_file = cache_dir / '.cleanup_marker'
    marker_file.touch()

In [8]:
def get_sounding_data(date: datetime, station: str, cache_dir: Path) -> pd.DataFrame:
    """
    Return sounding data, preferring the local cache over the remote sources.

    The cache directory is created when missing, the periodic cleanup is run
    when due, and then the requested sounding is either read from its cache
    file or downloaded and written to the cache for next time.
    """
    # Step 1: the cache directory has to exist before anything else.
    cache_is_missing = not cache_dir.exists()
    if cache_is_missing:
        cache_dir.mkdir(parents=True, exist_ok=True)
        print(f'Created cache directory: {cache_dir}')

    # Step 2: housekeeping, only when it is due.
    cleanup_due = should_run_cleanup(cache_dir)
    if cleanup_due:
        print('Running cleanup')
        cleanup_old_cache(cache_dir)

    # Step 3: serve from the cache when possible.
    cached_file = cache_dir / get_filename(date, station)
    if cached_file.exists():
        print('Loading Existing')
        return load_from_cache(cached_file)

    # Step 4: cache miss - download and remember the result.
    print('Downloading new data')
    downloaded = fetch_sounding(date, station)
    save_to_cache(downloaded, cached_file)
    return downloaded

In [9]:
# Cache location, given as a relative path (resolved against the current working directory).
cache_directory = Path('sounding_data')

In [10]:
# Request the OUN sounding for hour 12 on 2014-11-08; it is read from the cache
# directory when a matching file exists and downloaded otherwise.
df_sounding = get_sounding_data(datetime(2014, 11, 8, 12), 'OUN', cache_directory)

Running cleanup
Loading Existing
Loading from cache: sounding_data/2014110812_OUN.txt


In [11]:
# Preview the first rows of the returned DataFrame.
df_sounding.head()

Unnamed: 0,pressure,height,temperature,dewpoint,direction,speed,u_wind,v_wind,station,station_number,time,latitude,longitude,elevation,pw
0,977.0,345,15.0,7.0,5.0,10.0,-0.871557,-9.961947,OUN,72357,2014-11-08 12:00:00,35.18,-97.44,345.0,20.07
1,975.0,362,14.8,6.8,5.0,11.0,-0.958713,-10.958142,OUN,72357,2014-11-08 12:00:00,35.18,-97.44,345.0,20.07
2,946.8,610,13.1,4.6,5.0,30.0,-2.614672,-29.885841,OUN,72357,2014-11-08 12:00:00,35.18,-97.44,345.0,20.07
3,925.0,807,11.8,2.8,15.0,39.0,-10.093943,-37.671107,OUN,72357,2014-11-08 12:00:00,35.18,-97.44,345.0,20.07
4,913.2,914,10.8,2.8,25.0,45.0,-19.017822,-40.78385,OUN,72357,2014-11-08 12:00:00,35.18,-97.44,345.0,20.07
