In [1]:
import sys
import os
import glob

In [2]:
# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved. This must run before the
# `from src...` imports, and it depends on where the kernel was started.
sys.path.append(os.path.abspath('..'))
# Project-local pipeline stages used by process_and_save.
from src.preprocessing import load_data, clean_column_names, parse_dates, enforce_hourly_index
from src.features import add_time_features, add_lag_features, fill_missing_lags

In [3]:
# Input / output locations, resolved relative to the kernel's working directory.
RAW_DIR = "../data/raw/"
PROCESSED_DIR = "../data/processed/"

# Make sure the output folder exists up front; a pre-existing folder is fine.
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [4]:
# Gather the raw per-site training CSVs and put them in a stable
# (lexicographic) order so repeated runs process sites in the same sequence.
site_files = glob.glob(os.path.join(RAW_DIR, 'site_*_train_data.csv'))
site_files.sort()
print(f"Found {len(site_files)} files to process.")

Found 7 files to process.


In [5]:
def process_and_save(filepath):
    """Run the full preprocessing/feature pipeline on one raw site file.

    The site name is taken as the part of the file's base name that precedes
    ``'_train'``. The file is loaded, passed through the cleaning and
    feature-engineering helpers in a fixed order, stripped of the satellite
    columns (when present) and of rows lacking either target value, and then
    written to ``PROCESSED_DIR`` as ``<site_name>_hourly_features.csv``.

    Args:
        filepath: Path to a raw site CSV; handed to ``load_data`` unchanged.

    Returns:
        None. The result is written to disk and a one-line progress message
        (site name, output path, final shape) is printed.
    """
    # Site identifier: everything before '_train' in the base file name.
    site_name = os.path.basename(filepath).split('_train')[0]
    print(f"Processing {site_name}...", end=" ")

    # Load, then apply the single-argument cleaning / time-feature stages in order.
    frame = load_data(filepath)
    for stage in (clean_column_names, parse_dates, enforce_hourly_index, add_time_features):
        frame = stage(frame)

    # Lag features for offsets 1 and 24, followed by filling the lag gaps.
    frame = add_lag_features(frame, lags=[1, 24])
    frame = fill_missing_lags(frame)

    # Drop the satellite columns, tolerating files where some are absent.
    satellite_cols = ['NO2_satellite', 'HCHO_satellite', 'ratio_satellite']
    present = [col for col in satellite_cols if col in frame.columns]
    frame = frame.drop(columns=present)

    # Keep only rows that have both target values.
    labelled = frame.dropna(subset=['O3_target', 'NO2_target'])

    # Write the result using to_csv defaults.
    output_path = os.path.join(PROCESSED_DIR, f'{site_name}_hourly_features.csv')
    labelled.to_csv(output_path)

    print(f"Saved to {output_path} (Shape: {labelled.shape})")

In [6]:
# Process every discovered site file. Failures are recorded rather than
# just printed, so the closing summary reflects what actually happened
# instead of always claiming success.
failed_files = []
for filepath in site_files:
    try:
        process_and_save(filepath)
    except Exception as e:
        # Deliberate best-effort: one bad file must not stop the other sites.
        failed_files.append(filepath)
        print(f"\nError processing {filepath}: {e}")

if failed_files:
    print(f"\n{len(failed_files)} of {len(site_files)} sites failed: {failed_files}")
elif site_files:
    print("\nAll sites processed successfully.")
else:
    # Nothing matched the glob; do not report success for zero work.
    print("\nNo site files were found; nothing was processed.")

Processing site_1... Saved to ../data/processed/site_1_hourly_features.csv (Shape: (25081, 19))
Processing site_2... Saved to ../data/processed/site_2_hourly_features.csv (Shape: (25969, 19))
Processing site_3... Saved to ../data/processed/site_3_hourly_features.csv (Shape: (21913, 19))
Processing site_4... Saved to ../data/processed/site_4_hourly_features.csv (Shape: (24505, 19))
Processing site_5... Saved to ../data/processed/site_5_hourly_features.csv (Shape: (25081, 19))
Processing site_6... Saved to ../data/processed/site_6_hourly_features.csv (Shape: (26353, 19))
Processing site_7... Saved to ../data/processed/site_7_hourly_features.csv (Shape: (22777, 19))

All sites processed successfully.
