In [1]:
import pandas as pd
from pathlib import Path

df = pd.read_csv("data/interim/part1_task1.csv")
stations_ok = pd.read_csv("data/interim/stations_ok.csv") if Path("data/interim/stations_ok.csv").exists() else None

# If stations_ok exists, filter rows to those station_numbers, otherwise keep stations 725300–725330
if stations_ok is not None:
    ok_set = set(stations_ok["station_number"])
    df = df[df["station_number"].isin(ok_set)].copy()
else:
    
    df = df[(df["station_number"]>=725300)&(df["station_number"]<=725330)].copy()

# Build a proper datetime column
df["date"] = pd.to_datetime(df["date_ymd"], errors="coerce")
df = df.sort_values(["station_number","date"])
df["snow"] = df["snow"].astype(bool)
df.head()


Unnamed: 0,station_number,wban_number,year,month,day,date_ymd,mean_temp,min_temperature,max_temperature,total_precipitation,snow_depth,snow,rain,fog,hail,thunder,tornado,date
322743,725300,94846,2000,1,1,2000-01-01,38.400002,,26.1,0.0,,False,False,False,False,False,False,2000-01-01
85832,725300,94846,2000,1,2,2000-01-02,47.799999,,36.0,0.01,,True,True,True,True,True,True,2000-01-02
322904,725300,94846,2000,1,3,2000-01-03,37.5,,35.099998,0.01,,True,True,True,True,True,True,2000-01-03
163401,725300,94846,2000,1,4,2000-01-04,29.700001,,26.1,0.25,1.2,True,True,True,True,True,True,2000-01-04
47233,725300,94846,2000,1,5,2000-01-05,20.4,,12.2,0.0,,True,True,True,True,True,True,2000-01-05


In [2]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

today = datetime.today()
test_date = today - relativedelta(years=20) + timedelta(days=1)
test_date_str = test_date.strftime("%Y-%m-%d")
test_date, test_date_str


(datetime.datetime(2005, 8, 30, 20, 56, 4, 391953), '2005-08-30')

In [3]:

# Create label for will it snow tomorrow?
df = df.sort_values(["station_number","date"])
df["y_tomorrow"] = df.groupby("station_number")["snow"].shift(-1).astype("float")  # stays NaN on the last date per station
df[["station_number","date_ymd","snow","y_tomorrow"]].head()

Unnamed: 0,station_number,date_ymd,snow,y_tomorrow
322743,725300,2000-01-01,False,1.0
85832,725300,2000-01-02,True,1.0
322904,725300,2000-01-03,True,1.0
163401,725300,2000-01-04,True,1.0
47233,725300,2000-01-05,True,0.0


In [4]:
feature_cols = [
    "snow_depth","total_precipitation","mean_temp","min_temperature","max_temperature",
    "rain","fog","hail","thunder","tornado"
]

# Cast numeric to numeric where applicable and booleans to bool
for c in ["snow_depth","total_precipitation","mean_temp","min_temperature","max_temperature"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")
for c in ["rain","fog","hail","thunder","tornado"]:
    if c in df.columns:
        df[c] = df[c].astype(bool)


In [5]:
# Masks for splits
test_mask = (df["date"].dt.strftime("%Y-%m-%d") == test_date_str)
test_df = df[test_mask].copy()

pre_test = df[(~test_mask) & (df["date"] < pd.to_datetime(test_date_str))].copy()

# Validation window size (60 days)
val_window_days = 60
val_start = pd.to_datetime(test_date_str) - pd.Timedelta(days=val_window_days)

valid_df = pre_test[pre_test["date"] > val_start].copy()
train_df = pre_test[pre_test["date"] <= val_start].copy()

train_df.shape, valid_df.shape, test_df.shape


((20024, 19), (590, 19), (10, 19))

In [6]:
# No overlap
assert set(train_df.index).isdisjoint(valid_df.index), "Train and valid sets overlap"
assert set(train_df.index).isdisjoint(test_df.index),  "Train and test sets overlap"
assert set(valid_df.index).isdisjoint(test_df.index),  "Valid and test sets overlap"

# Test date check (robust)
if len(test_df):
    dates_unique = set(test_df["date"].dt.strftime("%Y-%m-%d").unique())
    assert dates_unique == {test_date_str}, f"Unexpected test dates: {sorted(dates_unique)}"

# Label presence summary
print(
    "NaN fraction of y_tomorrow:",
    "train", train_df["y_tomorrow"].isna().mean() if "y_tomorrow" in train_df else None,
    "valid", valid_df["y_tomorrow"].isna().mean() if "y_tomorrow" in valid_df else None,
    "test",  test_df["y_tomorrow"].isna().mean()  if "y_tomorrow" in test_df  else None
)


NaN fraction of y_tomorrow: train 0.0 valid 0.0 test 0.0


In [7]:
Path("data/processed").mkdir(parents=True, exist_ok=True)

keep_cols = ["station_number","date_ymd","date","y_tomorrow"] + feature_cols
(train_df[keep_cols]).to_csv("data/processed/train.csv", index=False)
(valid_df[keep_cols]).to_csv("data/processed/valid.csv", index=False)
(test_df[keep_cols]).to_csv("data/processed/test.csv", index=False)

print("Saved:",
      "data/processed/train.csv",
      "data/processed/valid.csv",
      "data/processed/test.csv", sep="\n")


Saved:
data/processed/train.csv
data/processed/valid.csv
data/processed/test.csv
