In [18]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
)


In [19]:
# Read the train / validation / test splits that Task 3 wrote to disk.
train_data = pd.read_csv("data/processed/train.csv")
valid_data = pd.read_csv("data/processed/valid.csv")
test_data = pd.read_csv("data/processed/test.csv")

# The test split is expected to cover exactly one calendar day. Recover that
# day both as a "YYYY-MM-DD" string and as a Timestamp; later cells use both.
test_day_strings = pd.to_datetime(test_data["date_ymd"]).dt.strftime("%Y-%m-%d")
distinct_test_days = test_day_strings.drop_duplicates().tolist()
assert len(distinct_test_days) == 1, f"Unexpected multiple test dates: {distinct_test_days}"
test_date_str = distinct_test_days[0]
test_date = pd.to_datetime(test_date_str)

In [20]:
# Rebuild the splits around the *target* date (the day being predicted) so that
# no label from a later window leaks into an earlier one.

# Tag every row with the split it was loaded from, then stack into one frame.
for split_name, split_frame in (("train", train_data), ("valid", valid_data), ("test", test_data)):
    split_frame["_src"] = split_name
full_data = pd.concat([train_data, valid_data, test_data], ignore_index=True)

# Parse dates (unparseable values become NaT) and order rows per station.
# The reset index produced here is what the split frames below are keyed on.
full_data["date"] = pd.to_datetime(full_data["date_ymd"], errors="coerce")
full_data = full_data.sort_values(["station_number", "date"]).reset_index(drop=True)

# When a same-day "snow" flag is present, derive the label from the next row of
# the same station.
# NOTE(review): the last row of each station has no successor and is filled with
# 0 ("no snow"), and shift(-1) takes the next *row*, not the next calendar day --
# confirm both are intended.
# NOTE(review): astype(bool) maps NaN to True -- confirm "snow" has no missing values.
if "snow" in full_data.columns:
    full_data["snow"] = full_data["snow"].astype(bool)
    next_row_snow = full_data.groupby("station_number")["snow"].shift(-1)
    full_data["y_tomorrow"] = next_row_snow.astype("float").fillna(0.0).astype(int)

# The day each row's label refers to.
full_data["target_date"] = full_data["date"] + pd.Timedelta(days=1)

# Validation covers targets in the 60 days leading up to the test date.
val_window_days = 60
val_start = test_date - pd.Timedelta(days=val_window_days)

# Boolean masks for the three windows.
target_dates = full_data["target_date"]
in_train = target_dates <= val_start
in_valid = (target_dates > val_start) & (target_dates < test_date)
in_test = full_data["date"].dt.strftime("%Y-%m-%d") == test_date_str

train_split = full_data[in_train].copy()
valid_split = full_data[in_valid].copy()
test_split = full_data[in_test].copy()


In [21]:
# Temporal checks: every training target precedes the validation window, and
# every validation target precedes the test date.
latest_train_target = train_split["target_date"].max()
assert latest_train_target <= val_start
if len(valid_split) > 0:
    latest_valid_target = valid_split["target_date"].max()
    assert latest_valid_target < test_date

# Membership checks: no row label may appear in more than one split.
split_indexes = [train_split.index, valid_split.index, test_split.index]
for position, left_index in enumerate(split_indexes):
    for right_index in split_indexes[position + 1:]:
        assert set(left_index).isdisjoint(right_index)

print("Splits validated: no leakage detected.")

Splits validated: no leakage detected.


In [22]:
# Feature engineering: same-day observations, per-station lags, a short
# trailing mean and seasonal encodings. Every feature for a row uses that row's
# day or earlier days of the same station, while the label refers to the
# following day.

base_numeric = ["total_precipitation","mean_temp","max_temperature"]
base_categoric = ["rain","fog","hail","thunder","tornado"]
target_column = "y_tomorrow"
lag_days = (1, 3, 7, 14)   # row offsets within a station
rolling_window = 3         # trailing window, includes the current row

# Cast types.
for col in base_numeric:
    # Unparseable values become NaN; a column that is absent altogether is
    # created as all-NaN so the feature list below stays valid.
    if col in full_data.columns:
        full_data[col] = pd.to_numeric(full_data[col], errors="coerce")
    else:
        full_data[col] = np.nan
for col in base_categoric:
    if col in full_data.columns:
        raw_flag = full_data[col]
        # astype(bool) alone would turn NaN into True; treat a missing
        # observation as "event not reported" instead.
        full_data[col] = raw_flag.notna() & raw_flag.astype(bool)
    else:
        # The previous `.get(col, False).astype(bool)` raised AttributeError
        # here because the fallback is a plain Python bool.
        full_data[col] = False

# Sort before lags. The frame was already ordered this way when the splits
# were cut, so row labels are expected to stay aligned with
# train_split / valid_split / test_split.
full_data = full_data.sort_values(["station_number","date"]).reset_index(drop=True)

# Lags and trailing mean, computed per station so values never cross stations.
station_key = full_data["station_number"]
for col in base_numeric:
    per_station = full_data[col].groupby(station_key)
    for lag in lag_days:
        full_data[f"{col}_lag{lag}"] = per_station.shift(lag)
    full_data[f"{col}_roll{rolling_window}_mean"] = per_station.transform(
        lambda s: s.rolling(window=rolling_window, min_periods=1).mean()
    )

# Seasonality features: calendar month plus a cyclic day-of-year encoding.
full_data["month"] = full_data["date"].dt.month
doy = full_data["date"].dt.dayofyear
full_data["doy_sin"] = np.sin(2*np.pi*doy/365.25)
full_data["doy_cos"] = np.cos(2*np.pi*doy/365.25)

# Final feature set (column order: flags, raw numerics, lags grouped by
# offset, trailing means, seasonality).
feature_columns = (
    base_categoric
    + base_numeric
    + [f"{c}_lag{lag}" for lag in lag_days for c in base_numeric]
    + [f"{c}_roll{rolling_window}_mean" for c in base_numeric]
    + ["month","doy_sin","doy_cos"]
)

# Slice back to splits using the row labels recorded when the splits were cut.
train_features = full_data.loc[train_split.index, :].copy()
valid_features = full_data.loc[valid_split.index, :].copy()
test_features  = full_data.loc[test_split.index,  :].copy()

# Train/Valid/Test design matrices and integer labels.
X_train, y_train = train_features[feature_columns], train_features[target_column].astype(int)
X_valid, y_valid = valid_features[feature_columns], valid_features[target_column].astype(int)
X_test,  y_test  = test_features[feature_columns],  test_features[target_column].astype(int)


In [23]:
# Logistic-regression pipeline.
#   1. "imputer": fill missing feature values with the column median.
#   2. "scaler" : standardise features, which matters for a regularised linear model.
#   3. "clf"    : class-weighted logistic regression with a fixed seed.
log_reg_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    (
        "clf",
        LogisticRegression(
            class_weight="balanced",
            solver="liblinear",
            max_iter=1000,
            random_state=42,
        ),
    ),
]
log_reg_pipeline = Pipeline(steps=log_reg_steps)

In [24]:
# Train on the training split only.
log_reg_pipeline.fit(X_train, y_train)

# Score the validation split: take the second predict_proba column and apply
# the default 0.5 cut-off.
valid_proba_lr = log_reg_pipeline.predict_proba(X_valid)[:, 1]
valid_pred_lr = (valid_proba_lr >= 0.5).astype(int)

auc_at_default = roc_auc_score(y_valid, valid_proba_lr)
f1_at_default = f1_score(y_valid, valid_pred_lr)
acc_at_default = accuracy_score(y_valid, valid_pred_lr)

print("Logistic Regression (0.50 threshold) | "
      f"AUC: {auc_at_default:.3f} "
      f"F1: {f1_at_default:.3f} "
      f"Acc: {acc_at_default:.3f}")

# Threshold tuning
def tune_threshold(y_true, proba, metric="f1", grid=None):
    """Pick the probability cut-off that maximises a classification metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    proba : array-like
        Predicted positive-class probabilities; must support ``>=`` against a
        scalar and ``.astype`` on the result.
    metric : str, default "f1"
        ``"f1"`` scores with ``f1_score``; any other value scores with
        ``accuracy_score``.
    grid : iterable of float, optional
        Candidate thresholds. Defaults to 19 evenly spaced values from 0.05 to
        0.95.

    Returns
    -------
    (best_threshold, best_score)
        The first threshold reaching the highest score, and that score. If no
        candidate scores above -1.0 (e.g. an empty grid) the result is
        ``(0.5, -1.0)``.
    """
    candidates = np.linspace(0.05, 0.95, 19) if grid is None else grid
    scorer = f1_score if metric == "f1" else accuracy_score

    best_t, best = 0.5, -1.0
    for threshold in candidates:
        labels_at_threshold = (proba >= threshold).astype(int)
        score = scorer(y_true, labels_at_threshold)
        # Strict comparison keeps the earliest threshold on ties.
        if score > best:
            best_t, best = threshold, score
    return best_t, best

# Choose the F1-optimal cut-off on the validation split and re-label with it.
best_t_lr, best_f1_lr = tune_threshold(y_valid, valid_proba_lr, metric="f1")
val_preds_lr = (valid_proba_lr >= best_t_lr).astype(int)

print("LogReg Tuned threshold:", best_t_lr)

# Validation metrics at the tuned cut-off (AUC does not depend on the threshold).
tuned_report = [
    "Validation @ tuned threshold | AUC:", roc_auc_score(y_valid, valid_proba_lr),
    "F1:", f1_score(y_valid, val_preds_lr),
    "Acc:", accuracy_score(y_valid, val_preds_lr),
    "Prec:", precision_score(y_valid, val_preds_lr),
    "Rec:", recall_score(y_valid, val_preds_lr),
]
print(*tuned_report)
print("ConfusionMatrix:\n", confusion_matrix(y_valid, val_preds_lr))

Logistic Regression (0.50 threshold) | AUC: 0.715 F1: 0.529 Acc: 0.795
LogReg Tuned threshold: 0.39999999999999997
Validation @ tuned threshold | AUC: 0.7148996655518394 F1: 0.5338078291814946 Acc: 0.7779661016949152 Prec: 0.4966887417218543 Rec: 0.5769230769230769
ConfusionMatrix:
 [[384  76]
 [ 55  75]]


In [25]:
# Refit on train + valid so the final model sees all data preceding the test
# day. The decision threshold stays the one tuned on validation.
X_train_valid = pd.concat([X_train, X_valid], axis=0)
y_train_valid = pd.concat([y_train, y_valid], axis=0)

log_reg_pipeline.fit(X_train_valid, y_train_valid)

# Test predictions
test_proba_lr = log_reg_pipeline.predict_proba(X_test)[:, 1]
test_pred_lr  = (test_proba_lr >= best_t_lr).astype(int)

# Test metrics. The test split is a single day and may contain only one class:
# AUC is undefined in that case and is reported as NaN, and the confusion matrix
# is pinned to labels [0, 1] so it is always 2x2.
test_metrics_lr = {
    "AUC": roc_auc_score(y_test, test_proba_lr) if y_test.nunique() > 1 else float("nan"),
    "F1": f1_score(y_test, test_pred_lr),
    "Accuracy": accuracy_score(y_test, test_pred_lr),
    "Precision": precision_score(y_test, test_pred_lr),
    "Recall": recall_score(y_test, test_pred_lr),
    "ConfusionMatrix": confusion_matrix(y_test, test_pred_lr, labels=[0, 1]).tolist(),
    "ThresholdUsed": best_t_lr
}

# Save outputs. Identifiers are taken from test_features, the frame X_test was
# sliced from, so each prediction lines up with its own station/date row.
# (The previous `test_df` is not defined anywhere in this notebook.)
out_lr = test_features[["station_number","date_ymd"]].copy()
out_lr["pred_proba_snow_tomorrow"] = test_proba_lr
out_lr["pred_label_snow_tomorrow"] = test_pred_lr

Path("data/processed").mkdir(parents=True, exist_ok=True)
out_lr.to_csv("data/processed/test_predictions_logreg.csv", index=False)
pd.DataFrame([test_metrics_lr]).to_csv("data/processed/test_metrics_logreg.csv", index=False)

print("Wrote logistic regression predictions and metrics.")
print("Test Accuracy (LogReg):", test_metrics_lr["Accuracy"])

Wrote logistic regression predictions and metrics.
Test Accuracy (LogReg): 0.9


In [26]:
# Compare LightGBM vs Logistic Regression on the test metrics saved to disk.
# NOTE(review): test_metrics_lightgbm.csv is not written in this notebook --
# presumably produced by a separate LightGBM notebook; confirm it exists.

# Read metrics and label each row with its model, without mutating the loaded
# frames in place.
lgbm_metrics = pd.read_csv("data/processed/test_metrics_lightgbm.csv").assign(Model="LightGBM")
logreg_metrics = pd.read_csv("data/processed/test_metrics_logreg.csv").assign(Model="Logistic Regression")

# Combine into one table. ignore_index gives the rows distinct labels (0, 1)
# instead of the duplicated 0, 0 inherited from the two single-row files.
comparison_df = pd.concat([lgbm_metrics, logreg_metrics], axis=0, ignore_index=True)

# Fixed column order; a missing metric column raises KeyError rather than
# being silently dropped.
cols = ["Model","AUC","F1","Accuracy","Precision","Recall","ConfusionMatrix","ThresholdUsed"]
comparison_df = comparison_df[cols]

# Display comparison
print("Test Metrics Comparison:")
display(comparison_df)

Test Metrics Comparison:


Unnamed: 0,Model,AUC,F1,Accuracy,Precision,Recall,ConfusionMatrix,ThresholdUsed
0,LightGBM,0.6875,0.666667,0.9,1.0,0.5,"[[8, 0], [1, 1]]",0.5
0,Logistic Regression,0.8125,0.666667,0.9,1.0,0.5,"[[8, 0], [1, 1]]",0.4
