## Predictive modeling - Logistic Regression model (Rain tomorrow)

In [16]:
#Imports & config
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix
)

# Notebook-wide settings: never truncate columns when displaying wide frames,
# and fix NumPy's legacy global RNG so stochastic steps are repeatable.
pd.options.display.max_columns = None
np.random.seed(seed=42)


In [17]:
#Load data
# Location of the cleaned daily-weather CSV, relative to the notebook.
PATH = "../data/processed/kandy_weather_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=PATH)

# Quick peek: (rows, columns) followed by the first five records
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)


(4916, 17)


Unnamed: 0,time,weathercode,max_temperature,min_temperature,mean_temperature,apparent_temperature_mean,shortwave_radiation,rainfall,max_wind_speed,max_wind_gust,dominant_Wind_Direction,evapotranspiration,rain_today,rain_tomorrow,year,month,day
0,2010-01-01,2,25.3,18.7,22.0,23.8,18.09,0.0,15.3,38.9,57,3.43,0,0,2010,1,1
1,2010-01-02,2,25.6,19.4,22.2,24.8,15.47,0.0,13.3,33.1,46,2.95,0,1,2010,1,2
2,2010-01-03,51,25.0,19.5,22.1,25.0,14.99,0.1,10.7,28.1,40,2.84,1,1,2010,1,3
3,2010-01-04,51,25.3,19.0,21.9,24.3,16.55,0.6,10.4,29.2,23,3.16,1,0,2010,1,4
4,2010-01-05,1,25.0,16.3,20.8,22.6,22.68,0.0,9.7,27.0,28,4.18,0,0,2010,1,5


### Basic cleaning & assertions

- Parse dates

- Ensure target is clean (binary 0/1)

- Remove rows missing critical fields

In [18]:
#Basic cleaning
# Parse timestamps; values that cannot be parsed become NaT and are dropped below.
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Seasonality via MONTH as categorical
df["month"] = df["time"].dt.month.astype("Int64").astype("category")

# Ensure target exists
assert "rain_tomorrow" in df.columns, "Missing target column: rain_tomorrow"

# Non-integer targets (strings, booleans, floats) are mapped onto 0/1.
# Labels the mapping does not know become <NA>. They are kept in the nullable
# Int64 dtype for now so that the dropna further down can remove them; casting
# to plain int at this point would raise on the first missing label.
if not pd.api.types.is_integer_dtype(df["rain_tomorrow"]):
    map_bin = {"Yes":1,"No":0,"Y":1,"N":0, True:1, False:0, 1:1, 0:0}
    df["rain_tomorrow"] = df["rain_tomorrow"].map(map_bin).astype("Int64")

# If 'rain_today' exists, coerce to numeric (not mandatory if absent)
if "rain_today" in df.columns:
    df["rain_today"] = pd.to_numeric(df["rain_today"], errors="coerce")

# Drop rows without time or target (rare but safest)
df = df.dropna(subset=["time", "rain_tomorrow"]).reset_index(drop=True)

# With missing targets gone the column can safely become a plain integer,
# and it must contain nothing except the two class labels.
df["rain_tomorrow"] = df["rain_tomorrow"].astype(int)
assert df["rain_tomorrow"].isin([0, 1]).all(), "rain_tomorrow must be binary 0/1"

print("Rows, Cols:", df.shape)
print("Target balance (0=no rain, 1=rain):")
print(df["rain_tomorrow"].value_counts(normalize=True).round(3))


Rows, Cols: (4916, 17)
Target balance (0=no rain, 1=rain):
rain_tomorrow
1    0.8
0    0.2
Name: proportion, dtype: float64


### Engineer wind direction (circular) features

- Convert degrees → radians → sin/cos

- Drop the raw direction column to avoid redundancy

In [19]:
# Circular encoding for wind direction (if present)
if "dominant_Wind_Direction" in df.columns:
    # Force numeric values (invalid entries -> NaN) and fold into [0, 360)
    df["dominant_Wind_Direction"] = pd.to_numeric(
        df["dominant_Wind_Direction"], errors="coerce"
    ).mod(360)
    rad = np.radians(df["dominant_Wind_Direction"])
    # Add the sine/cosine pair and discard the raw angle in a single chain
    df = df.assign(
        wind_dir_sin=np.sin(rad),
        wind_dir_cos=np.cos(rad),
    ).drop(columns=["dominant_Wind_Direction"], errors="ignore")


### Chronological ordering

In [20]:
# Order rows by date and renumber the index; the later time-based split
# slices by position, so row order must follow the calendar.
df = df.sort_values(by="time", ignore_index=True)

first_day = df["time"].min()
last_day = df["time"].max()
print(f"Date range: {first_day} → {last_day}")


Date range: 2010-01-01 00:00:00 → 2023-06-17 00:00:00


In [21]:
# Quick nulls & schema check
print("Top columns by missing values:")
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False).iloc[:12])

# Preview a handful of key columns, skipping any that are not in the frame
candidate_cols = (
    "weathercode", "rainfall", "max_wind_speed", "max_wind_gust",
    "evapotranspiration", "rain_today", "mean_temperature",
    "wind_dir_sin", "wind_dir_cos",
)
key_cols = [name for name in candidate_cols if name in df.columns]
df[key_cols].head(n=3)


Top columns by missing values:
time                         0
weathercode                  0
max_temperature              0
min_temperature              0
mean_temperature             0
apparent_temperature_mean    0
shortwave_radiation          0
rainfall                     0
max_wind_speed               0
max_wind_gust                0
evapotranspiration           0
rain_today                   0
dtype: int64


Unnamed: 0,weathercode,rainfall,max_wind_speed,max_wind_gust,evapotranspiration,rain_today,mean_temperature,wind_dir_sin,wind_dir_cos
0,2,0.0,15.3,38.9,3.43,0,22.0,0.838671,0.544639
1,2,0.0,13.3,33.1,2.95,0,22.2,0.71934,0.694658
2,51,0.1,10.7,28.1,2.84,1,22.1,0.642788,0.766044


### Build feature lists dynamically

In [22]:
# Feature lists (month categorical)

target_col = "rain_tomorrow"
# Columns that must never be used as predictors
exclude = {"time", target_col}

# Candidate numeric predictors (no year/day; no DOY sin/cos)
maybe_numeric = {
    "max_temperature", "min_temperature", "mean_temperature",
    "apparent_temperature_mean", "shortwave_radiation", "rainfall",
    "max_wind_speed", "max_wind_gust", "evapotranspiration",
    "rain_today", "wind_dir_sin", "wind_dir_cos",
}
# Keep only candidates actually present in the frame, in alphabetical order
numeric_features = sorted(
    maybe_numeric.intersection(df.columns).difference(exclude)
)

# Categorical predictors: month always, weathercode only when available
has_weathercode = "weathercode" in df.columns
if has_weathercode:
    df["weathercode"] = df["weathercode"].astype("Int64").astype("category")
categorical_features = ["month", "weathercode"] if has_weathercode else ["month"]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


Numeric features: ['apparent_temperature_mean', 'evapotranspiration', 'max_temperature', 'max_wind_gust', 'max_wind_speed', 'mean_temperature', 'min_temperature', 'rain_today', 'rainfall', 'shortwave_radiation', 'wind_dir_cos', 'wind_dir_sin']
Categorical features: ['month', 'weathercode']


### Chronological train/validation/test split

In [None]:
# Time-based split with validation (70/10/20)
# Rows are taken by position, so the frame must already be in date order.

n = len(df)
test_frac = 0.20
val_frac  = 0.10  # of the full dataset
train_end = int(n * (1 - test_frac - val_frac))  # 70%
val_end   = int(n * (1 - test_frac))             # 80%

# Three consecutive, non-overlapping position ranges: train | val | test
train_df, val_df, test_df = (
    df.iloc[start:stop].copy()
    for start, stop in ((0, train_end), (train_end, val_end), (val_end, n))
)

feature_cols = numeric_features + categorical_features

X_train, y_train = train_df[feature_cols], train_df[target_col].astype(int)
X_val,   y_val   = val_df[feature_cols],   val_df[target_col].astype(int)
X_test,  y_test  = test_df[feature_cols],  test_df[target_col].astype(int)

# Report the shape and share of positive labels for each partition
print("Sizes:")
for split_label, X_part, y_part in (
    ("  Train:", X_train, y_train),
    ("  Val  :", X_val, y_val),
    ("  Test :", X_test, y_test),
):
    print(split_label, X_part.shape, "Pos rate:", y_part.mean().round(3))


Sizes:
  Train: (3441, 14) Pos rate: 0.789
  Val  : (491, 14) Pos rate: 0.786
  Test : (984, 14) Pos rate: 0.849


In [None]:
# Preprocess: impute/scale numeric, impute/one-hot categorical
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipe = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array; levels unseen at fit time are ignored.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
categorical_pipe = Pipeline(categorical_steps)

# Send each feature list through its own branch; unlisted columns are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features),
    ],
    remainder="drop",
)

print("Preprocessing ready.")
