In [2]:
# ============================================================
#  Raw‑test‑set generator  |  Hirakud & upstream stations
#  ------------------------------------------------------------
#  • No smoothing, no scaling – sheet values as read (gaps forward/back-filled)
#  • Produces y_test arrays shaped (samples, N_STEPS_OUT) for every gauge
# ============================================================

import numpy as np
import pandas as pd
from numpy  import array
from pathlib import Path

# ---------- CONFIG ----------

# Reproducibility: seed NumPy's global random generator once, up front.
SEED = 42
np.random.seed(SEED)

# Workbook location (relative path) -- adjust if the folder/name differ.
EXCEL_FILE = "Data/new data.xlsx"

# Column headers looked up in the workbook.
DATE_COLUMN = "Date"                          # calendar date
RAINFALL_COL = "Mean_areal_rainfall_upper"    # rainfall

# Discharge / water-level columns; each one gets its own raw test set.
# These strings are passed verbatim as column names when the sheet is read.
STATIONS = [
    "Inflow",  # Hirakud reservoir inflow
    "Sundargarh", "Kurubhata", "Basantpur", "Ghatora",
    "Kelo", "Paramanpur", "Simga", "Rajim",
]

# Window sizes in days: look-back length, then forecast horizon.
N_STEPS_IN, N_STEPS_OUT = 30, 10

# ------------------------------------------------------------

# 1. ------------ READ THE WORKBOOK (no smoothing / scaling) --
print("Reading Excel file...")
# Only the date, rainfall and station columns are loaded from the sheet.
use_cols = [DATE_COLUMN, RAINFALL_COL] + STATIONS
df = pd.read_excel(EXCEL_FILE, usecols=use_cols, engine="openpyxl")

# basic clean-up: parse the date column and use it as the index
# NOTE(review): the date format is inferred by pandas here. If the sheet
# stores dates as ambiguous text (e.g. day-first), pass an explicit
# `format=` / `dayfirst=` -- confirm against the workbook.
df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
df = df.set_index(DATE_COLUMN)

# Forward/back-filling and the label-based date slices below both rely on
# chronological row order. Sort only when the sheet is out of order; the
# stable sort keeps same-day rows in their original relative order.
if not df.index.is_monotonic_increasing:
    df = df.sort_index(kind="mergesort")

# Fill gaps from the previous row first, then back-fill whatever is still
# missing at the very start of the record.
df = df.ffill().bfill()

print(f"Workbook shape: {df.shape}")
print(f"Date range     : {df.index.min().date()}  →  {df.index.max().date()}")

# 2. ------------ TRAIN / TEST DATE SPLIT --------------------
# Label slices on a DatetimeIndex include both end dates.
train_df = df.loc["2005-01-01":"2010-12-31"]
test_df = df.loc["2011-01-01":"2014-12-31"]

print("\nSplit sizes:")
print("  train :", train_df.shape)
print("  test  :", test_df.shape)

# 3. ------------ SEQUENCE SPLITTER --------------------------
def split_sequences(data, steps_in, steps_out, target_idx):
    """Slice a 2-D series into sliding input windows and target horizons.

    Window ``i`` uses rows ``i .. i + steps_in - 1`` (all columns) as input
    and the following ``steps_out`` rows of column ``target_idx`` as target.
    Consecutive windows are shifted by one row.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_features)
        Rows are consecutive observations, columns are variables.
    steps_in : int
        Number of rows in each input window.
    steps_out : int
        Number of rows in each target horizon.
    target_idx : int
        Column index of the variable to extract as target.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, steps_in, n_features)
    y : numpy.ndarray, shape (n_windows, steps_out)
        ``n_windows = n_rows - steps_in - steps_out + 1``. When ``data`` is
        too short for a single window, both arrays are returned empty but
        with the trailing dimensions above, so callers that build frames or
        stack results do not have to special-case the empty result.
    """
    data = np.asarray(data)
    n_windows = len(data) - steps_in - steps_out + 1

    if n_windows <= 0:
        # Keep the trailing dimensions so shapes stay consistent downstream.
        empty_X = np.empty((0, steps_in) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0, steps_out), dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[i:i + steps_in, :] for i in range(n_windows)])
    y = np.stack([
        data[i + steps_in:i + steps_in + steps_out, target_idx]
        for i in range(n_windows)
    ])
    return X, y

# 4. ------------ LOOP OVER EVERY STATION --------------------
# Write one .npy and one .csv of raw target horizons per station.
OUT_DIR = Path("Data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Loop-invariant: convert the test frame to a NumPy array once.
test_values = test_df.values

for tgt in STATIONS:
    target_idx = test_df.columns.get_loc(tgt)
    # Only y_test is written out; X_test is produced by the splitter but
    # not saved here.
    X_test, y_test = split_sequences(test_values,
                                     N_STEPS_IN,
                                     N_STEPS_OUT,
                                     target_idx)

    # filenames friendly to Linux: trimmed, lower-case, no spaces
    safe_name = tgt.strip().lower().replace(" ", "_")

    # save .npy
    np.save(OUT_DIR / f"raw_test_y_{safe_name}.npy", y_test)

    # save .csv for easy inspection; one column per forecast step.
    # The horizon in the file name follows N_STEPS_OUT so the name cannot
    # drift from the actual number of columns when the config changes.
    cols = [f"{safe_name}_t+{step}" for step in range(1, N_STEPS_OUT + 1)]
    pd.DataFrame(y_test, columns=cols).to_csv(
        OUT_DIR / f"raw_test_y_{safe_name}_{N_STEPS_OUT}_steps_ahead.csv",
        index=False,
    )

    print(f"Saved {y_test.shape[0]} samples for {tgt.strip()}  →  {safe_name}")

print("\nDone!  All raw test‑set files are in:", OUT_DIR)


Reading Excel file...
Workbook shape: (1830, 10)
Date range     : 2000-06-01  →  2014-09-30

Split sizes:
  train : (732, 10)
  test  : (488, 10)
Saved 449 samples for Inflow  →  inflow
Saved 449 samples for Sundargarh  →  sundargarh
Saved 449 samples for Kurubhata  →  kurubhata
Saved 449 samples for Basantpur  →  basantpur
Saved 449 samples for Ghatora  →  ghatora
Saved 449 samples for Kelo  →  kelo
Saved 449 samples for Paramanpur  →  paramanpur
Saved 449 samples for Simga  →  simga
Saved 449 samples for Rajim  →  rajim

Done!  All raw test‑set files are in: Data


  df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])
