In [None]:
"""
LLM-NID Forecasting · Minimal Reproducible Core
================================================
This script implements the *public* trunk of the infectious-disease forecasting pipeline.
It fine-tunes a LoRA-adapted Qwen-2.5-3B model on a single (disease, outcome) time series and
evaluates it on a chronological 60/20/20 train/validation/test split. Update the path variables
and hyperparameters to suit your environment.
"""

import os, random, json, re
from copy import deepcopy
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
from peft import LoraConfig, get_peft_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# -------- 1. Configuration & Hyperparameters --------
# Seed applied to the Python, NumPy and PyTorch random generators in main().
SEED             = 3407
# Model location; the QWEN_PATH environment variable overrides the local default directory.
PRETRAINED_MODEL = os.getenv("QWEN_PATH", "./qwen-2_5-3b")  # Absolute path or Hugging Face repo
DATA_FILE        = "./china_nid_2009_2025.xlsx"              # Excel with columns: 指标 (measure), 日期 (date), then one column per disease
# First CUDA device when one is available, otherwise the CPU.
DEVICE           = "cuda:0" if torch.cuda.is_available() else "cpu"
MAX_EPOCHS       = 20    # Upper bound on epochs; main() stops earlier when validation MSE stalls
BATCH_SIZE       = 16    # Batch size used for the train, validation and test loaders
LEARNING_RATE    = 5e-5  # Learning rate handed to AdamW in main()
LOSS_ALPHA       = 10.0  # Per-sample loss weight is 1 + LOSS_ALPHA * target (see TimeSeriesLoRAModel.forward)

# -------- 2. Utility Functions --------
# Matches a four-digit year and a one- or two-digit month written as 'YYYY年M月'.
DATE_PATTERN = re.compile(r"(\d{4})年(\d{1,2})月")

def extract_date(text: str):
    """Return ``(year, month)`` as ints from the first 'YYYY年M月' found in ``text``.

    Raises:
        ValueError: If ``text`` contains no such date.
    """
    found = DATE_PATTERN.search(text)
    if found is None:
        raise ValueError(f"No date found in '{text}'")
    year_txt, month_txt = found.groups()
    return int(year_txt), int(month_txt)

def excel_date_to_text(raw):
    """Convert a raw Excel date cell to the text form 'YYYY年M月'.

    Args:
        raw: Either a serial day number (Python or NumPy int/float) or any
            value ``pandas.to_datetime`` can parse (``datetime``,
            ``Timestamp``, date string, ...).

    Returns:
        The year and the non-zero-padded month as ``f"{year}年{month}月"``.
    """
    # NumPy scalars such as np.int64 / np.float32 are not subclasses of the
    # built-in int / float. Without the explicit NumPy check they would fall
    # through to pd.to_datetime and be read as nanoseconds since 1970 rather
    # than as serial day numbers.
    if isinstance(raw, (int, float, np.integer, np.floating)):
        # Serial numbers are interpreted as days elapsed since 1899-12-30.
        base = datetime(1899, 12, 30)
        real = base + timedelta(days=float(raw))
    else:
        real = pd.to_datetime(raw)
    return f"{real.year}年{real.month}月"

# -------- 3. Dataset Preparation --------
def build_dataset(xlsx_path):
    """Flatten the wide Excel sheet into one record per (row, disease column).

    Every column from the third onward is treated as a disease. Each record
    carries an ``instruction`` string (date text + disease column name +
    stripped 指标 value) and a float ``output``; missing cells become 0.0.
    An ``output_log`` column holding ``log1p(output)`` is appended.

    Args:
        xlsx_path: Path handed to ``pandas.read_excel``.

    Returns:
        DataFrame with columns ``instruction``, ``output`` and ``output_log``.
    """
    sheet = pd.read_excel(xlsx_path)
    disease_names = sheet.columns[2:]

    rows = []
    for _, record in sheet.iterrows():
        indicator = str(record["指标"]).strip()
        period = excel_date_to_text(record["日期"])
        for name in disease_names:
            cell = record[name]
            if pd.isna(cell):
                amount = 0.0
            else:
                amount = float(cell)
            rows.append({"instruction": f"{period}{name}{indicator}", "output": amount})

    frame = pd.DataFrame(rows)
    frame["output_log"] = np.log1p(frame.output)
    return frame

# -------- 4. PyTorch Dataset & Collate Function --------
class TSRegressionDataset(Dataset):
    """Map-style dataset pairing tokenised instruction text with a float target.

    Each element of ``data_tuples`` is read as ``(target, text)``. All texts
    are tokenised once in the constructor (truncation and padding enabled,
    capped at ``max_length``); ``__getitem__`` only indexes the cached result.
    """

    def __init__(self, data_tuples, tokenizer, max_length=128):
        self.targets = [pair[0] for pair in data_tuples]
        self.texts = [pair[1] for pair in data_tuples]
        self.enc = tokenizer(
            self.texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        # One tensor per tokenizer output field, then the target and raw text.
        sample = {}
        for field, values in self.enc.items():
            sample[field] = torch.tensor(values[idx])
        sample["targets"] = torch.tensor(self.targets[idx], dtype=torch.float)
        sample["text"] = self.texts[idx]
        return sample

def collate_batch(batch):
    """Merge per-sample dicts into one batch dict.

    ``input_ids``, ``attention_mask`` and ``targets`` are stacked along a new
    leading dimension; the raw strings are gathered into the list ``texts``.
    """
    def _stack(field):
        return torch.stack([sample[field] for sample in batch])

    return {
        "input_ids": _stack("input_ids"),
        "attention_mask": _stack("attention_mask"),
        "targets": _stack("targets"),
        "texts": [sample["text"] for sample in batch],
    }

# -------- 5. Model Definition: LoRA-Qwen + Time Embeddings --------
def load_lora_qwen_model():
    """Load the backbone named by ``PRETRAINED_MODEL`` and wrap it with LoRA adapters.

    The backbone is placed on ``DEVICE`` (half precision on CUDA, full
    precision on CPU) and LoRA adapters are attached to the attention and
    MLP projection modules listed in ``target_modules``.

    Returns:
        The PEFT-wrapped model produced by ``get_peft_model``.
    """
    on_gpu = DEVICE.startswith("cuda")
    base_model = AutoModel.from_pretrained(
        PRETRAINED_MODEL,
        # fp16 weights only make sense on a GPU; fall back to fp32 on CPU.
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        # Follow the configured DEVICE instead of hard-coding GPU index 0,
        # which fails on CPU-only hosts.
        device_map={"": DEVICE},
    )
    # NOTE: get_peft_model() marks only the adapter weights as trainable, so a
    # manual requires_grad selection made on the base model *before* wrapping
    # (as this function used to do for embed_tokens / layers.30 / layers.31)
    # has no lasting effect and has been dropped. To train extra base modules,
    # list them in LoraConfig(modules_to_save=...) instead.
    peft_conf = LoraConfig(
        r=8,
        lora_alpha=64,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "up_proj"],
        lora_dropout=0.1,
        bias="none",
        # The backbone is a headless AutoModel used for its hidden states, so
        # the feature-extraction wrapper is the matching task type; the
        # sequence-classification wrapper forwards a `labels` kwarg that a
        # headless model does not declare.
        task_type="FEATURE_EXTRACTION",
    )
    return get_peft_model(base_model, peft_conf)

class TimeSeriesLoRAModel(nn.Module):
    """LoRA backbone plus learned year/month embeddings feeding a regression head.

    The attention-masked mean of the backbone's last hidden state is
    concatenated with a year embedding and a month embedding (both parsed
    from the instruction text) and mapped to a single scalar per sample.
    """

    def __init__(self):
        super().__init__()
        self.backbone = load_lora_qwen_model()
        hidden = self.backbone.config.hidden_size
        # Years are embedded by their offset from year0, so the table covers
        # year0 .. year0 + n_year - 1.
        self.year0, self.n_year = 2004, 40
        embed_dim = 128
        self.year_emb = nn.Embedding(self.n_year, embed_dim)
        self.month_emb = nn.Embedding(12, embed_dim)
        self.dropout = nn.Dropout(0.4)
        self.regressor = nn.Sequential(
            nn.Linear(hidden + 2 * embed_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden, 1),
        )
        # Unreduced so that forward() can apply per-sample weights.
        self.loss_fn = nn.MSELoss(reduction="none")

    def forward(self, input_ids, attention_mask, targets=None, texts=None):
        """Predict one value per sample and optionally compute the weighted loss.

        Args:
            input_ids: Token ids passed to the backbone
                (assumed (batch, seq) - TODO confirm against the tokenizer output).
            attention_mask: Mask with the same leading shape as ``input_ids``;
                used both by the backbone and for mean pooling.
            targets: Optional per-sample regression targets. When given, the
                loss is the mean of ``(1 + LOSS_ALPHA * target) * squared error``.
            texts: Instruction strings, one per sample, each containing a
                'YYYY年M月' date. Required despite the ``None`` default: the
                regression head is sized for the date embeddings.

        Returns:
            ``(preds, loss)`` where ``loss`` is ``None`` when ``targets`` is ``None``.

        Raises:
            ValueError: If ``texts`` is ``None``, a text carries no date, or a
                parsed year/month falls outside the embedding tables.
        """
        if texts is None:
            raise ValueError(
                "texts is required: the regression head consumes the year/month "
                "embeddings parsed from each instruction string"
            )

        # Parse and range-check the dates on the host before the expensive
        # backbone pass; an out-of-range index inside nn.Embedding on CUDA
        # surfaces only as an opaque device-side assert.
        years, months = zip(*(extract_date(t) for t in texts))
        year_idx = [y - self.year0 for y in years]
        month_idx = [m - 1 for m in months]
        if any(not 0 <= i < self.n_year for i in year_idx):
            raise ValueError(
                f"year outside supported range {self.year0}-{self.year0 + self.n_year - 1}: "
                f"{sorted(set(years))}"
            )
        if any(not 0 <= i < self.month_emb.num_embeddings for i in month_idx):
            raise ValueError(f"month outside 1-12: {sorted(set(months))}")

        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # Mean over non-padded positions; the clamp guards an all-zero mask.
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (out.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

        # Build the indices on the embeddings' own device rather than the
        # module-level DEVICE so the model keeps working wherever it is moved.
        emb_device = self.year_emb.weight.device
        year_t = torch.tensor(year_idx, dtype=torch.long, device=emb_device)
        month_t = torch.tensor(month_idx, dtype=torch.long, device=emb_device)
        ext = torch.cat([self.year_emb(year_t), self.month_emb(month_t)], dim=1)
        pooled = torch.cat([pooled, ext], dim=1)

        preds = self.regressor(self.dropout(pooled)).squeeze(-1)
        loss = None
        if targets is not None:
            # Larger targets get proportionally larger weight.
            weights = 1.0 + LOSS_ALPHA * targets
            raw = self.loss_fn(preds, targets)
            loss = (weights * raw).mean()
        return preds, loss

# -------- 6. Training and Evaluation Utilities --------
def train_one_epoch(model, dataloader, optimizer, scaler, scheduler):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is forwarded under CUDA autocast (enabled only when ``DEVICE``
    is a CUDA device), back-propagated through the gradient ``scaler``, and
    followed by one optimizer step and one scheduler step.
    """
    model.train()
    use_amp = DEVICE.startswith("cuda")
    batch_losses = []
    for batch in tqdm(dataloader, desc="Training", leave=False):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            model_inputs = {
                "input_ids": batch["input_ids"].to(DEVICE),
                "attention_mask": batch["attention_mask"].to(DEVICE),
                "targets": batch["targets"].to(DEVICE),
                "texts": batch["texts"],
            }
            _, loss = model(**model_inputs)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        batch_losses.append(loss.item())
    return sum(batch_losses, 0.0) / len(dataloader)

def evaluate_model(model, dataloader, data_scaler):
    """Score ``model`` on ``dataloader`` in the original (un-logged) value scale.

    Scaled targets and predictions are mapped back through
    ``data_scaler.inverse_transform`` and ``expm1``; predictions are clipped
    at zero before the metrics are computed.

    Returns:
        ``(metrics, actual, predicted, texts)`` where ``metrics`` holds the
        keys ``"mse"`` and ``"mae"``.
    """
    def _undo_scaling(scaled_values):
        # Scaler expects a 2-D column; hand back a flat vector in log1p space.
        column = np.array(scaled_values)[:, None]
        return data_scaler.inverse_transform(column).flatten()

    model.eval()
    seen_targets, seen_preds, seen_texts = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            ids = batch["input_ids"].to(DEVICE)
            attn = batch["attention_mask"].to(DEVICE)
            batch_preds, _ = model(input_ids=ids, attention_mask=attn, texts=batch["texts"])
            seen_targets.extend(batch["targets"].cpu().numpy())
            seen_preds.extend(batch_preds.cpu().numpy())
            seen_texts.extend(batch["texts"])

    target_log = _undo_scaling(seen_targets)
    pred_log = _undo_scaling(seen_preds)
    actual = np.expm1(target_log)
    predicted = np.expm1(pred_log).clip(min=0)
    metrics = {
        "mse": mean_squared_error(actual, predicted),
        "mae": mean_absolute_error(actual, predicted),
    }
    return metrics, actual, predicted, seen_texts

# -------- 7. Single Series Demo --------
def main():
    """Train and evaluate the model on a single (disease, outcome) series.

    Steps: seed the RNGs, build the long-format dataset, keep the rows whose
    instruction contains both ``DISEASE`` and ``OUTCOME``, sort them by date,
    split 60/20/20 in that order, fit the MinMax scaler on the training part
    only, train with early stopping on validation MSE (patience 3), then
    report test metrics with the best checkpoint.

    Raises:
        ValueError: If the selected series has 10 rows or fewer.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL, trust_remote_code=True)

    df_all = build_dataset(DATA_FILE)
    # >>> Customize this section for your disease and measure <<<
    DISEASE = "Influenza"
    OUTCOME = "发病数"
    # regex=False: names are matched literally, so names containing regex
    # metacharacters such as parentheses cannot break or widen the filter.
    instructions = df_all.instruction
    wanted = instructions.str.contains(DISEASE, regex=False) & instructions.str.contains(OUTCOME, regex=False)
    subset = df_all[wanted]
    subset = subset.sort_values(by="instruction", key=lambda s: s.map(extract_date))
    # Explicit raise instead of assert: assertions vanish under `python -O`.
    if len(subset) <= 10:
        raise ValueError("Insufficient data for selected series")

    N = len(subset)
    n_train = int(0.6 * N)
    n_val   = int(0.2 * N)
    train_df, val_df, test_df = subset[:n_train], subset[n_train:n_train+n_val], subset[n_train+n_val:]

    # Build data loaders; the scaler sees training data only to avoid leakage.
    scaler = MinMaxScaler().fit(train_df.output_log.values.reshape(-1,1))
    def mk_loader(df):
        scaled = scaler.transform(df.output_log.values.reshape(-1,1)).flatten()
        data = list(zip(scaled, df.instruction.tolist()))
        ds = TSRegressionDataset(data, tokenizer)
        # Only the training loader is shuffled.
        return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=(df is train_df), collate_fn=collate_batch)
    dl_train, dl_val, dl_test = mk_loader(train_df), mk_loader(val_df), mk_loader(test_df)

    model = TimeSeriesLoRAModel().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.05)
    # A single monthly series yields only a handful of batches per epoch, so a
    # fixed 200-step warm-up can exceed the whole run and the learning rate
    # would never reach its peak or decay. Cap warm-up at 10% of the run;
    # runs of 2000+ steps keep the original 200.
    total_steps = len(dl_train) * MAX_EPOCHS
    warmup_steps = min(200, total_steps // 10)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    scaler_amp = torch.cuda.amp.GradScaler(enabled=DEVICE.startswith("cuda"))

    # best_state starts as None so a run whose validation MSE never improves
    # (e.g. NaN from the first epoch) does not hit an unbound name below.
    best_mse, patience, best_state = float('inf'), 0, None
    for epoch in range(MAX_EPOCHS):
        loss = train_one_epoch(model, dl_train, optimizer, scaler_amp, scheduler)
        metrics_val, _, _, _ = evaluate_model(model, dl_val, scaler)
        print(f"Epoch {epoch+1}/{MAX_EPOCHS} - train_loss={loss:.4f}, val_mse={metrics_val['mse']:.4f}")
        if metrics_val['mse'] < best_mse:
            best_mse, patience = metrics_val['mse'], 0
            best_state = deepcopy(model.state_dict())
        else:
            patience += 1
            if patience >= 3:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    metrics_test, _, _, _ = evaluate_model(model, dl_test, scaler)
    print("Test metrics:", metrics_test)

if __name__ == "__main__":
    main()
