In [2]:
import os, sys, random, csv, json, shutil, subprocess, zipfile
from pathlib import Path
from datetime import datetime
from typing import List

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️  Using device: {device}")

# Uncomment the next line the very first time you run the notebook
!pip install -q transformers tensorboard scikit-learn rasterio

# Clone SITS-BERT only once
if not Path("/kaggle/working/SITS-BERT").exists():
    !git clone -q https://github.com/linlei1214/SITS-BERT.git /kaggle/working/SITS-BERT
sys.path.append("/kaggle/working/SITS-BERT")


🖥️  Using device: cpu
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [3]:
!mkdir -p checkpoints_pretrain

In [None]:
!rm -r data/TimeSen2Crop/raw/TimeSen2Crop
!mkdir -p data/TimeSen2Crop/raw
!wget -O data/TimeSen2Crop/raw/TimeSen2Crop.zip https://rslab.disi.unitn.it/timesen2crop/TimeSen2Crop.zip
!unzip -q data/TimeSen2Crop/raw/TimeSen2Crop.zip -d data/TimeSen2Crop/raw
!rm -r data/TimeSen2Crop/raw/TimeSen2Crop.zip

rm: cannot remove 'data/TimeSen2Crop/raw/TimeSen2Crop': No such file or directory
--2025-06-08 14:59:01--  https://rslab.disi.unitn.it/timesen2crop/TimeSen2Crop.zip
Resolving rslab.disi.unitn.it (rslab.disi.unitn.it)... 193.205.194.91
Connecting to rslab.disi.unitn.it (rslab.disi.unitn.it)|193.205.194.91|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1079952573 (1.0G) [application/zip]
Saving to: ‘data/TimeSen2Crop/raw/TimeSen2Crop.zip’


2025-06-08 15:00:58 (8.83 MB/s) - ‘data/TimeSen2Crop/raw/TimeSen2Crop.zip’ saved [1079952573/1079952573]



In [31]:
BANDS: List[str] = ["B02", "B03", "B04", "B05", "B08", "B11"]  # Sentinel-2 L2A
NBANDS                = len(BANDS)          # 6
MAX_SEQ_LEN           = 24                  # always trim/pad to 24 dates
FEATURES              = NBANDS + 1          # +1 for DOY  ⇒ 7-channel tensors

# Where to put intermediate artefacts
PRETRAIN_MANIFEST_CSV = Path("/kaggle/working/timesen2crop_pretrain.csv")
TMP_NPY_DIR           = Path("/kaggle/working/timesen2crop_npy")
TMP_NPY_DIR.mkdir(exist_ok=True, parents=True)

FLAT_144_CSV = "/kaggle/working/t2c_pretrain_flat.csv"      # legacy (24×6)
FLAT_168_CSV = "/kaggle/working/t2c_pretrain_flat_168.csv"  # new  (24×7)

# Location of the raw TimeSen2Crop folder
T2C_ROOT  = Path("data/TimeSen2Crop/raw/TimeSen2Crop")

In [32]:
FMT = "{:.6f}".format                # fixed-width float for the flat CSVs

def pad_or_crop(arr: np.ndarray, L: int) -> np.ndarray:
    """Trim an array with shape (T, C) to length L or 0-pad it."""
    if arr.shape[0] >= L:
        return arr[:L]
    pad = np.zeros((L - arr.shape[0], arr.shape[1]), dtype=arr.dtype)
    return np.vstack([arr, pad])

def read_doy_vector(dates_path: Path) -> np.ndarray:
    """
    Read <sample>/dates.csv and return a 1-D array of day-of-year values,
    normalised to 0-1. If the file is missing or malformed → empty array.
    """
    if not dates_path.exists():
        return np.array([], dtype=np.float32)

    try:
        dates = pd.read_csv(dates_path)["acquisition_date"].astype(str)
        doy   = dates.map(
            lambda x: datetime.strptime(x, "%Y%m%d").timetuple().tm_yday
        ).astype(np.float32) / 366.0
        return doy.values
    except Exception as err:                            # pragma: no cover
        print(f"⚠️  {dates_path}: {err}")
        return np.array([], dtype=np.float32)

def _rowfmt(arr: np.ndarray) -> list[str]:
    """Flatten & stringify a 2-D array for the flat CSVs."""
    return [FMT(x) for x in arr.flatten()]

In [33]:
def build_manifest(max_percent: int = -1) -> None:
    """
    Walk the TimeSen2Crop tree, convert each sample to a (24, 7) tensor
    (B02–B11 + DOY), save it as .npy, and create a manifest CSV.
    """
    csv_files = [
        f for f in T2C_ROOT.rglob("*.csv")
        if f.name.lower() != "dates.csv"
    ]
    total = len(csv_files)
    if 0 < max_percent <= 100:
        keep = max(1, int(total * max_percent / 100))
        csv_files = csv_files[:keep]
        print(f"⚙️  Processing {keep}/{total} files ({max_percent}%)")
    else:
        print(f"⚙️  Processing all {total} files")

    manifest_rows = []

    for csv_path in tqdm(csv_files, desc="TimeSen2Crop"):
        df = pd.read_csv(csv_path)

        # Drop QA flag if present
        if "Flag" in df.columns:
            df = df.drop(columns=["Flag"])

        # --------------------- spectral matrix ----------------------------
        src_cols = df.columns.tolist()
        col_map  = {c.lstrip("0"): i for i, c in enumerate(src_cols)}

        spec = np.zeros((df.shape[0], NBANDS), np.float32)  # (T, 6)
        for tgt, band in enumerate(BANDS):
            src = col_map.get(band.replace("B0", "B").lstrip("0"), -1)
            if src != -1:
                spec[:, tgt] = np.clip(df.iloc[:, src].values / 1e4, 0, 1)

        # --------------------------- DOY ---------------------------------
        doy = read_doy_vector(csv_path.parent / "dates.csv")
        if doy.size != spec.shape[0]:
            doy = np.zeros(spec.shape[0], np.float32)       # fallback

        spec_doy = np.hstack([spec, doy[:, None]])          # (T, 7)

        # ------ trim/pad to 24 dates & save --------------------------------
        arr = pad_or_crop(spec_doy, MAX_SEQ_LEN).astype(np.float32)
        rel = csv_path.relative_to(T2C_ROOT).with_suffix(".npy")
        out = TMP_NPY_DIR / rel
        out.parent.mkdir(parents=True, exist_ok=True)
        np.save(out, arr)

        manifest_rows.append({"file": str(out), "label": -1, "date": ""})
        os.remove(csv_path)                                 # reclaim space

    pd.DataFrame(manifest_rows).to_csv(PRETRAIN_MANIFEST_CSV, index=False)
    print(f"✔️  Manifest saved to {PRETRAIN_MANIFEST_CSV} "
          f"({len(manifest_rows):,} samples)")

# Kick it off (set max_percent to e.g. 10 for a quick smoke-test)
build_manifest(max_percent=-1)

⚙️  Processing all 1212224 files


TimeSen2Crop: 100%|██████████| 1212224/1212224 [51:25<00:00, 392.87it/s] 


✔️  Manifest saved to /kaggle/working/timesen2crop_pretrain.csv (1,212,224 samples)


In [34]:
print("\n📝  Writing flattened CSVs …")

# 4.1 144 floats (6 bands × 24)
with open(FLAT_144_CSV, "w", newline="") as fout:
    wr = csv.writer(fout)
    for f in tqdm(pd.read_csv(PRETRAIN_MANIFEST_CSV)["file"], desc="→ 144-col"):
        wr.writerow(_rowfmt(np.load(f)[:, :NBANDS]))
print(f"✔️  144-wide CSV   → {FLAT_144_CSV}")

# 4.2 168 floats (6 bands + DOY × 24)
with open(FLAT_168_CSV, "w", newline="") as fout:
    wr = csv.writer(fout)
    for f in tqdm(pd.read_csv(PRETRAIN_MANIFEST_CSV)["file"], desc="→ 168-col"):
        wr.writerow(_rowfmt(np.load(f)))
print(f"✔️  168-wide CSV   → {FLAT_168_CSV}")


📝  Writing flattened CSVs …


→ 144-col: 100%|██████████| 1212224/1212224 [08:06<00:00, 2494.13it/s]


✔️  144-wide CSV   → /kaggle/working/t2c_pretrain_flat.csv


→ 168-col: 100%|██████████| 1212224/1212224 [06:31<00:00, 3099.92it/s]


✔️  168-wide CSV   → /kaggle/working/t2c_pretrain_flat_168.csv


In [35]:
# Update pretraining.py to your CSV/checkpoint paths, bands, etc.
!sed -i "s#dataset_path = .*#dataset_path = '/kaggle/working/t2c_pretrain_flat_168.csv'#"   /kaggle/working/SITS-BERT/code/pretraining.py
!sed -i "s#pretrain_path = .*#pretrain_path = '/kaggle/working/checkpoints_pretrain/'#"  /kaggle/working/SITS-BERT/code/pretraining.py
!sed -i "s#num_features = .*#num_features = 6#"   /kaggle/working/SITS-BERT/code/pretraining.py
!sed -i "s#max_length = .*#max_length = 24#"      /kaggle/working/SITS-BERT/code/pretraining.py
!sed -i "s#batch_size = .*#batch_size = 256#"     /kaggle/working/SITS-BERT/code/pretraining.py



In [37]:
!python /kaggle/working/SITS-BERT/code/pretraining.py

^C
2025-06-05 23:55:37.685347: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749167737.713865     240 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749167737.722193     240 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
