In [1]:
import numpy as np
import pandas as pd
import tqdm

In [2]:
# Load the pre-processed table; per the display below it has one row per
# (Year, Month, City) with the columns Year, Month, Aggregate_Month, City and
# Pointwise_Data.
# NOTE(review): this is an absolute, machine-specific path, and unpickling can
# execute arbitrary code -- only load this file from a trusted source.
df=pd.read_pickle(r"/home/f20222001/test-venv/UHI/Datasets_processed/pixelated_db_40k.pkl")
df

Unnamed: 0,Year,Month,Aggregate_Month,City,Pointwise_Data
0,2000,1,1,Hyderabad,LST NDVI NDBI Albedo...
1,2000,1,1,Chennai,LST NDVI NDBI Albedo...
2,2000,1,1,Surat,LST NDVI NDBI Albedo...
3,2000,1,1,Ahmedabad,LST NDVI NDBI Albedo...
4,2000,2,2,Delhi,LST NDVI NDBI Albed...
...,...,...,...,...,...
1798,2024,12,300,Chennai,LST NDVI NDBI Albedo...
1799,2024,12,300,Pune,LST NDVI NDBI Albedo...
1800,2024,12,300,Kanpur,LST NDVI NDBI Albedo...
1801,2024,12,300,Surat,LST NDVI NDBI Albedo...


In [3]:
# Discard the rows whose Month is 6 or 9, order what is left by city and then
# chronologically, and renumber the index from zero.
df = (
    df.loc[~df["Month"].isin([6, 9])]
    .sort_values(by=["City", "Year", "Month"])
    .reset_index(drop=True)
)
df

Unnamed: 0,Year,Month,Aggregate_Month,City,Pointwise_Data
0,2000,1,1,Ahmedabad,LST NDVI NDBI Albedo...
1,2000,2,2,Ahmedabad,LST NDVI NDBI Albedo...
2,2000,3,3,Ahmedabad,LST NDVI NDBI Albedo...
3,2000,4,4,Ahmedabad,LST NDVI NDBI Albed...
4,2000,5,5,Ahmedabad,LST NDVI NDBI Albedo...
...,...,...,...,...,...
1601,2024,4,292,Surat,LST NDVI NDBI Albedo...
1602,2024,5,293,Surat,LST NDVI NDBI Albedo...
1603,2024,10,298,Surat,LST NDVI NDBI Albedo...
1604,2024,11,299,Surat,LST NDVI NDBI Albedo...


In [None]:
def batching(df, city, seq_length, target_col_idx=0, num_points=40000, forecast_horizon=12):
    """
    Build sliding-window training samples for a single city.

    Each row of ``df`` holds one month of per-point features for one city. For
    the requested city the monthly blocks are ordered by ``Aggregate_Month``,
    every point's feature vector is extended with
    ``[month_sin, month_cos, Aggregate_Month, one-hot(City)]``, and a window of
    ``seq_length`` consecutive rows (inputs) followed by ``forecast_horizon``
    rows (targets) is slid over the time axis. One sample is produced per
    (window, point) pair that passes the validity filter.

    Parameters:
    - df: DataFrame with City, Month, Aggregate_Month and Pointwise_Data columns.
      Pointwise_Data cells must be array-like of shape (num_points, n_original).
      The range checks below read columns 0, 1 and 3 as LST, NDVI and Albedo.
      ``df`` is NOT modified by this function.
    - city: target city to extract sequences for.
    - seq_length: number of consecutive rows of history per sample.
    - target_col_idx: feature column used as the prediction target.
      NOTE(review): targets are always checked against the LST bounds, whatever
      column is selected here -- confirm this is intended for non-LST targets.
    - num_points: number of spatial points expected in every Pointwise_Data cell.
    - forecast_horizon: number of consecutive rows to predict after the window.

    Returns:
    - X: float32 array of shape (n_samples, seq_length, num_features), where
      num_features = n_original + 3 + number of distinct cities in ``df``.
    - y: float32 array of shape (n_samples, forecast_horizon).
      Samples are ordered by window start, then by point index. When the city
      has too few rows, or no sample survives filtering, both arrays are
      returned with zero rows instead of raising.

    Raises:
    - ValueError: if ``city`` does not occur in ``df`` or a Pointwise_Data cell
      does not have shape (num_points, n_original).
    """
    # Plausibility bounds; any sample with a value outside them is discarded.
    LST_MIN, LST_MAX = -10, 65
    NDVI_MIN, NDVI_MAX = -1, 1
    ALBEDO_MIN, ALBEDO_MAX = 0, 1

    LST_IDX = 0
    NDVI_IDX = 1
    ALBEDO_IDX = 3

    def _within(values, low, high):
        # Element-wise closed-interval test; NaN compares False on both sides.
        return (low <= values) & (values <= high)

    # 1. One-hot encode the city over ALL rows so the feature layout is the
    #    same no matter which city is requested.
    onehot_all = pd.get_dummies(df["City"], prefix="city").to_numpy(dtype=np.float32)

    # 2. Pick this city's rows by position (independent of the index labels)
    #    and order them in time. Only these rows are augmented below; the other
    #    cities never contribute to the result.
    city_rows = np.flatnonzero((df["City"] == city).to_numpy())
    if city_rows.size == 0:
        raise ValueError(f"No rows found for city {city!r}")
    agg_month = df["Aggregate_Month"].to_numpy()[city_rows]
    order = np.argsort(agg_month, kind="stable")
    city_rows = city_rows[order]
    agg_month = agg_month[order]

    # 3. Per-row extra features: cyclic month encoding, aggregate month, one-hot city.
    angle = 2 * np.pi * df["Month"].to_numpy()[city_rows] / 12
    extras = np.column_stack(
        [np.sin(angle), np.cos(angle), agg_month, onehot_all[city_rows]]
    ).astype(np.float32)

    # 4. Stack into a 3D tensor: (num_months, num_points, num_features).
    pointwise = df["Pointwise_Data"].to_numpy()
    frames = []
    for row, extra in zip(city_rows, extras):
        features = np.asarray(pointwise[row], dtype=np.float32)
        if features.ndim != 2 or features.shape[0] != num_points:
            raise ValueError(
                f"Pointwise_Data at row position {row} has shape {features.shape}; "
                f"expected ({num_points}, n_features)"
            )
        repeated = np.broadcast_to(extra, (num_points, extra.size))
        frames.append(np.concatenate([features, repeated], axis=1))
    stacked = np.stack(frames)
    num_months, _, num_features = stacked.shape

    X_parts = []
    y_parts = []

    # 5. Sliding window over time; all points of a window are filtered at once.
    for start in range(num_months - seq_length - forecast_horizon + 1):
        split = start + seq_length
        x_win = stacked[start:split].transpose(1, 0, 2)  # (num_points, seq_length, num_features)
        y_win = stacked[split:split + forecast_horizon, :, target_col_idx].T  # (num_points, forecast_horizon)

        valid = ~np.isnan(x_win).any(axis=(1, 2))
        valid &= ~np.isnan(y_win).any(axis=1)
        valid &= _within(y_win, LST_MIN, LST_MAX).all(axis=1)
        valid &= _within(x_win[:, :, LST_IDX], LST_MIN, LST_MAX).all(axis=1)
        valid &= _within(x_win[:, :, NDVI_IDX], NDVI_MIN, NDVI_MAX).all(axis=1)
        valid &= _within(x_win[:, :, ALBEDO_IDX], ALBEDO_MIN, ALBEDO_MAX).all(axis=1)

        X_parts.append(x_win[valid])
        y_parts.append(y_win[valid])  # the whole forecast_horizon-long target

    if not X_parts:
        # Not enough rows for even one window.
        return (
            np.empty((0, seq_length, num_features), dtype=np.float32),
            np.empty((0, forecast_horizon), dtype=np.float32),
        )

    X = np.concatenate(X_parts)  # shape: (n, seq_length, num_features)
    y = np.concatenate(y_parts)  # shape: (n, forecast_horizon)
    return X, y

In [5]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Cities for which training sequences are generated.
cities = [
    "Delhi",
    "Hyderabad",
    "Mumbai",
    "Bangalore",
    "Kolkata",
    "Chennai",
    "Pune",
    "Kanpur",
    "Surat",
    "Ahmedabad",
]

def process_city(city):
    """Return the (X, y) arrays produced by ``batching`` for one city.

    Single-argument wrapper so the work can be submitted to an executor; it
    reads the module-level ``df`` and uses a history window of 72 rows.
    """
    features, labels = batching(df, city, 72)
    return features, labels

if __name__ == "__main__":
    # Run one task per city in worker processes. Results are stored by the
    # city's position in `cities` rather than appended in completion order, so
    # the row order of `inputs`/`targets` is the same on every run.
    results = [None] * len(cities)

    with ProcessPoolExecutor() as executor:
        futures = {
            executor.submit(process_city, city): idx
            for idx, city in enumerate(cities)
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing cities"):
            # .result() re-raises any exception that occurred in the worker.
            results[futures[future]] = future.result()

    All_x = [x for x, _ in results]
    All_y = [y for _, y in results]

    # Concatenate along the sample axis; row i of `targets` belongs to row i of `inputs`.
    inputs = np.concatenate(All_x)
    targets = np.concatenate(All_y)


Processing cities:   0%|          | 0/10 [00:06<?, ?it/s]Process ForkProcess-64:
Process ForkProcess-62:
Process ForkProcess-11:
Process ForkProcess-26:
Process ForkProcess-60:
Process ForkProcess-54:
Process ForkProcess-20:
Process ForkProcess-49:
Process ForkProcess-44:
Process ForkProcess-61:
Process ForkProcess-57:
Process ForkProcess-33:
Process ForkProcess-52:
Process ForkProcess-51:
Process ForkProcess-56:
Process ForkProcess-55:
Process ForkProcess-34:
Process ForkProcess-46:
Process ForkProcess-39:
Process ForkProcess-63:
Process ForkProcess-47:
Process ForkProcess-50:
Process ForkProcess-53:
Process ForkProcess-43:
Process ForkProcess-48:
Process ForkProcess-42:
Process ForkProcess-36:
Process ForkProcess-40:
Process ForkProcess-59:
Process ForkProcess-12:
Process ForkProcess-38:
Process ForkProcess-45:
Process ForkProcess-29:
Process ForkProcess-21:
Process ForkProcess-28:
Process ForkProcess-19:
Process ForkProcess-32:
Process ForkProcess-37:
Process ForkProcess-31:
Process

KeyboardInterrupt: 

In [None]:
# cities=["Delhi","Hyderabad", "Mumbai", "Bangalore","Kolkata","Chennai","Pune",\
#         "Kanpur","Surat","Ahmedabad"]

# All_x=[]
# All_y=[]
# for city in cities:
#     x,y=batching(df,city,12)
#     All_x.append(x)
#     All_y.append(y)
# inputs=np.concatenate(All_x)
# targets=np.concatenate(All_y)

In [None]:
# Shape of the concatenated model inputs.
# NOTE(review): the stored output below shows a window length of 36, while
# process_city requests 72 -- the output looks stale; re-run to confirm.
inputs.shape

(1127854, 36, 17)

In [None]:
# Shape of the concatenated targets (one row per input sample).
targets.shape

(1127854, 12)

In [None]:
# import pickle

# # Save inputs
# with open("inputs_36_extra_clean_40k_72.pkl", "wb") as f:
#     pickle.dump(inputs, f)

# # Save targets
# with open("targets_36_extra_clean_40k_72.pkl", "wb") as f:
#     pickle.dump(targets, f)
