In [1]:
import numpy as np
import pandas as pd
import tqdm

In [3]:
# Load the pre-processed table; the preview below shows one row per city and
# month with the point-wise features stored in the "Pointwise_Data" column.
# NOTE(review): the path is absolute and machine-specific, and unpickling can
# execute arbitrary code, so only load this file from a trusted source.
PIXELATED_DB_PATH = r"/home/f20222001/test-venv/UHI/Datasets_processed/pixelated_db_10_kf.pkl"
df = pd.read_pickle(PIXELATED_DB_PATH)
df

Unnamed: 0,Year,Month,Aggregate_Month,City,Pointwise_Data
0,2000,1,1,Delhi,
1,2000,1,1,Hyderabad,LST NDVI NDBI Albedo ...
2,2000,1,1,Mumbai,
3,2000,1,1,Bangalore,
4,2000,1,1,Kolkata,
...,...,...,...,...,...
2995,2024,12,300,Chennai,LST NDVI NDBI Albedo ...
2996,2024,12,300,Pune,LST NDVI NDBI Albedo ...
2997,2024,12,300,Kanpur,LST NDVI NDBI Albedo ...
2998,2024,12,300,Surat,LST NDVI NDBI Albedo ...


In [4]:
# Order the rows chronologically within each city and renumber them 0..n-1.
df = df.sort_values(by=["City", "Year", "Month"]).reset_index(drop=True)

In [5]:
def fill_w_nans(row, num_points, num_features=4):
    """
    Return the row's Pointwise_Data, substituting an all-NaN block when it is missing.

    Parameters:
    - row: mapping/Series with a "Pointwise_Data" entry
    - num_points: number of rows of the placeholder block
    - num_features: number of columns of the placeholder block (default 4,
      the width that was previously hard-coded)

    Returns:
    - a (num_points, num_features) array filled with NaN when the entry is a
      float NaN, otherwise the entry itself, unchanged
    """
    value = row["Pointwise_Data"]
    # A missing entry is stored as a scalar float NaN; anything else is
    # passed through untouched.
    if isinstance(value, float) and np.isnan(value):
        return np.full((num_points, num_features), np.nan)
    return value

# Work on a copy so `df` keeps its original (possibly missing) entries, and
# replace every missing Pointwise_Data entry with a 10000-row NaN block.
filled_df = df.copy()
filled_df["Pointwise_Data"] = filled_df.apply(fill_w_nans, axis=1, args=(10000,))
filled_df

Unnamed: 0,Year,Month,Aggregate_Month,City,Pointwise_Data
0,2000,1,1,Ahmedabad,LST NDVI NDBI Albedo ...
1,2000,2,2,Ahmedabad,LST NDVI NDBI Albedo ...
2,2000,3,3,Ahmedabad,LST NDVI NDBI Albedo ...
3,2000,4,4,Ahmedabad,LST NDVI NDBI Albedo ...
4,2000,5,5,Ahmedabad,LST NDVI NDBI Albedo ...
...,...,...,...,...,...
2995,2024,8,296,Surat,"[[nan, nan, nan, nan], [nan, nan, nan, nan], [..."
2996,2024,9,297,Surat,LST NDVI NDBI Albedo ...
2997,2024,10,298,Surat,LST NDVI NDBI Albedo ...
2998,2024,11,299,Surat,LST NDVI NDBI Albedo ...


In [None]:
def batching(df, city, seq_length, target_col_idx=0, num_points=10000, max_invalid_steps=24):
    """
    Build sliding-window training samples for a single city.

    Each row of `city` becomes a (num_points, num_features) block made of the
    original point-wise features followed by month_sin, month_cos,
    Aggregate_Month and the one-hot city vector (one column per distinct city
    in `df`). Rows are ordered by Aggregate_Month; every run of `seq_length`
    consecutive rows is an input window and the row right after it supplies
    the targets. Windows are built over consecutive *rows*: if a month is
    absent from `df`, the window silently spans the gap.

    Time steps whose LST, NDVI or Albedo value is NaN or outside its valid
    range are marked invalid and their first four feature columns are set to
    NaN in the returned inputs.

    Parameters:
    - df: DataFrame with City, Month, Aggregate_Month, and Pointwise_Data.
      It is not modified. Every Pointwise_Data entry of `city` must be a 2-D
      array-like with `num_points` rows (columns 0, 1, 3 are read as LST,
      NDVI, Albedo).
    - city: target city to extract sequences for
    - seq_length: how many rows (months) of history to use
    - target_col_idx: column of the per-row feature block used as target.
      The target is read before cleaning and is always checked against the
      LST range, whichever column is selected.
    - num_points: number of spatial points per row
    - max_invalid_steps: a sample is dropped when more than this many of its
      time steps are invalid (default 24, the previously hard-coded limit)

    Returns:
    - X: float32 inputs of shape (n_samples, seq_length, num_features)
    - y: float32 targets of shape (n_samples,)
      Samples are ordered by window start, then by point index. When no
      sample qualifies, empty arrays with these trailing shapes are returned.

    Raises:
    - ValueError: if `df` has no row for `city`, or a Pointwise_Data entry is
      not a 2-D block with `num_points` rows (e.g. an unfilled NaN).
    """
    # Validity ranges and column positions inside the point-wise features.
    LST_MIN, LST_MAX = -10, 65
    NDVI_MIN, NDVI_MAX = -1, 1
    ALBEDO_MIN, ALBEDO_MAX = 0, 1
    LST_IDX = 0
    NDVI_IDX = 1
    ALBEDO_IDX = 3

    # 1. One-hot encode over *all* cities so the feature width is the same
    #    for every city; the vector is constant for the selected city.
    onehot = pd.get_dummies(df["City"], prefix="city")
    city_mask = (df["City"] == city).to_numpy()
    if not city_mask.any():
        raise ValueError(f"No rows found for city {city!r}")
    city_vector = onehot.to_numpy(dtype=np.float64)[city_mask][0]

    # 2. Keep only the selected city (nothing is written back to `df`).
    city_df = df.loc[city_mask].sort_values("Aggregate_Month")

    # 3. Append sin/cos month encoding, Aggregate_Month and the one-hot
    #    vector to every point of every row.
    blocks = []
    target_columns = []
    for features, month, agg_month in zip(
        city_df["Pointwise_Data"], city_df["Month"], city_df["Aggregate_Month"]
    ):
        features = np.asarray(features, dtype=np.float64)
        if features.ndim != 2 or features.shape[0] != num_points:
            raise ValueError(
                f"Pointwise_Data for city {city!r}, Aggregate_Month {agg_month} "
                f"must be a 2-D block with {num_points} rows, got shape {features.shape}"
            )
        angle = 2 * np.pi * month / 12
        context = np.concatenate(([np.sin(angle), np.cos(angle), agg_month], city_vector))
        block = np.concatenate(
            [features, np.broadcast_to(context, (num_points, context.size))], axis=1
        )
        # Targets are taken from the raw (uncleaned, full-precision) values.
        target_columns.append(block[:, target_col_idx].copy())
        blocks.append(block.astype(np.float32))

    # 4. Stack into a 3D tensor: (num_months, num_points, num_features)
    stacked = np.stack(blocks)
    target_values = np.stack(target_columns)  # (num_months, num_points)
    num_months, _, num_features = stacked.shape

    # 5. Flag invalid (month, point) cells once and blank their raw features.
    def out_of_range(col, low, high):
        values = stacked[:, :, col]
        return (values < low) | (values > high) | np.isnan(values)

    invalid = (
        out_of_range(LST_IDX, LST_MIN, LST_MAX)
        | out_of_range(NDVI_IDX, NDVI_MIN, NDVI_MAX)
        | out_of_range(ALBEDO_IDX, ALBEDO_MIN, ALBEDO_MAX)
    )
    stacked[invalid, 0:4] = np.nan

    # 6. Slide the window and keep the points that pass both filters.
    X_parts = []
    y_parts = []
    for start in range(num_months - seq_length):
        stop = start + seq_length
        y_window = target_values[stop]
        # NaN targets fail both comparisons and are therefore rejected.
        y_ok = (y_window >= LST_MIN) & (y_window <= LST_MAX)
        x_ok = invalid[start:stop].sum(axis=0) <= max_invalid_steps
        keep = np.flatnonzero(y_ok & x_ok)
        if keep.size == 0:
            continue
        # (seq_length, n_keep, F) -> (n_keep, seq_length, F)
        X_parts.append(stacked[start:stop, keep, :].transpose(1, 0, 2))
        y_parts.append(y_window[keep])

    if not X_parts:
        return (
            np.empty((0, seq_length, num_features), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    X = np.concatenate(X_parts).astype(np.float32)  # shape: (n, seq_len, num_features)
    y = np.concatenate(y_parts).astype(np.float32)  # shape: (n,)
    return X, y


In [None]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Cities to build sequences for, in the order their samples are requested.
# NOTE(review): "Chennai" occurs in the dataset preview and in the
# commented-out city list further down but not here; confirm the omission is
# intentional (e.g. a held-out city).
cities = [
    "Delhi",
    "Hyderabad",
    "Mumbai",
    "Bangalore",
    "Kolkata",
    "Pune",
    "Kanpur",
    "Surat",
    "Ahmedabad",
]

def process_city(city):
    """
    Build the (X, y) samples of one city with a 72-month history window.

    Uses `filled_df` rather than `df`: in `df` the missing Pointwise_Data
    entries are still scalar NaN, which `batching` cannot concatenate with
    the per-point feature columns.
    """
    return batching(filled_df, city, 72)

if __name__ == "__main__":
    # Results are stored by submission index so that `inputs` / `targets`
    # always follow the order of `cities`, whichever worker finishes first.
    # Appending in completion order made the row order differ between runs.
    results = [None] * len(cities)

    with ProcessPoolExecutor() as executor:
        future_to_index = {
            executor.submit(process_city, city): index
            for index, city in enumerate(cities)
        }

        # as_completed is kept so the progress bar advances as soon as any
        # city is done; future.result() re-raises a worker's exception here.
        for future in tqdm(as_completed(future_to_index), total=len(future_to_index), desc="Processing cities"):
            results[future_to_index[future]] = future.result()

    All_x = [x for x, _ in results]
    All_y = [y for _, y in results]

    inputs = np.concatenate(All_x)
    targets = np.concatenate(All_y)


Processing cities: 100%|██████████| 10/10 [01:07<00:00,  6.70s/it]


In [16]:
# cities=["Delhi","Hyderabad", "Mumbai", "Bangalore","Kolkata","Chennai","Pune",\
#         "Kanpur","Surat","Ahmedabad"]

# All_x=[]
# All_y=[]
# for city in cities:
#     x,y=batching(df,city,12)
#     All_x.append(x)
#     All_y.append(y)
# inputs=np.concatenate(All_x)
# targets=np.concatenate(All_y)

In [17]:
# Sanity check of the assembled inputs: (n_samples, seq_length, num_features).
# NOTE(review): the recorded output below shows a sequence length of 36,
# while process_city requests seq_length=72, so the saved output appears to
# come from an earlier run. Re-run the notebook top to bottom to refresh it.
inputs.shape

(738094, 36, 17)

In [None]:
# import pickle

# # Save inputs
# with open("inputs_36_v_clean_10k.pkl", "wb") as f:
#     pickle.dump(inputs, f)

# # Save targets
# with open("targets_36_v_clean_10k.pkl", "wb") as f:
#     pickle.dump(targets, f)
