In [1]:
from pathlib import Path

import joblib
import numpy as np
import xarray as xr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Open the combined weather NetCDF file and print its structure
# (dimensions, coordinates, data variables) as a quick sanity check.
file_path = "Dataset/v4_final_dataset/final_weather_data.nc"
dataset = xr.open_dataset(file_path)
print(dataset)

<xarray.Dataset> Size: 458MB
Dimensions:    (time: 35064, latitude: 11, longitude: 33)
Coordinates:
    number     int64 8B ...
  * time       (time) datetime64[ns] 281kB 2021-01-01 ... 2024-12-31T23:00:00
    surface    float64 8B ...
  * latitude   (latitude) float64 88B 49.0 48.75 48.5 48.25 ... 47.0 46.75 46.5
  * longitude  (longitude) float64 264B 9.5 9.75 10.0 10.25 ... 17.0 17.25 17.5
Data variables:
    d2m        (time, latitude, longitude) float32 51MB ...
    t2m        (time, latitude, longitude) float32 51MB ...
    sp         (time, latitude, longitude) float32 51MB ...
    u10        (time, latitude, longitude) float32 51MB ...
    v10        (time, latitude, longitude) float32 51MB ...
    tcc        (time, latitude, longitude) float32 51MB ...
    cp         (time, latitude, longitude) float32 51MB ...
    lsp        (time, latitude, longitude) float32 51MB ...
    tp         (time, latitude, longitude) float32 51MB ...
Attributes:
    GRIB_edition:            1
    G

In [2]:
# Variables to extract; their order defines the last (feature) axis of data_array.
variables = ['d2m', 't2m', 'sp', 'u10', 'v10', 'tcc', 'cp', 'lsp', 'tp']

# Stack the per-variable grids along a new trailing axis.
data_array = np.stack(
    [dataset[var].values for var in variables],
    axis=-1,
)

# Record the dimensions used by the reshapes further down.
n_samples = data_array.shape[0]
n_lat = dataset.latitude.size
n_lon = dataset.longitude.size
n_vars = len(variables)
print(f"Raw data shape: {data_array.shape}")  #(35064, 11, 33, 9)



Raw data shape: (35064, 11, 33, 9)


In [4]:
# Clean all variables.
# The unit conversions (sp, precipitation) read the raw values from `dataset`
# instead of from `data_array`, so re-running this cell does not apply the
# division / multiplication a second time. The temperature replacement and the
# cloud-cover clip are already safe to repeat.
temp_idx = variables.index('t2m')
sp_idx = variables.index('sp')
precip_vars = ['tp', 'cp', 'lsp']
precip_indices = [variables.index(var) for var in precip_vars]
cloud_idx = variables.index('tcc')

# Clean temperature: values <= 0 are replaced with 250.0
data_array[..., temp_idx] = np.where(data_array[..., temp_idx] <= 0, 250.0, data_array[..., temp_idx])
print(f"Cleaned t2m min/max: {np.min(data_array[..., temp_idx]):.2f}K / {np.max(data_array[..., temp_idx]):.2f}K")

# Clean surface pressure (convert Pa to hPa), always starting from the raw values
data_array[..., sp_idx] = dataset['sp'].values / 100
print(f"Adjusted sp min/max: {np.min(data_array[..., sp_idx]):.2f} hPa / {np.max(data_array[..., sp_idx]):.2f} hPa")

# Clean precipitation (convert m to mm and set negative to 0), from the raw values
for precip_var, idx in zip(precip_vars, precip_indices):
    raw_precip = dataset[precip_var].values
    data_array[..., idx] = np.where(raw_precip < 0, 0.0, raw_precip * 1000)  # Convert m to mm
print(f"Cleaned tp min/max: {np.min(data_array[..., precip_indices[0]]):.2f} mm / {np.max(data_array[..., precip_indices[0]]):.2f} mm")

# Clean cloud cover (clip to [0, 1])
data_array[..., cloud_idx] = np.clip(data_array[..., cloud_idx], 0.0, 1.0)
print(f"Cleaned tcc min/max: {np.min(data_array[..., cloud_idx]):.2f} / {np.max(data_array[..., cloud_idx]):.2f}")

Cleaned t2m min/max: 242.87K / 310.65K
Adjusted sp min/max: 727.33 hPa / 1030.46 hPa
Cleaned tp min/max: 0.00 mm / 20.46 mm
Cleaned tcc min/max: 0.00 / 1.00


In [5]:
# Reshape for single scaler: (n_samples * n_lat * n_lon, n_vars)
data_flat = data_array.reshape(-1, n_vars)  # Shape: (n_samples * 11 * 33, 9)
print(f"Flattened data shape: {data_flat.shape}")  # Should be (35064 * 11 * 33, 9)

# Fit the scaler on the training period only, then apply it to every row.
# Fitting on the full array would let validation/test minima and maxima leak
# into the normalisation. Rows are time-major after the reshape, so the first
# n_train_rows rows are exactly the first int(TRAIN_FRACTION * n_samples) steps.
# As a consequence, validation/test values may fall slightly outside [0, 1].
TRAIN_FRACTION = 0.7  # must match the 0.7 train ratio used when splitting
n_train_rows = int(TRAIN_FRACTION * n_samples) * n_lat * n_lon
scaler = MinMaxScaler()
scaler.fit(data_flat[:n_train_rows])
normalized_data_flat = scaler.transform(data_flat)
print(f"Scaler n_features_in_: {scaler.n_features_in_}")  # Should be 9
print(f"Scaler data_min_: {scaler.data_min_}")  # Min per variable over the training rows
print(f"Scaler data_max_: {scaler.data_max_}")  # Max per variable over the training rows

# Reshape back to original
normalized_data = normalized_data_flat.reshape(n_samples, n_lat, n_lon, n_vars)
print(f"Normalized data shape: {normalized_data.shape}")  # Should be (35064, 11, 33, 9)

Flattened data shape: (12728232, 9)
Scaler n_features_in_: 9
Scaler data_min_: [230.5844   242.86975  727.32623   -8.344498 -13.136108   0.
   0.         0.         0.      ]
Scaler data_max_: [2.9779272e+02 3.1065259e+02 1.0304587e+03 1.2643265e+01 1.1074310e+01
 1.0000000e+00 1.3957739e+01 1.6366959e+01 2.0457029e+01]
Normalized data shape: (35064, 11, 33, 9)


In [6]:
# Save scaler and data.
# Neither joblib.dump nor np.save creates missing directories, so make sure
# the output folder exists before writing.
Path('data').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, 'data/scaler.pkl')
np.save('data/normalized_data_single_.npy', normalized_data)


In [8]:
# Chronological train / validation / test split along the first (time) axis;
# nothing is shuffled and nothing is written to disk here (adjust ratios as needed).
train_split, val_split = (int(ratio * n_samples) for ratio in (0.7, 0.85))
train_data, val_data, test_data = (
    normalized_data[:train_split],
    normalized_data[train_split:val_split],
    normalized_data[val_split:],
)

# Report the size of each split.
for label, subset in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
    print(f"{label} data shape: {subset.shape}")

Training data shape: (24544, 11, 33, 9)
Validation data shape: (5260, 11, 33, 9)
Test data shape: (5260, 11, 33, 9)


In [9]:
# Generate X, y pairs for model training (the notebook uses 24-step sequences)
def create_sequences(data, seq_length):
    """Slice a sequence-ordered array into (input window, next step) pairs.

    Args:
        data: Array-like whose first axis is the sequence (time) axis.
        seq_length: Number of consecutive steps in each input window.

    Returns:
        Tuple ``(X, y)`` where ``X[i]`` is ``data[i:i + seq_length]`` and
        ``y[i]`` is ``data[i + seq_length]``. With
        ``n = max(len(data) - seq_length, 0)``, ``X`` has shape
        ``(n, seq_length, *data.shape[1:])`` and ``y`` has shape
        ``(n, *data.shape[1:])``; both keep the dtype of ``data``. When the
        input is too short to form any pair, the arrays are empty but still
        carry the correct trailing dimensions.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    data = np.asarray(data)
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    n_sequences = max(len(data) - seq_length, 0)

    # Preallocate the output so each window is copied exactly once and the
    # result has a well-defined shape/dtype even when n_sequences == 0.
    X = np.empty((n_sequences, seq_length) + data.shape[1:], dtype=data.dtype)
    for i in range(n_sequences):
        X[i] = data[i:i + seq_length]  # Input sequence

    # Targets are the steps immediately following each window.
    y = data[seq_length:seq_length + n_sequences].copy()
    return X, y
    
# Build 24-step input windows and next-step targets for each split, in order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    create_sequences(split, 24) for split in (train_data, val_data, test_data)
)

In [10]:
# After splitting the data and creating the sequences

# Step 9: Save Data
# np.save does not create missing directories, so ensure the folder exists.
Path('data').mkdir(parents=True, exist_ok=True)
np.save('data/X_train.npy', X_train)
np.save('data/y_train.npy', y_train)
np.save('data/X_val.npy', X_val)
np.save('data/y_val.npy', y_val)
np.save('data/X_test.npy', X_test)
np.save('data/y_test.npy', y_test)

print("Preprocessing complete. Saved datasets with shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)




Preprocessing complete. Saved datasets with shapes:
X_train: (24520, 24, 11, 33, 9) y_train: (24520, 11, 33, 9)
X_val: (5236, 24, 11, 33, 9) y_val: (5236, 11, 33, 9)
X_test: (5236, 24, 11, 33, 9) y_test: (5236, 11, 33, 9)


In [3]:
# Smaller test set for deployment.
# The saved test array is memory-mapped so that only the trailing slice is read
# from disk instead of loading the whole file into RAM.
# NOTE: X_test is therefore a read-only memmap after this cell.
N_SMALL_SEQUENCES = 318  # last 318 sequences; sized to stay around ~100 MB on disk

X_test = np.load('data/X_test.npy', mmap_mode='r')  # (5236, 24, 11, 33, 9)
X_test_small = np.array(X_test[-N_SMALL_SEQUENCES:])  # materialise the slice as a regular in-memory array
np.save('data/X_test_small.npy', X_test_small)
print(f"X_test_small shape: {X_test_small.shape}")

X_test_small shape: (318, 24, 11, 33, 9)
