<a href="https://colab.research.google.com/github/Savith-02/notebooks/blob/main/adding_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, RepeatVector
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        print(f"Found {len(physical_devices)} GPU(s):")
        for i, device in enumerate(physical_devices):
            print(f"  GPU {i}: {device.name}")

        try:
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)
            print("GPU memory growth enabled")

            tf.keras.mixed_precision.set_global_policy('mixed_float16')
            print("Mixed precision policy set to mixed_float16")

            return True
        except Exception as e:
            print(f"Error configuring GPU: {str(e)}")
            return False
    else:
        print("No GPU found. Using CPU.")
        return False

is_using_gpu = setup_gpu()

SEED = 42
def set_seeds(seed_value):
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    if tf.config.list_physical_devices('GPU'):
        try:
            os.environ['TF_DETERMINISTIC_OPS'] = '1'
            os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
            tf.config.experimental.enable_op_determinism()
        except Exception as e:
            print(f"Warning: Could not enable deterministic operations in TensorFlow: {e}")

set_seeds(SEED)
print(f"Seed set to {SEED}")

url = 'https://drive.google.com/uc?id=1DsKaiLMxsT-VHKwqm4vvtANCIjadi1-r'
df = pd.read_csv(url)

Found 1 GPU(s):
  GPU 0: /physical_device:GPU:0
GPU memory growth enabled
Mixed precision policy set to mixed_float16
Seed set to 42


In [5]:
def add_kingdom_id(df):
    unique_kingdoms = df['kingdom'].unique()
    kingdom_to_id = {kingdom: i + 1 for i, kingdom in enumerate(unique_kingdoms)}
    df['kingdom_ID'] = df['kingdom'].map(kingdom_to_id)
    return df

df = add_kingdom_id(df)

In [11]:
# Configuration parameters
EPOCHS = 30
BATCH_SIZE = 32
SEQ_LENGTH = 60
PRED_LENGTH = 8


In [30]:

def prepare_data(df):
    end_year, end_month, end_day = map(int, TRAINING_DATE_END.split('-'))
    df = df.dropna(subset=['kingdom'])

    numeric_columns = ['Avg_Temperature','Radiation', 'Rain_Amount',
                      'Wind_Speed', 'Wind_Direction',
                       'latitude', 'longitude', 'Year', 'Month', 'Day']

    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    for col in numeric_columns:
        if col in df.columns and df[col].isna().any():
            if col in ['Year', 'Month', 'Day']:
                df = df.dropna(subset=[col])
            else:
                median_val = df[col].median()
                df[col] = df[col].fillna(median_val)
                print(f"Filled NaN values in {col} with median: {median_val}")

    date_mask = ((df['Year'] < end_year) |
                ((df['Year'] == end_year) & (df['Month'] < end_month)) |
                ((df['Year'] == end_year) & (df['Month'] == end_month) & (df['Day'] <= end_day)))

    df = df[date_mask]
    print(f"Data filtered until date: Year {end_year}, Month {end_month}, Day {end_day}")

    kingdoms = sorted(df['kingdom'].unique())[:NUM_KINGDOMS]
    kingdom_to_id = {kingdom: idx + 1 for idx, kingdom in enumerate(kingdoms)}

    df = df[df['kingdom'].isin(kingdoms)]
    df['kingdom_ID'] = df['kingdom'].map(kingdom_to_id)

    kingdom_info = {}
    for kingdom in kingdoms:
        k_data = df[df['kingdom'] == kingdom].iloc[0]
        kingdom_info[kingdom] = {
            'id': int(k_data['kingdom_ID']),
            'latitude': float(k_data['latitude']),
            'longitude': float(k_data['longitude'])
        }

    print(f"Found {len(kingdoms)} kingdoms in the data")
    return df, kingdoms, kingdom_info

def scale_features(data, features):
    scalers = {}
    scaled_data = data.copy()
    non_scaled_features = ['Year', 'Month', 'Day', 'kingdom_ID']

    for feature in features:
        if feature in non_scaled_features:
            scalers[feature] = None
            continue

        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_data[feature] = scaler.fit_transform(data[feature].values.reshape(-1, 1))
        scalers[feature] = scaler

    return scaled_data, scalers

def create_sequences_optimized(data, input_features, output_features, seq_length=SEQ_LENGTH, pred_length=PRED_LENGTH):
    sequences = []
    targets = []

    for kingdom_id in data['kingdom_ID'].unique():
        kingdom_data = data[data['kingdom_ID'] == kingdom_id].copy()
        kingdom_data.loc[:, 'date_tuple'] = list(zip(kingdom_data['Year'], kingdom_data['Month'], kingdom_data['Day']))
        unique_dates = sorted(kingdom_data['date_tuple'].unique())

        if len(unique_dates) < seq_length + pred_length:
            print(f"Not enough data for kingdom ID {kingdom_id}")
            continue

        date_to_row = {}
        for _, row in kingdom_data.iterrows():
            date_tuple = (row['Year'], row['Month'], row['Day'])
            date_to_row[date_tuple] = row

        for i in range(0, len(unique_dates) - seq_length - pred_length + 1, 8):
            seq_dates = unique_dates[i:i+seq_length]
            target_dates = unique_dates[i+seq_length:i+seq_length+pred_length]

            seq_data = []
            target_data = []

            all_dates_found = True
            for date in seq_dates:
                if date in date_to_row:
                    seq_data.append(date_to_row[date][input_features].values)
                else:
                    all_dates_found = False
                    break

            if not all_dates_found:
                continue

            all_dates_found = True
            for date in target_dates:
                if date in date_to_row:
                    target_data.append(date_to_row[date][output_features].values)
                else:
                    all_dates_found = False
                    break

            if all_dates_found and len(seq_data) == seq_length and len(target_data) == pred_length:
                sequences.append(np.array(seq_data))
                targets.append(np.array(target_data))

    sequences_array = np.array(sequences, dtype=np.float32)
    targets_array = np.array(targets, dtype=np.float32)

    print(f"Created {len(sequences)} sequence-target pairs")
    return sequences_array, targets_array

def build_model(input_shape, output_timesteps, num_features):
    tf.keras.utils.set_random_seed(SEED)

    inputs = Input(shape=input_shape)

    encoded = LSTM(128, return_sequences=False,
                  kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(inputs)
    encoded = Dropout(0.3)(encoded)

    decoded = RepeatVector(output_timesteps)(encoded)
    decoded = LSTM(128, return_sequences=True,
                  kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoded)
    decoded = Dropout(0.3)(decoded)
    decoded = LSTM(64, return_sequences=True,
                  kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoded)

    outputs = Dense(num_features)(decoded)

    model = Model(inputs=inputs, outputs=outputs)

    if is_using_gpu:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='mean_squared_error')
    else:
        model.compile(optimizer='adam', loss='mean_squared_error')

    return model

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))


def train_global_model(X_train, y_train, scalers, patience=5):
    print("\n===== Training Global Model for All Kingdoms =====")

    if X_train.size == 0 or y_train.size == 0:
        print("Error: No training sequences could be created. Check your data.")
        return None, []

    print(f"Global training sequences: {X_train.shape}, {y_train.shape}")

    input_shape = (X_train.shape[1], X_train.shape[2])
    output_timesteps = y_train.shape[1]
    output_features = y_train.shape[2]

    model = build_model(input_shape, output_timesteps, output_features)
    model.summary()

    callbacks = []
    if is_using_gpu:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(
                log_dir='./logs',
                histogram_freq=1,
                profile_batch='500,520'
            )
        )

    best_smape = float('inf')
    best_model_weights = None
    patience_counter = 0
    all_metrics = {'train_loss': [], 'mse': [], 'smape': []}

    print("\n===== Starting Training Loop with Early Stopping (SMAPE-based) =====")

    effective_batch_size = BATCH_SIZE * 2 if is_using_gpu else BATCH_SIZE
    print(f"Using batch size: {effective_batch_size}")

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}/{EPOCHS}")

        history = model.fit(
            X_train, y_train,
            epochs=1,
            batch_size=effective_batch_size,
            verbose=1,
            callbacks=callbacks
        )

        train_loss = history.history['loss'][0]
        all_metrics['train_loss'].append(train_loss)

        train_preds = model.predict(X_train, verbose=0)
        train_mse, train_smape_val, _, _ = evaluate_predictions(y_train, train_preds, scalers)
        all_metrics['mse'].append(train_mse)
        all_metrics['smape'].append(train_smape_val)

        print(f"Train Loss: {train_loss:.4f}, MSE: {train_mse:.4f}, SMAPE: {train_smape_val:.2f}%")

        if train_smape_val < best_smape:
            best_smape = train_smape_val
            best_model_weights = model.get_weights()
            patience_counter = 0
            print(f"SMAPE improved to {best_smape:.2f}%")
        else:
            patience_counter += 1
            print(f"SMAPE did not improve. Patience: {patience_counter}/{patience}")

            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break

    if best_model_weights is not None:
        model.set_weights(best_model_weights)
        print(f"Restored model weights from best epoch with SMAPE: {best_smape:.2f}%")

    try:
        model.save("kingdom_weather_model.h5")
        print("Model saved to kingdom_weather_model.h5")
    except Exception as e:
        print(f"Could not save model: {str(e)}")
        try:
            model.save_weights("kingdom_weather_model_weights.h5")
            print("Model weights saved to kingdom_weather_model_weights.h5")
        except Exception as e2:
            print(f"Could not save model weights either: {str(e2)}")

    train_preds = model.predict(X_train, verbose=0)
    train_mse, train_smape_val, _, _ = evaluate_predictions(y_train, train_preds, scalers)
    print(f"Final Train MSE: {train_mse:.4f}, SMAPE: {train_smape_val:.2f}%")

    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.plot(all_metrics['train_loss'], 'b-', label='Training Loss')
    best_epoch = len(all_metrics['smape']) - patience_counter - 1
    plt.axvline(x=best_epoch, color='r', linestyle='--', label='Best Model')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.subplot(1, 3, 2)
    plt.plot(all_metrics['mse'], 'g-', label='MSE')
    plt.axvline(x=best_epoch, color='r', linestyle='--', label='Best Model')
    plt.title('Model MSE')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.subplot(1, 3, 3)
    plt.plot(all_metrics['smape'], 'm-', label='SMAPE')
    plt.axvline(x=best_epoch, color='r', linestyle='--', label='Best Model')
    plt.title('Model SMAPE')
    plt.xlabel('Epoch')
    plt.ylabel('SMAPE (%)')
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.tight_layout()
    plt.savefig('training_curves.png')
    plt.close()

    return model, all_metrics


def evaluate_predictions(y_true, y_pred, scalers):
    if y_true.size == 0 or y_pred.size == 0:
        return float('nan'), float('nan'), None, None

    y_pred_unscaled = np.zeros_like(y_pred)
    y_true_unscaled = np.zeros_like(y_true)

    for i, feature in enumerate(OUTPUT_FEATURES):
        scaler = scalers[feature]
        if scaler is not None:
            y_pred_unscaled[:, :, i] = scaler.inverse_transform(y_pred[:, :, i])
            y_true_unscaled[:, :, i] = scaler.inverse_transform(y_true[:, :, i])
        else:
            y_pred_unscaled[:, :, i] = y_pred[:, :, i]
            y_true_unscaled[:, :, i] = y_true[:, :, i]

    mse = mean_squared_error(y_true_unscaled.flatten(), y_pred_unscaled.flatten())
    smape_val = smape(y_true_unscaled.flatten(), y_pred_unscaled.flatten())

    return mse, smape_val, y_true_unscaled, y_pred_unscaled


In [23]:
# New function to create lag features and rolling statistics
def add_engineered_features(df, windows=[3, 7, 14], lags=[1, 3, 7]):
    """
    Add lag features and rolling statistics to the dataframe.

    Parameters:
        df: DataFrame containing weather data
        windows: List of window sizes for rolling statistics
        lags: List of lag values to create

    Returns:
        DataFrame with added features and a list of new feature names
    """
    # Sort by kingdom and date to ensure correct lag calculation
    df = df.sort_values(by=['kingdom_ID', 'Year', 'Month', 'Day']).copy()

    # Group by kingdom for feature engineering
    grouped = df.groupby('kingdom_ID')

    # Features to apply engineering to
    base_features = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']
    new_features = []

    # Add lag features
    for lag in lags:
        for feature in base_features:
            lag_col = f'{feature}_lag_{lag}'
            df[lag_col] = grouped[feature].shift(lag)
            new_features.append(lag_col)

    # Add rolling statistics
    for window in windows:
        # Rolling mean
        for feature in base_features:
            mean_col = f'{feature}_rollmean_{window}'
            df[mean_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).mean())
            new_features.append(mean_col)

        # Rolling standard deviation (measure of volatility)
        for feature in base_features:
            std_col = f'{feature}_rollstd_{window}'
            df[std_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).std())
            new_features.append(std_col)

        # Rolling min and max
        for feature in base_features:
            min_col = f'{feature}_rollmin_{window}'
            max_col = f'{feature}_rollmax_{window}'
            df[min_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).min())
            df[max_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).max())
            new_features.append(min_col)
            new_features.append(max_col)

    # Add derived features (differences, ratios, etc.)

    # Temperature change rate
    df['Temp_change_rate'] = grouped['Avg_Temperature'].diff()
    new_features.append('Temp_change_rate')

    # Add cyclical encoding for month and day
    # These features capture the cyclical nature of time
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    new_features.extend(['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos'])

    # Fill NaN values (created by lag/rolling operations at the beginning of sequences)
    for feature in new_features:
        if feature in df.columns:
            df[feature] = df[feature].fillna(method='bfill')
            # If still NaN (at the end of the sequence), use forward fill
            df[feature] = df[feature].fillna(method='ffill')
            # If still NaN (isolated kingdoms), use median
            df[feature] = df[feature].fillna(df[feature].median())

    return df, new_features


# Function to evaluate different feature combinations
def evaluate_feature_combinations(data, kingdoms, feature_sets, num_epochs=10, test_kingdoms=None):
    """
    Evaluate different combinations of features to find the best performing set.

    Parameters:
        data: Original DataFrame
        kingdoms: List of all kingdoms
        feature_sets: List of lists of features to try
        num_epochs: Number of epochs to train each model
        test_kingdoms: Optional subset of kingdoms to test on (for faster iteration)

    Returns:
        DataFrame with results for each feature combination
    """
    # If no test_kingdoms provided, use a subset
    if test_kingdoms is None:
        test_kingdoms = kingdoms[:5]  # Use first 5 kingdoms for quick testing

    print(f"Testing with {len(test_kingdoms)} kingdoms: {test_kingdoms}")

    results = []

    for i, feature_set in enumerate(feature_sets):
        print(f"\n--- Testing Feature Set {i+1}/{len(feature_sets)} ---")
        print(f"Features: {feature_set}")

        # Prepare data with the current feature set
        test_data = data[data['kingdom'].isin(test_kingdoms)].copy()

        # Update input features with the current feature set
        current_input_features = ['Year', 'Month', 'Day', 'kingdom_ID', 'latitude', 'longitude'] + feature_set

        # Scale features
        scaled_data, scalers = scale_features(test_data, current_input_features)

        # Create sequences
        X_train, y_train = create_sequences_optimized(scaled_data, current_input_features, OUTPUT_FEATURES)

        if X_train.size == 0 or y_train.size == 0:
            print("Error: No training sequences could be created for this feature set. Skipping.")
            continue

        # Build and train a model with these features
        input_shape = (X_train.shape[1], X_train.shape[2])
        output_timesteps = y_train.shape[1]
        output_features = y_train.shape[2]

        model = build_model(input_shape, output_timesteps, output_features)

        # Train for fewer epochs
        best_smape = float('inf')
        all_metrics = {'train_loss': [], 'smape': []}

        effective_batch_size = BATCH_SIZE * 2 if is_using_gpu else BATCH_SIZE

        for epoch in range(num_epochs):
            print(f"Epoch {epoch+1}/{num_epochs}")

            history = model.fit(
                X_train, y_train,
                epochs=1,
                batch_size=effective_batch_size,
                verbose=1
            )

            train_loss = history.history['loss'][0]
            all_metrics['train_loss'].append(train_loss)

            train_preds = model.predict(X_train, verbose=0)
            _, train_smape_val, _, _ = evaluate_predictions(y_train, train_preds, scalers)
            all_metrics['smape'].append(train_smape_val)

            print(f"Train Loss: {train_loss:.4f}, SMAPE: {train_smape_val:.2f}%")

            if train_smape_val < best_smape:
                best_smape = train_smape_val

        # Record results
        results.append({
            'Feature_Set': i + 1,
            'Features': feature_set,
            'Num_Features': len(feature_set),
            'Best_SMAPE': best_smape,
            'Final_Loss': train_loss
        })

        print(f"Feature Set {i+1} - Best SMAPE: {best_smape:.2f}%")

    # Compile results and return
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Best_SMAPE')

    print("\n--- Feature Combination Results ---")
    print(results_df)

    # Save results to CSV
    results_df.to_csv('feature_combination_results.csv', index=False)

    return results_df



In [24]:

TRAINING_DATE_END = ('8-7-31')
SUBMISSION_DATE_START = ('9-1-1')
SUMMISSION_DATE_END = ('9-5-31')

NUM_KINGDOMS = 30

# Features for global model (includes spatial information)
GLOBAL_INPUT_FEATURES = [
    'Year', 'Month', 'Day', 'Avg_Temperature',
    'Radiation', 'Rain_Amount', 'Wind_Speed',
    'Wind_Direction', 'kingdom_ID', 'latitude', 'longitude'
]

# Output features
OUTPUT_FEATURES = [
    'Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction'
]

In [26]:
# Setup GPU if available
if tf.config.list_physical_devices('GPU'):
    print("Using GPU")
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("Using CPU")

# Prepare data and model

# Main code to run feature engineering and evaluation

# 1. Add engineered features to the dataset
print("Adding engineered features to the dataset...")
engineered_data, new_features = add_engineered_features(df)
data, kingdoms, kingdom_info = prepare_data(df)

scaled_data, scalers = scale_features(data, GLOBAL_INPUT_FEATURES)


Using GPU
Adding engineered features to the dataset...


  df[feature] = df[feature].fillna(method='bfill')
  df[feature] = df[feature].fillna(method='ffill')


Data filtered until date: Year 8, Month 7, Day 31
Found 30 kingdoms in the data


In [27]:
X_train, y_train = create_sequences_optimized(scaled_data, GLOBAL_INPUT_FEATURES, OUTPUT_FEATURES)

Created 9810 sequence-target pairs


In [28]:


# Show what new features were created
print(f"Created {len(new_features)} new features:")
for i, feature in enumerate(new_features):
    print(f"  {i+1}. {feature}")

# 2. Define feature sets to test
# Start with base features and progressively add more complex ones
base_features = OUTPUT_FEATURES  # Original features
lag_features = [f for f in new_features if 'lag' in f]
roll_mean_features = [f for f in new_features if 'rollmean' in f]
roll_std_features = [f for f in new_features if 'rollstd' in f]
roll_minmax_features = [f for f in new_features if 'rollmin' in f or 'rollmax' in f]
derived_features = ['Temp_change_rate']
cyclical_features = ['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos']

Created 80 new features:
  1. Avg_Temperature_lag_1
  2. Radiation_lag_1
  3. Rain_Amount_lag_1
  4. Wind_Speed_lag_1
  5. Wind_Direction_lag_1
  6. Avg_Temperature_lag_3
  7. Radiation_lag_3
  8. Rain_Amount_lag_3
  9. Wind_Speed_lag_3
  10. Wind_Direction_lag_3
  11. Avg_Temperature_lag_7
  12. Radiation_lag_7
  13. Rain_Amount_lag_7
  14. Wind_Speed_lag_7
  15. Wind_Direction_lag_7
  16. Avg_Temperature_rollmean_3
  17. Radiation_rollmean_3
  18. Rain_Amount_rollmean_3
  19. Wind_Speed_rollmean_3
  20. Wind_Direction_rollmean_3
  21. Avg_Temperature_rollstd_3
  22. Radiation_rollstd_3
  23. Rain_Amount_rollstd_3
  24. Wind_Speed_rollstd_3
  25. Wind_Direction_rollstd_3
  26. Avg_Temperature_rollmin_3
  27. Avg_Temperature_rollmax_3
  28. Radiation_rollmin_3
  29. Radiation_rollmax_3
  30. Rain_Amount_rollmin_3
  31. Rain_Amount_rollmax_3
  32. Wind_Speed_rollmin_3
  33. Wind_Speed_rollmax_3
  34. Wind_Direction_rollmin_3
  35. Wind_Direction_rollmax_3
  36. Avg_Temperature_rollmean_

In [31]:
# Create combinations of feature sets to test
feature_sets = [
    base_features,  # Baseline
    base_features + lag_features,  # Base + lags
    base_features + roll_mean_features,  # Base + rolling means
    base_features + roll_std_features,  # Base + rolling stds
    base_features + roll_minmax_features,  # Base + rolling min/max
    base_features + cyclical_features,  # Base + cyclical time features
    base_features + lag_features + roll_mean_features,  # Base + lags + means
    base_features + lag_features + cyclical_features,  # Base + lags + cyclical
    base_features + roll_mean_features + cyclical_features,  # Base + means + cyclical
    base_features + lag_features + roll_mean_features + cyclical_features,  # Complex combination
]

# 3. Test on a subset of kingdoms for faster iteration
test_kingdoms = kingdoms[:10]  # Use first 10 kingdoms
feature_results = evaluate_feature_combinations(
    engineered_data,
    kingdoms,
    feature_sets,
    num_epochs=10,  # Reduced epochs for faster testing
    test_kingdoms=test_kingdoms
)

Testing with 10 kingdoms: ['Arcadia', 'Atlantis', 'Avalon', 'Camelot', 'Dorne', 'Eden', 'El Dorado', 'Elysium', 'Emerald City', 'Helios']

--- Testing Feature Set 1/10 ---
Features: ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']
Created 3460 sequence-target pairs
Epoch 1/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 0.0890
Train Loss: 0.0520, SMAPE: 40.75%
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0252
Train Loss: 0.0236, SMAPE: 37.92%
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0200
Train Loss: 0.0195, SMAPE: 37.37%
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0179
Train Loss: 0.0178, SMAPE: 36.10%
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0174
Train Loss: 0.0175, SMAPE: 37.48%
Epoch 6/10
[1m55/55[0m [32m━━━

In [33]:
# 4. Train final model with best feature set
best_feature_set_idx = feature_results.iloc[0]['Feature_Set'] - 1
best_features = feature_sets[best_feature_set_idx]

print(f"\n--- Training Final Model with Best Feature Set ---")
print(f"Best features: {best_features}")

# Update global input features
FINAL_INPUT_FEATURES = ['Year', 'Month', 'Day', 'kingdom_ID', 'latitude', 'longitude'] + best_features

# Prepare data with best features
data, kingdoms, kingdom_info = prepare_data(engineered_data)
scaled_data, scalers = scale_features(data, FINAL_INPUT_FEATURES)
X_train, y_train = create_sequences_optimized(scaled_data, FINAL_INPUT_FEATURES, OUTPUT_FEATURES)

# Train the final model with full dataset and original parameters
model, history = train_global_model(X_train, y_train, scalers)

# Save the optimal feature set for future reference
with open('best_features.txt', 'w') as f:
    f.write("Best Feature Set:\n")
    for feature in best_features:
        f.write(f"{feature}\n")

print("Feature engineering and model training completed.")


--- Training Final Model with Best Feature Set ---
Best features: ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction', 'Avg_Temperature_rollmin_3', 'Avg_Temperature_rollmax_3', 'Radiation_rollmin_3', 'Radiation_rollmax_3', 'Rain_Amount_rollmin_3', 'Rain_Amount_rollmax_3', 'Wind_Speed_rollmin_3', 'Wind_Speed_rollmax_3', 'Wind_Direction_rollmin_3', 'Wind_Direction_rollmax_3', 'Avg_Temperature_rollmin_7', 'Avg_Temperature_rollmax_7', 'Radiation_rollmin_7', 'Radiation_rollmax_7', 'Rain_Amount_rollmin_7', 'Rain_Amount_rollmax_7', 'Wind_Speed_rollmin_7', 'Wind_Speed_rollmax_7', 'Wind_Direction_rollmin_7', 'Wind_Direction_rollmax_7', 'Avg_Temperature_rollmin_14', 'Avg_Temperature_rollmax_14', 'Radiation_rollmin_14', 'Radiation_rollmax_14', 'Rain_Amount_rollmin_14', 'Rain_Amount_rollmax_14', 'Wind_Speed_rollmin_14', 'Wind_Speed_rollmax_14', 'Wind_Direction_rollmin_14', 'Wind_Direction_rollmax_14']
Data filtered until date: Year 8, Month 7, Day 31
Found 30 kingdoms i


===== Starting Training Loop with Early Stopping (SMAPE-based) =====
Using batch size: 64
Epoch 1/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 0.0561
Train Loss: 0.0314, MSE: 981.4008, SMAPE: 38.84%
SMAPE improved to 38.84%
Epoch 2/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.0174
Train Loss: 0.0168, MSE: 912.5231, SMAPE: 38.53%
SMAPE improved to 38.53%
Epoch 3/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.0154
Train Loss: 0.0147, MSE: 848.6556, SMAPE: 35.66%
SMAPE improved to 35.66%
Epoch 4/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 0.0134
Train Loss: 0.0131, MSE: 809.1466, SMAPE: 34.93%
SMAPE improved to 34.93%
Epoch 5/30
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.0128
Train Loss: 0.0125, MSE: 815.6398, SMAPE: 34.21%
SMAPE improved to 34.21%
Epoch 6/30
[1m154/154[0m [



Train Loss: 0.0093, MSE: 569.1257, SMAPE: 31.85%
SMAPE did not improve. Patience: 5/5
Early stopping triggered after 25 epochs
Restored model weights from best epoch with SMAPE: 31.83%
Model saved to kingdom_weather_model.h5
Final Train MSE: 634.4258, SMAPE: 31.83%
Feature engineering and model training completed.


In [35]:







def recursive_forecast(model, data, kingdom, start_date, num_predictions, kingdom_info, scalers, seq_length=SEQ_LENGTH, pred_length=None):
    print(f"\n===== Starting Recursive Forecast for {kingdom} =====")
    print(f"Requested {num_predictions} total days of predictions")

    k_id = kingdom_info[kingdom]['id']
    lat = kingdom_info[kingdom]['latitude']
    lon = kingdom_info[kingdom]['longitude']

    kingdom_data = data[data['kingdom'] == kingdom].sort_values(by=['Year', 'Month', 'Day'])
    print(f"Found {len(kingdom_data)} records for {kingdom}")

    if len(kingdom_data) == 0:
        print(f"Error: No data found for kingdom '{kingdom}' (ID={k_id})")
        return pd.DataFrame()

    min_date = (int(kingdom_data['Year'].min()), int(kingdom_data['Month'].min()), int(kingdom_data['Day'].min()))
    max_date = (int(kingdom_data['Year'].max()), int(kingdom_data['Month'].max()), int(kingdom_data['Day'].max()))
    print(f"Date range for {kingdom}: {min_date} to {max_date}")
    print(f"Attempting to start predictions from: {start_date}")

    start_year, start_month, start_day = start_date

    filtered_data = kingdom_data[
        (kingdom_data['Year'] == start_year) &
        (kingdom_data['Month'] == start_month) &
        (kingdom_data['Day'] == start_day)
    ]

    if len(filtered_data) == 0:
        print(f"Error: Start date {start_date} not found for {kingdom}. Checking nearby dates...")

        dates_in_month = kingdom_data[
            (kingdom_data['Year'] == start_year) &
            (kingdom_data['Month'] == start_month)
        ]

        if len(dates_in_month) > 0:
            closest_day = dates_in_month['Day'].iloc[0]
            for day in dates_in_month['Day']:
                if abs(day - start_day) < abs(closest_day - start_day):
                    closest_day = day

            print(f"Found closest day: {start_year}-{start_month}-{closest_day}")
            start_day = closest_day
            start_date = (start_year, start_month, start_day)

            filtered_data = kingdom_data[
                (kingdom_data['Year'] == start_year) &
                (kingdom_data['Month'] == start_month) &
                (kingdom_data['Day'] == start_day)
            ]
        else:
            print(f"No dates found in month {start_year}-{start_month} for {kingdom}")
            return pd.DataFrame()

    if len(filtered_data) == 0:
        print(f"Error: Still could not find suitable start date for {kingdom}")
        return pd.DataFrame()

    start_idx = filtered_data.index[0]
    all_indices = kingdom_data.index.tolist()

    try:
        position = all_indices.index(start_idx)
    except ValueError:
        print(f"Error: Could not find start index in kingdom data indices")
        return pd.DataFrame()

    if position < seq_length - 1:
        print(f"Warning: Not enough history before {start_date} for {kingdom}. Using available data.")
        initial_data = kingdom_data.iloc[:position+1]
    else:
        initial_data = kingdom_data.iloc[position-seq_length+1:position+1]

    print(f"Initial sequence has {len(initial_data)} days of data")

    if len(initial_data) < 1:
        print(f"Error: No data available for {kingdom}. Skipping.")
        return pd.DataFrame()

    results = []
    working_data = initial_data.copy()

    round_idx = 0
    while len(results) < num_predictions:
        round_idx += 1
        print(f"Prediction round {round_idx} for {kingdom}")

        latest_data = working_data.tail(seq_length)

        if len(latest_data) < seq_length:
            print(f"Warning: Using {len(latest_data)} days instead of {seq_length} days for sequence.")

        latest_min_date = (
            int(latest_data['Year'].min()),
            int(latest_data['Month'].min()),
            int(latest_data['Day'].min())
        )
        latest_max_date = (
            int(latest_data['Year'].max()),
            int(latest_data['Month'].max()),
            int(latest_data['Day'].max())
        )
        print(f"  Input sequence covers: {latest_min_date} to {latest_max_date}")

        X_seq = []
        for _, row in latest_data.iterrows():
            scaled_row = row.copy()

            for feature in GLOBAL_INPUT_FEATURES:
                if feature not in ['Year', 'Month', 'Day', 'kingdom_ID']:
                    if scalers[feature] is not None:
                        scaled_row[feature] = scalers[feature].transform([[row[feature]]])[0][0]

            X_seq.append(scaled_row[GLOBAL_INPUT_FEATURES].values)

        X_seq = np.array([X_seq], dtype=np.float32)
        print(f"  Input sequence shape: {X_seq.shape}")

        y_pred = model.predict(X_seq, verbose=0)
        print(f"  Prediction output shape: {y_pred.shape}")

        model_pred_length = y_pred.shape[1]
        if pred_length is None:
            pred_length = model_pred_length
            print(f"  Using model's output length: {pred_length} days")

        y_pred_unscaled = np.zeros_like(y_pred)
        for i, feature in enumerate(OUTPUT_FEATURES):
            scaler = scalers[feature]
            if scaler is not None:
                y_pred_unscaled[:, :, i] = scaler.inverse_transform(y_pred[:, :, i])
            else:
                y_pred_unscaled[:, :, i] = y_pred[:, :, i]

        last_date = (
            working_data.iloc[-1]['Year'],
            working_data.iloc[-1]['Month'],
            working_data.iloc[-1]['Day']
        )
        print(f"  Last date in working data: {last_date}")

        pred_dates = []
        current_year, current_month, current_day = last_date

        for day_idx in range(model_pred_length):
            current_day += 1

            days_in_month = pd.Timestamp(year=int(current_year), month=int(current_month), day=1).days_in_month
            if current_day > days_in_month:
                current_day = 1
                current_month += 1
                if current_month > 12:
                    current_month = 1
                    current_year += 1

            pred_dates.append((current_year, current_month, current_day))

        print(f"  Generated {len(pred_dates)} prediction dates from {pred_dates[0]} to {pred_dates[-1]}")

        remaining_preds = num_predictions - len(results)
        print(f"  Remaining predictions needed: {remaining_preds}")

        will_crop = (remaining_preds < model_pred_length)
        if will_crop:
            print(f"  Will use only the first {remaining_preds} of {model_pred_length} predictions from this batch")

        use_count = min(model_pred_length, remaining_preds)

        for i in range(use_count):
            date_tuple = pred_dates[i]
            year, month, day = date_tuple

            new_row = {
                'kingdom': kingdom,
                'kingdom_ID': k_id,
                'Year': int(year),
                'Month': int(month),
                'Day': int(day),
                'latitude': lat,
                'longitude': lon
            }

            for j, feature in enumerate(OUTPUT_FEATURES):
                new_row[feature] = y_pred_unscaled[0, i, j]

            results.append(new_row)
            working_data = pd.concat([working_data, pd.DataFrame([new_row])], ignore_index=True)

        print(f"  Added {use_count} new predictions")
        print(f"  Total predictions so far: {len(results)} / {num_predictions}")

        if len(results) >= num_predictions:
            print(f"  Reached target number of predictions ({num_predictions}). Stopping.")
            break

    results_df = pd.DataFrame(results)
    print(f"Generated {len(results_df)} predictions for {kingdom}")

    if len(results_df) > num_predictions:
        print(f"Warning: Generated {len(results_df)} predictions, but only {num_predictions} were requested.")
        print(f"Trimming to exact requested number.")
        results_df = results_df.head(num_predictions)
    elif len(results_df) < num_predictions:
        print(f"Warning: Only generated {len(results_df)} predictions, but {num_predictions} were requested.")

    if len(results_df) > 0:
        min_pred_date = (
            results_df['Year'].min(),
            results_df['Month'].min(),
            results_df['Day'].min()
        )
        max_pred_date = (
            results_df['Year'].max(),
            results_df['Month'].max(),
            results_df['Day'].max()
        )
        print(f"Predictions cover {min_pred_date} to {max_pred_date}")

    return results_df


# model, history = train_global_model(X_train, y_train, scalers)

submission_mode = False

if model is None:
    print("Error: Model training failed. Exiting.")
else:
    submission_start = tuple(map(int, str(SUBMISSION_DATE_START).split('-')))
    submission_end = tuple(map(int, str(SUMMISSION_DATE_END).split('-')))

    if isinstance(TRAINING_DATE_END, str):
        training_end = tuple(map(int, TRAINING_DATE_END.split('-')))
    else:
        training_end = TRAINING_DATE_END

    if submission_mode:
        start_date = submission_start
        end_date = submission_end
        print(f"Submission mode enabled: Predicting from {start_date} to {end_date}")
    else:
        year, month, day = training_end
        days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        day += 1
        if day > days_in_month[month]:
            day = 1
            month += 1
            if month > 12:
                month = 1
                year += 1

        start_date = (year, month, day)

        year, month, day = submission_start
        day -= 1
        if day == 0:
            month -= 1
            if month == 0:
                month = 12
                year -= 1
            day = days_in_month[month]

        end_date = (year, month, day)
        print(f"Testing mode: Predicting from {start_date} to {end_date}")

    def date_to_days(date_tuple):
        year, month, day = date_tuple
        days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        days = day
        for m in range(1, month):
            days += days_in_month[m]
        days += 365 * year

        return days

    start_days = date_to_days(start_date)
    end_days = date_to_days(end_date)
    num_days = end_days - start_days + 1

    print(f"Predicting {num_days} days from {start_date} to {end_date}")
    all_predictions = []

    for kingdom in kingdoms:
        if submission_mode:
            last_date = training_end
        else:
            year, month, day = start_date
            day -= 1
            if day == 0:
                month -= 1
                if month == 0:
                    month = 12
                    year -= 1
                days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
                day = days_in_month[month]

            last_date = (year, month, day)

        print(f"Using latest data point for {kingdom}: Year {last_date[0]}, Month {last_date[1]}, Day {last_date[2]}")

        kingdom_predictions = recursive_forecast(
            model, data, kingdom, last_date, num_days, kingdom_info, scalers)

        if not kingdom_predictions.empty:
            start_condition = (
                (kingdom_predictions['Year'] > start_date[0]) |
                ((kingdom_predictions['Year'] == start_date[0]) &
                 (kingdom_predictions['Month'] > start_date[1])) |
                ((kingdom_predictions['Year'] == start_date[0]) &
                 (kingdom_predictions['Month'] == start_date[1]) &
                 (kingdom_predictions['Day'] >= start_date[2]))
            )

            end_condition = (
                (kingdom_predictions['Year'] < end_date[0]) |
                ((kingdom_predictions['Year'] == end_date[0]) &
                 (kingdom_predictions['Month'] < end_date[1])) |
                ((kingdom_predictions['Year'] == end_date[0]) &
                 (kingdom_predictions['Month'] == end_date[1]) &
                 (kingdom_predictions['Day'] <= end_date[2]))
            )

            filtered_predictions = kingdom_predictions[start_condition & end_condition]
            print(f"Filtered from {len(kingdom_predictions)} to {len(filtered_predictions)} predictions for {kingdom}")
            all_predictions.append(filtered_predictions)
        else:
            print(f"No predictions generated for {kingdom}")

    if all_predictions:
        final_predictions = pd.concat(all_predictions, ignore_index=True)

        output_predictions_path = "kingdom_weather_predictions_submission.csv" if submission_mode else "kingdom_weather_predictions_testing.csv"
        print(f"Saving predictions to {output_predictions_path}")
        final_predictions.to_csv(output_predictions_path, index=False)
        print("Prediction process completed!")
    else:
        print("No predictions were generated for any kingdom.")

# Evaluation phase
if submission_mode:
    predictions_file = 'kingdom_weather_predictions_submission.csv'
    mode_title = "Submission"
else:
    predictions_file = 'kingdom_weather_predictions_testing.csv'
    mode_title = "Testing"

try:
    predictions = pd.read_csv(predictions_file)
    print(f"Loaded {len(predictions)} prediction records")
except FileNotFoundError:
    print(f"Error: {predictions_file} not found.")
    exit(1)

historical_data = df

if submission_mode:
    start_year, start_month, start_day = map(int, SUBMISSION_DATE_START.split('-'))
    end_year, end_month, end_day = map(int, SUMMISSION_DATE_END.split('-'))
else:
    training_end = tuple(map(int, TRAINING_DATE_END.split('-')))
    submission_start = tuple(map(int, SUBMISSION_DATE_START.split('-')))

    year, month, day = training_end
    days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    day += 1
    if day > days_in_month[month]:
        day = 1
        month += 1
        if month > 12:
            month = 1
            year += 1
    start_year, start_month, start_day = year, month, day

    year, month, day = submission_start
    day -= 1
    if day == 0:
        month -= 1
        if month == 0:
            month = 12
            year -= 1
        day = days_in_month[month]
    end_year, end_month, end_day = year, month, day

print(f"Analyzing prediction period: {start_year}-{start_month}-{start_day} to {end_year}-{end_month}-{end_day}")

historical_data['date_num'] = historical_data['Year'] * 10000 + historical_data['Month'] * 100 + historical_data['Day']
predictions['date_num'] = predictions['Year'] * 10000 + predictions['Month'] * 100 + predictions['Day']

start_date_num = start_year * 10000 + start_month * 100 + start_day
end_date_num = end_year * 10000 + end_month * 100 + end_day

actual_in_pred_period = historical_data[
    (historical_data['date_num'] >= start_date_num) &
    (historical_data['date_num'] <= end_date_num)
]

if len(actual_in_pred_period) == 0:
    print("Warning: No actual data available for the prediction period.")
    print("Cannot calculate performance metrics without actual data.")
else:
    print(f"Found {len(actual_in_pred_period)} actual data points in the prediction period.")
    print("\nCalculating overall metrics for all parameters and ALL kingdoms...")
    metrics_data = []

    all_kingdoms = sorted(predictions['kingdom'].unique())
    print(f"Calculating metrics for all {len(all_kingdoms)} kingdoms")

    for kingdom in all_kingdoms:
        kingdom_pred = predictions[predictions['kingdom'] == kingdom].sort_values('date_num')
        kingdom_actual = actual_in_pred_period[actual_in_pred_period['kingdom'] == kingdom].sort_values('date_num')

        if len(kingdom_actual) > 0:
            for param in OUTPUT_FEATURES:
                merged_data = pd.merge(
                    kingdom_actual[['date_num', param]],
                    kingdom_pred[['date_num', param]],
                    on='date_num',
                    suffixes=('_actual', '_pred')
                )

                if len(merged_data) > 0:
                    rmse = np.sqrt(mean_squared_error(
                        merged_data[f'{param}_actual'],
                        merged_data[f'{param}_pred']
                    ))

                    y_true = merged_data[f'{param}_actual'].values
                    y_pred = merged_data[f'{param}_pred'].values
                    smape_value = smape(y_true, y_pred)

                    metrics_data.append({
                        'Kingdom': kingdom,
                        'Parameter': param,
                        'RMSE': rmse,
                        'SMAPE': smape_value,
                        'Points': len(merged_data)
                    })

    if metrics_data:
        metrics_df = pd.DataFrame(metrics_data)
        metrics_file = f'prediction_metrics_all_{mode_title.lower()}.csv'
        metrics_df.to_csv(metrics_file, index=False)
        print(f"Detailed metrics saved to {metrics_file}")

        plt.figure(figsize=(12, 8))
        plt.axis('off')

        summary = metrics_df.groupby('Parameter').agg({
            'RMSE': ['mean', 'min', 'max', 'std'],
            'SMAPE': ['mean', 'min', 'max', 'std'],
            'Points': 'sum'
        })

        summary.columns = [f'{col[0]}_{col[1]}' for col in summary.columns]
        summary = summary.reset_index()

        table_data = []
        table_data.append(['Parameter', 'Avg RMSE', 'Min RMSE', 'Max RMSE', 'Std RMSE',
                          'Avg SMAPE(%)', 'Min SMAPE(%)', 'Max SMAPE(%)', 'Std SMAPE(%)', 'Data Points'])

        for _, row in summary.iterrows():
            table_data.append([
                row['Parameter'],
                f"{row['RMSE_mean']:.2f}",
                f"{row['RMSE_min']:.2f}",
                f"{row['RMSE_max']:.2f}",
                f"{row['RMSE_std']:.2f}",
                f"{row['SMAPE_mean']:.2f}%",
                f"{row['SMAPE_min']:.2f}%",
                f"{row['SMAPE_max']:.2f}%",
                f"{row['SMAPE_std']:.2f}%",
                f"{int(row['Points_sum'])}"
            ])

        table = plt.table(
            cellText=table_data,
            colWidths=[0.12, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09],
            loc='center',
            cellLoc='center'
        )

        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)

        plt.title(f"Prediction Performance Metrics - ALL Kingdoms - {mode_title} Period", fontsize=16)
        plt.tight_layout()

        summary_file = f'prediction_metrics_summary_{mode_title.lower()}.png'
        plt.savefig(summary_file, bbox_inches='tight', dpi=300)
        plt.close()
        print(f"Summary metrics visualization saved to {summary_file}")

        overall_rmse = metrics_df['RMSE'].mean()
        overall_smape = metrics_df['SMAPE'].mean()

        with open(f'overall_metrics_{mode_title.lower()}.txt', 'w') as f:
            f.write(f"Overall Metrics for {mode_title} Period\n")
            f.write(f"Total Kingdoms: {len(all_kingdoms)}\n")
            f.write(f"Total Weather Parameters: {len(metrics_df['Parameter'].unique())}\n")
            f.write(f"Total Data Points: {metrics_df['Points'].sum()}\n")
            f.write(f"Overall Average RMSE: {overall_rmse:.4f}\n")
            f.write(f"Overall Average SMAPE: {overall_smape:.4f}%\n")

        print(f"Overall RMSE across all parameters: {overall_rmse:.4f}")
        print(f"Overall SMAPE across all parameters: {overall_smape:.4f}%")
    else:
        print("No metrics data available to create summary table.")

print("Performance evaluation complete!")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Prediction output shape: (1, 8, 5)
  Last date in working data: (np.int64(8), np.int64(9), np.int64(9))
  Generated 8 prediction dates from (np.int64(8), np.int64(9), np.int64(10)) to (np.int64(8), np.int64(9), np.int64(17))
  Remaining predictions needed: 113
  Added 8 new predictions
  Total predictions so far: 48 / 153
Prediction round 7 for Dorne
  Input sequence covers: (8, 7, 1) to (8, 9, 31)
  Input sequence shape: (1, 60, 11)
  Prediction output shape: (1, 8, 5)
  Last date in working data: (np.int64(8), np.int64(9), np.int64(17))
  Generated 8 prediction dates from (np.int64(8), np.int64(9), np.int64(18)) to (np.int64(8), np.int64(9), np.int64(25))
  Remaining predictions needed: 105
  Added 8 new predictions
  Total predictions so far: 56 / 153
Prediction round 8 for Dorne
  Input sequence covers: (8, 7, 1) to (8, 9, 31)
  Input sequence shape: (1, 60, 11)
  Prediction output shape: (1, 8, 5)
  Last date in wo

In [36]:
# Post-processing functions
try:
    predictions = pd.read_csv('kingdom_weather_predictions_testing.csv')
    results = predictions.sort_values(by=['Month', 'Day', 'kingdom_ID'])

    # Check negative values
    for col in results.columns:
        if pd.api.types.is_numeric_dtype(results[col]):
            negative_count = (results[col] < 0).sum()
            print(f"Column '{col}': {negative_count} negative values")

    # Make negative Rain_Amount values equal to 0
    results['Rain_Amount'] = results['Rain_Amount'].apply(lambda x: 0 if x < 0 else x)

    # Count zero values
    for col in results.columns:
        if pd.api.types.is_numeric_dtype(results[col]):
            zero_count = (results[col] == 0).sum()
            print(f"Column '{col}': {zero_count} zero values")

    results.to_csv('results_v2_updated.csv', index=False)
except FileNotFoundError:
    print("Warning: 'kingdom_weather_predictions_year.csv' not found. Skipping post-processing.")

Column 'kingdom_ID': 0 negative values
Column 'Year': 0 negative values
Column 'Month': 0 negative values
Column 'Day': 0 negative values
Column 'latitude': 0 negative values
Column 'longitude': 4590 negative values
Column 'Avg_Temperature': 0 negative values
Column 'Radiation': 350 negative values
Column 'Rain_Amount': 1402 negative values
Column 'Wind_Speed': 2 negative values
Column 'Wind_Direction': 7 negative values
Column 'kingdom_ID': 0 zero values
Column 'Year': 0 zero values
Column 'Month': 0 zero values
Column 'Day': 0 zero values
Column 'latitude': 0 zero values
Column 'longitude': 0 zero values
Column 'Avg_Temperature': 0 zero values
Column 'Radiation': 0 zero values
Column 'Rain_Amount': 1402 zero values
Column 'Wind_Speed': 0 zero values
Column 'Wind_Direction': 0 zero values
