<a href="https://colab.research.google.com/github/Savith-02/notebooks/blob/main/adding_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, RepeatVector
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

def setup_gpu():
    """Configure every GPU TensorFlow lists and report whether one is usable.

    Enables memory growth on each listed GPU and switches the global Keras
    dtype policy to ``mixed_float16``.

    Returns:
        bool: True when at least one GPU was listed and configuration finished
        without raising; False when no GPU is listed or configuration raised.
    """
    gpus = tf.config.list_physical_devices('GPU')

    # Guard clause: nothing to configure on a CPU-only runtime.
    if not gpus:
        print("No GPU found. Using CPU.")
        return False

    print(f"Found {len(gpus)} GPU(s):")
    for index, gpu in enumerate(gpus):
        print(f"  GPU {index}: {gpu.name}")

    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")

        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        print("Mixed precision policy set to mixed_float16")
    except Exception as e:
        # Best effort: report the problem and let the caller treat the run as "no GPU".
        print(f"Error configuring GPU: {str(e)}")
        return False

    return True


# Remembered for later cells (train_global_model doubles the batch size when this is True).
is_using_gpu = setup_gpu()

SEED = 42


def set_seeds(seed_value):
    """Seed the Python, NumPy and TensorFlow random generators with ``seed_value``.

    When TensorFlow lists a GPU, deterministic op execution is requested as
    well; a failure to enable it is printed as a warning instead of raised.
    """
    # NOTE: hash randomization of the running interpreter is fixed at startup,
    # so this environment variable only matters for processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    # CPU-only runtime: no determinism switches are applied.
    if not tf.config.list_physical_devices('GPU'):
        return

    try:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
        tf.config.experimental.enable_op_determinism()
    except Exception as e:
        print(f"Warning: Could not enable deterministic operations in TensorFlow: {e}")


set_seeds(SEED)
print(f"Seed set to {SEED}")

# Load the source CSV straight from Google Drive into a DataFrame (needs network access).
# `df` is the raw frame that the following cells extend with kingdom_ID and engineered features.
url = 'https://drive.google.com/uc?id=1DsKaiLMxsT-VHKwqm4vvtANCIjadi1-r'
df = pd.read_csv(url)

Found 1 GPU(s):
  GPU 0: /physical_device:GPU:0
GPU memory growth enabled
Mixed precision policy set to mixed_float16
Seed set to 42


In [58]:
def add_kingdom_id(df):
    """Attach a 1-based integer ``kingdom_ID`` column derived from ``kingdom``.

    IDs follow the order in which kingdom names first appear in the frame.
    The column is assigned on the frame that was passed in, and that same
    frame object is returned.
    """
    id_lookup = {}
    for position, name in enumerate(df['kingdom'].unique(), start=1):
        id_lookup[name] = position

    df['kingdom_ID'] = df['kingdom'].map(id_lookup)
    return df

# add_kingdom_id assigns the kingdom_ID column on the frame it receives and returns that frame.
df = add_kingdom_id(df)

In [59]:
# Configuration parameters
# EPOCHS: number of epochs passed to model.fit in train_global_model.
EPOCHS = 55
# BATCH_SIZE: base batch size; train_global_model doubles it when is_using_gpu is True.
BATCH_SIZE = 32
# SEQ_LENGTH: default number of time steps in each model input window.
SEQ_LENGTH = 60
# PRED_LENGTH: default number of time steps in each target window.
PRED_LENGTH = 8


In [66]:

def prepare_data(df, training_date_end=None, num_kingdoms=None):
    """Clean the frame, keep rows up to the training cut-off, and index the kingdoms.

    Steps: drop rows without a kingdom, coerce the known numeric columns,
    drop rows whose Year/Month/Day could not be parsed, median-fill the other
    numeric columns, keep rows dated on or before the cut-off, keep the first
    ``num_kingdoms`` kingdoms in sorted-name order, and rebuild ``kingdom_ID``
    as the 1-based position in that sorted list.

    Args:
        df: DataFrame with at least ``kingdom``, ``Year``, ``Month``, ``Day``,
            ``latitude`` and ``longitude`` columns. It is not modified.
        training_date_end: Inclusive cut-off, either a ``'Year-Month-Day'``
            string or a ``(year, month, day)`` sequence. Defaults to the
            notebook-level ``TRAINING_DATE_END``.
        num_kingdoms: Maximum number of kingdoms to keep. Defaults to the
            notebook-level ``NUM_KINGDOMS``.

    Returns:
        tuple: ``(df, kingdoms, kingdom_info)`` where ``df`` is the filtered
        frame, ``kingdoms`` is the sorted list of kept kingdom names and
        ``kingdom_info`` maps each name to ``{'id', 'latitude', 'longitude'}``
        taken from that kingdom's first remaining row.
    """
    # Resolve notebook-level defaults lazily so explicit arguments never need the globals.
    if training_date_end is None:
        training_date_end = TRAINING_DATE_END
    if num_kingdoms is None:
        num_kingdoms = NUM_KINGDOMS

    if isinstance(training_date_end, str):
        date_parts = training_date_end.split('-')
    else:
        date_parts = training_date_end
    end_year, end_month, end_day = map(int, date_parts)

    # Explicit copy: the column assignments below must not write into a frame
    # pandas still tracks as derived from the caller's data.
    df = df.dropna(subset=['kingdom']).copy()

    numeric_columns = ['Avg_Temperature','Radiation', 'Rain_Amount',
                      'Wind_Speed', 'Wind_Direction',
                       'latitude', 'longitude', 'Year', 'Month', 'Day']
    present_columns = [col for col in numeric_columns if col in df.columns]

    for col in present_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    for col in present_columns:
        if not df[col].isna().any():
            continue
        if col in ('Year', 'Month', 'Day'):
            # A row without a complete date cannot be placed on the timeline.
            df = df.dropna(subset=[col]).copy()
        else:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"Filled NaN values in {col} with median: {median_val}")

    # Keep every row dated on or before (end_year, end_month, end_day).
    date_mask = ((df['Year'] < end_year) |
                ((df['Year'] == end_year) & (df['Month'] < end_month)) |
                ((df['Year'] == end_year) & (df['Month'] == end_month) & (df['Day'] <= end_day)))

    df = df[date_mask]
    print(f"Data filtered until date: Year {end_year}, Month {end_month}, Day {end_day}")

    kingdoms = sorted(df['kingdom'].unique())[:num_kingdoms]
    kingdom_to_id = {kingdom: idx + 1 for idx, kingdom in enumerate(kingdoms)}

    # Copy again before adding a column to the filtered result.
    df = df[df['kingdom'].isin(kingdoms)].copy()
    df['kingdom_ID'] = df['kingdom'].map(kingdom_to_id)

    kingdom_info = {}
    for kingdom in kingdoms:
        k_data = df[df['kingdom'] == kingdom].iloc[0]
        kingdom_info[kingdom] = {
            'id': int(k_data['kingdom_ID']),
            'latitude': float(k_data['latitude']),
            'longitude': float(k_data['longitude'])
        }

    print(f"Found {len(kingdoms)} kingdoms in the data")
    return df, kingdoms, kingdom_info

def scale_features(data, features):
    """Min-max scale the requested columns into the range 0..1 on a copy of ``data``.

    ``Year``, ``Month``, ``Day`` and ``kingdom_ID`` are passed through
    unchanged and get ``None`` as their scaler entry.

    Returns:
        tuple: ``(scaled_data, scalers)`` where ``scalers`` maps every name in
        ``features`` to its fitted ``MinMaxScaler`` (or ``None``).
    """
    passthrough = {'Year', 'Month', 'Day', 'kingdom_ID'}
    result = data.copy()
    fitted = {}

    for name in features:
        if name not in passthrough:
            # Each scaler is fitted on a single column presented as shape (n, 1).
            column_scaler = MinMaxScaler(feature_range=(0, 1))
            column = data[name].values.reshape(-1, 1)
            result[name] = column_scaler.fit_transform(column)
            fitted[name] = column_scaler
        else:
            fitted[name] = None

    return result, fitted

def create_sequences_optimized(data, input_features, output_features, seq_length=SEQ_LENGTH, pred_length=PRED_LENGTH, stride=8):
    """Slice each kingdom's timeline into (input window, target window) pairs.

    For every ``kingdom_ID`` the rows are reduced to one row per
    (Year, Month, Day) - the last row in frame order wins when a date is
    duplicated - and ordered by date. A window of ``seq_length`` rows of
    ``input_features`` is paired with the ``pred_length`` rows of
    ``output_features`` that follow it, advancing ``stride`` rows per pair.

    Args:
        data: DataFrame with ``kingdom_ID``, ``Year``, ``Month``, ``Day`` and
            every listed feature column.
        input_features: list of column names forming each input row.
        output_features: list of column names forming each target row.
        seq_length: rows per input window.
        pred_length: rows per target window.
        stride: rows to advance between consecutive windows. The default of 8
            matches the step that used to be hard-coded here.

    Returns:
        tuple: ``(sequences, targets)`` as float32 arrays; both are empty
        arrays when no kingdom has enough dates.

    Raises:
        ValueError: if ``stride`` is smaller than 1.
    """
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    date_columns = ['Year', 'Month', 'Day']
    sequences = []
    targets = []

    for kingdom_id in data['kingdom_ID'].unique():
        # One row per date, in chronological order. keep='last' reproduces the
        # "later row overwrites earlier row" rule of a date -> row dictionary.
        kingdom_data = (
            data[data['kingdom_ID'] == kingdom_id]
            .drop_duplicates(subset=date_columns, keep='last')
            .sort_values(by=date_columns)
        )
        n_dates = len(kingdom_data)

        if n_dates < seq_length + pred_length:
            print(f"Not enough data for kingdom ID {kingdom_id}")
            continue

        # Convert once per kingdom; windows below are plain array slices.
        input_values = kingdom_data[input_features].to_numpy(dtype=np.float32)
        target_values = kingdom_data[output_features].to_numpy(dtype=np.float32)

        for start in range(0, n_dates - seq_length - pred_length + 1, stride):
            split = start + seq_length
            sequences.append(input_values[start:split])
            targets.append(target_values[split:split + pred_length])

    sequences_array = np.array(sequences, dtype=np.float32)
    targets_array = np.array(targets, dtype=np.float32)

    print(f"Created {len(sequences)} sequence-target pairs")
    return sequences_array, targets_array


def build_model(input_shape, output_timesteps, num_features, lr=0.001):
    """
    Build and compile the LSTM encoder-decoder used for multi-step forecasting.

    Architecture: an LSTM encoder that summarises the input window into one
    vector, a RepeatVector that copies it ``output_timesteps`` times, two
    sequence-returning LSTM layers, and a linear per-time-step Dense head.
    BatchNormalization and Dropout(0.2) follow every LSTM layer.

    Args:
        input_shape: ``(time steps, features)`` of one input window.
        output_timesteps: number of future time steps the model emits.
        num_features: number of values predicted per time step.
        lr: base learning rate; the cosine schedule starts at ``1.5 * lr``.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam, mean squared error).
    """
    tf.keras.utils.set_random_seed(SEED)

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=input_shape),

        # Encoder LSTM: only the final state is kept (return_sequences=False)
        tf.keras.layers.LSTM(
            160,  # Slightly increased units (was 128)
            return_sequences=False,
            kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
            kernel_regularizer=tf.keras.regularizers.l2(1e-6)  # Very light regularization
        ),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),

        # Repeat vector for output sequence generation
        tf.keras.layers.RepeatVector(output_timesteps),

        # Second LSTM layer with BatchNorm and Dropout
        tf.keras.layers.LSTM(
            160,  # Increased units (was 128)
            return_sequences=True,
            kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
            kernel_regularizer=tf.keras.regularizers.l2(1e-6)  # Very light regularization
        ),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),

        # Third LSTM layer for further refinement
        tf.keras.layers.LSTM(
            80,  # Increased units (was 64)
            return_sequences=True,
            kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
            recurrent_dropout=0.1  # Add recurrent dropout to reduce overfitting
        ),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),

        # Linear output head (no activation, so outputs are NOT constrained to be positive).
        # dtype='float32' keeps predictions and the loss in full precision when the
        # global policy is mixed_float16; with a float32 policy it changes nothing.
        tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(num_features, dtype='float32'),
            dtype='float32'
        )
    ])

    # Cosine learning-rate schedule:
    # - starts at 1.5 * lr for faster initial convergence
    # - decays over 1500 optimizer steps down to 5% of the starting rate
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=lr * 1.5,  # Start with higher learning rate
        decay_steps=1500,                # Gradual decay
        alpha=0.05                       # Don't let it get too small
    )

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=lr_schedule,
        clipnorm=1.0  # Add gradient clipping to improve stability
    )

    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error'
    )

    return model


# def train_global_model(X, y, scalers, patience=8, val_ratio=0.2):
#     print("\n===== Training Global Model for All Kingdoms =====")

#     if X.size == 0 or y.size == 0:
#         print("Error: No training sequences could be created.")
#         return None, None

#     # Use sklearn's built-in train_test_split with shuffle=False for time series data
#     from sklearn.model_selection import train_test_split
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_ratio, shuffle=False)
#     print(f"Training: {X_train.shape}, Validation: {X_val.shape}")

#     # Build model
#     input_shape = (X_train.shape[1], X_train.shape[2])
#     model = build_model(input_shape, y_train.shape[1], y_train.shape[2])
#     model.summary()

#     # Create callbacks
#     callbacks = [
#         # Early stopping
#         tf.keras.callbacks.EarlyStopping(
#             monitor='val_loss',
#             patience=patience,
#             restore_best_weights=True,
#             verbose=1,
#             min_delta=0.0001
#         ),
#             # Model checkpointing - save best model
#             tf.keras.callbacks.ModelCheckpoint(
#             filepath='best_model.keras',  # use filepath instead
#             monitor='val_loss',
#             save_best_only=True,
#             verbose=1
#         )
#     ]

#     # Train with validation
#     batch_size = BATCH_SIZE * 2 if is_using_gpu else BATCH_SIZE
#     print(f"Using batch size: {batch_size}")

#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_val, y_val),
#         epochs=EPOCHS,
#         batch_size=batch_size,
#         callbacks=callbacks,
#         verbose=1
#     )

#     # Evaluate final model
#     print("\n===== Final Model Evaluation =====")
#     train_preds = model.predict(X_train, verbose=0)
#     val_preds = model.predict(X_val, verbose=0)

#     train_mse, train_smape, _, _ = evaluate_predictions(y_train, train_preds, scalers)
#     val_mse, val_smape, _, _ = evaluate_predictions(y_val, val_preds, scalers)

#     print(f"Final MSE  - Train: {train_mse:.4f}, Validation: {val_mse:.4f}")
#     print(f"Final SMAPE - Train: {train_smape:.2f}%, Validation: {val_smape:.2f}%")

#     # Plot metrics
#     plt.figure(figsize=(15, 5))

#     # Plot loss curves
#     plt.subplot(1, 3, 1)
#     plt.plot(history.history['loss'], 'b-', label='Train')
#     plt.plot(history.history['val_loss'], 'r-', label='Validation')
#     plt.title('Model Loss')
#     plt.xlabel('Epoch')
#     plt.ylabel('Loss')
#     plt.grid(True, alpha=0.3)
#     plt.legend()

#     # Plot learning rate if available
#     if 'lr' in history.history:
#         plt.subplot(1, 3, 2)
#         plt.plot(history.history['lr'], 'g-', label='Learning Rate')
#         plt.title('Learning Rate')
#         plt.xlabel('Epoch')
#         plt.ylabel('Learning Rate')
#         plt.grid(True, alpha=0.3)
#         plt.legend()

#     # Save the trained model
#     model.save('kingdom_weather_model.keras')
#     print("Model saved to kingdom_weather_model.keras in Keras format")

#     # Visualize SMAPE on feature-by-feature basis for validation set
#     features_to_plot = min(5, y_val.shape[2])
#     fig, axes = plt.subplots(1, features_to_plot, figsize=(15, 4))

#     for i in range(features_to_plot):
#         feature_name = OUTPUT_FEATURES[i] if i < len(OUTPUT_FEATURES) else f"Feature {i}"

#         # Get feature-specific values
#         y_true = y_val[:, :, i].flatten()
#         y_pred = val_preds[:, :, i].flatten()

#         # Inverse transform if scaler exists
#         if OUTPUT_FEATURES[i] in scalers and scalers[OUTPUT_FEATURES[i]] is not None:
#             scaler = scalers[OUTPUT_FEATURES[i]]
#             y_true = scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
#             y_pred = scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

#         # Calculate feature-specific SMAPE
#         feature_smape = smape(y_true, y_pred)

#         # Plot actual vs predicted for this feature (sample points)
#         max_points = 100  # Limit points to avoid overcrowding
#         indices = np.linspace(0, len(y_true)-1, min(max_points, len(y_true))).astype(int)

#         if features_to_plot > 1:
#             ax = axes[i]
#         else:
#             ax = axes

#         ax.plot(y_true[indices], label='Actual', marker='o', markersize=4, linestyle='', alpha=0.7)
#         ax.plot(y_pred[indices], label='Predicted', marker='x', markersize=4, linestyle='', alpha=0.7)
#         ax.set_title(f"{feature_name}\nSMAPE: {feature_smape:.2f}%")
#         ax.grid(True, alpha=0.3)

#         if i == 0:  # Only add legend to first subplot
#             ax.legend()

#     plt.tight_layout()
#     plt.savefig('model_evaluation.png', dpi=300)
#     plt.close()

#     return model, history
def train_global_model(X, y, scalers, patience=7):
    """Fit one model on every available sequence, report training metrics, and save it.

    No validation split is made: the model is trained for ``EPOCHS`` epochs on
    all of ``X``/``y`` and written to ``kingdom_weather_model.keras`` by the
    checkpoint callback and again after training.

    Args:
        X: input sequences; axes 1 and 2 give the model's input shape.
        y: target sequences; axes 1 and 2 give output steps and feature count.
        scalers: mapping handed to ``evaluate_predictions`` for unscaling.
        patience: accepted for signature compatibility; not used in this body.

    Returns:
        tuple: ``(model, history)``, or ``(None, None)`` when ``X`` or ``y`` is empty.
    """
    print("\n===== Training Global Model for All Kingdoms =====")

    has_sequences = X.size != 0 and y.size != 0
    if not has_sequences:
        print("Error: No training sequences could be created.")
        return None, None

    # The whole dataset is used for fitting (no validation split)
    print(f"Using full dataset for training: {X.shape}")

    # Build model
    input_shape = tuple(X.shape[axis] for axis in (1, 2))
    model = build_model(input_shape, y.shape[1], y.shape[2])
    model.summary()

    # Without validation data there is nothing to early-stop on; the checkpoint
    # simply rewrites the model file (save_best_only=False).
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath='kingdom_weather_model.keras',
        save_best_only=False,
        verbose=1
    )

    # Larger batches when a GPU was configured
    if is_using_gpu:
        batch_size = BATCH_SIZE * 2
    else:
        batch_size = BATCH_SIZE
    print(f"Using batch size: {batch_size}")

    history = model.fit(
        X, y,
        epochs=EPOCHS,
        batch_size=batch_size,
        callbacks=[checkpoint],
        verbose=1
    )

    # Metrics are computed on the training data itself
    print("\n===== Final Model Evaluation =====")
    train_preds = model.predict(X, verbose=0)
    metrics = evaluate_predictions(y, train_preds, scalers)
    train_mse, train_smape = metrics[0], metrics[1]

    print(f"Final MSE  - Train: {train_mse:.4f}")
    print(f"Final SMAPE - Train: {train_smape:.2f}%")

    # Save the trained model
    model.save('kingdom_weather_model.keras')
    print("Model saved to kingdom_weather_model.keras in Keras format")

    return model, history

def evaluate_predictions(y_true, y_pred, scalers):
    """Unscale targets and predictions, then score them with MSE and SMAPE.

    Args:
        y_true: array indexed as ``[sample, step, feature]`` in scaled units.
        y_pred: array of the same layout holding model predictions.
        scalers: mapping from each name in ``OUTPUT_FEATURES`` to a fitted
            scaler, or ``None`` for features that were never scaled.

    Returns:
        tuple: ``(mse, smape, y_true_unscaled, y_pred_unscaled)``. When either
        input is empty the result is ``(nan, nan, None, None)``.
    """
    if y_true.size == 0 or y_pred.size == 0:
        return float('nan'), float('nan'), None, None

    # Promote half-precision inputs (e.g. predictions produced under a
    # mixed_float16 policy) so unscaled values are not rounded to float16.
    y_pred_unscaled = np.zeros(y_pred.shape, dtype=np.result_type(y_pred.dtype, np.float32))
    y_true_unscaled = np.zeros(y_true.shape, dtype=np.result_type(y_true.dtype, np.float32))

    for i, feature in enumerate(OUTPUT_FEATURES):
        scaler = scalers[feature]
        pred_slice = y_pred[:, :, i].astype(y_pred_unscaled.dtype)
        true_slice = y_true[:, :, i].astype(y_true_unscaled.dtype)

        if scaler is not None:
            # Each scaler was fitted on a single column, so hand it an (n, 1)
            # array and restore the (sample, step) layout afterwards.
            pred_slice = scaler.inverse_transform(pred_slice.reshape(-1, 1)).reshape(pred_slice.shape)
            true_slice = scaler.inverse_transform(true_slice.reshape(-1, 1)).reshape(true_slice.shape)

        y_pred_unscaled[:, :, i] = pred_slice
        y_true_unscaled[:, :, i] = true_slice

    mse = mean_squared_error(y_true_unscaled.flatten(), y_pred_unscaled.flatten())
    smape_val = smape(y_true_unscaled.flatten(), y_pred_unscaled.flatten())

    return mse, smape_val, y_true_unscaled, y_pred_unscaled


In [67]:
# Function to add the engineered features
def add_engineered_features(df):
    """Return a sorted copy of ``df`` with rolling min/max features added.

    For each of the five base weather columns and each window of 3, 7 and 14
    rows, ``<feature>_rollmin_<window>`` and ``<feature>_rollmax_<window>`` are
    computed per ``kingdom_ID`` over rows ordered by Year, Month, Day. Windows
    use ``min_periods=1``, so the earliest rows of a kingdom are computed from
    fewer rows rather than left empty.

    Any column that still contains NaN is filled backward, then forward, and
    finally (numeric columns only) with the column median.

    Args:
        df: DataFrame with ``kingdom_ID``, ``Year``, ``Month``, ``Day`` and the
            base weather columns. It is not modified.

    Returns:
        A new DataFrame sorted by kingdom and date with the extra columns.
    """
    # Sort by kingdom and date to ensure correct calculation
    df = df.sort_values(by=['kingdom_ID', 'Year', 'Month', 'Day']).copy()

    # Group by kingdom for feature engineering
    grouped = df.groupby('kingdom_ID')

    # Features to apply engineering to
    base_features = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

    # Add rolling min/max for different windows
    windows = [3, 7, 14]
    for window in windows:
        for feature in base_features:
            min_col = f'{feature}_rollmin_{window}'
            max_col = f'{feature}_rollmax_{window}'
            df[min_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).min())
            df[max_col] = grouped[feature].transform(lambda x: x.rolling(window, min_periods=1).max())

    # Fill any NaN values that are present
    for col in df.columns:
        if not df[col].isna().any():
            continue
        # bfill()/ffill() replace the deprecated fillna(method=...) spelling.
        # NOTE: the fill runs down the whole sorted column, so a gap at a
        # kingdom boundary is filled from the neighbouring kingdom's rows.
        df[col] = df[col].bfill().ffill()
        # A column that is entirely NaN survives both fills; a median only
        # exists for numeric data, so other columns are left as they are.
        if df[col].isna().any() and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())

    return df

# Modified recursive_forecast to handle engineered features for prediction
def _date_span(frame):
    """Return the earliest and latest (Year, Month, Day) tuples found in ``frame``.

    Rows are compared as whole dates; rows with a missing date part are
    ignored. Returns ``(None, None)`` when no complete date is present.
    """
    parts = frame[['Year', 'Month', 'Day']].dropna()
    dates = [(int(y), int(m), int(d)) for y, m, d in parts.itertuples(index=False, name=None)]
    if not dates:
        return None, None
    return min(dates), max(dates)


def recursive_forecast_with_features(model, data, kingdom, start_date, num_predictions, kingdom_info, scalers, seq_length=SEQ_LENGTH, pred_length=None, feature_history=13):
    """Forecast ``num_predictions`` days for one kingdom by feeding predictions back in.

    Starting from the row dated ``start_date`` (or the closest day in the same
    month), the last ``seq_length`` rows are scaled and passed to ``model``.
    The unscaled predictions are appended to the working history as new rows,
    the engineered features are recomputed, and the process repeats until
    enough days have been produced.

    Args:
        model: object with ``predict(X, verbose=0)`` returning an array indexed
            as ``[sample, step, feature]`` in the order of ``OUTPUT_FEATURES``.
        data: DataFrame holding ``kingdom``, ``Year``, ``Month``, ``Day`` and
            the feature columns in unscaled units.
        kingdom: kingdom name to forecast.
        start_date: ``(year, month, day)`` of the last known day.
        num_predictions: number of future days to return.
        kingdom_info: mapping of kingdom name to ``{'id', 'latitude', 'longitude'}``.
        scalers: mapping of feature name to fitted scaler (or ``None``).
        seq_length: number of rows in each model input window.
        pred_length: only used for logging; defaults to the model's output length.
        feature_history: extra rows of history kept in front of the first input
            window. ``add_engineered_features`` uses rolling windows of up to
            14 rows, so 13 earlier rows are needed for the first rows of the
            window to get the same rolling values they had in ``data``.

    Returns:
        DataFrame with one row per predicted day (``kingdom``, ``kingdom_ID``,
        date parts, coordinates and every output feature), or an empty
        DataFrame when the kingdom or start date cannot be found.
    """
    # Function-scope import so the block does not depend on the notebook's import cell.
    import calendar

    print(f"\n===== Starting Recursive Forecast for {kingdom} =====")
    print(f"Requested {num_predictions} total days of predictions")

    k_id = kingdom_info[kingdom]['id']
    lat = kingdom_info[kingdom]['latitude']
    lon = kingdom_info[kingdom]['longitude']

    kingdom_data = data[data['kingdom'] == kingdom].sort_values(by=['Year', 'Month', 'Day'])
    print(f"Found {len(kingdom_data)} records for {kingdom}")

    if len(kingdom_data) == 0:
        print(f"Error: No data found for kingdom '{kingdom}' (ID={k_id})")
        return pd.DataFrame()

    # Whole-date comparison; taking min/max of each column separately would
    # report a "date" that may not exist in the data.
    min_date, max_date = _date_span(kingdom_data)
    print(f"Date range for {kingdom}: {min_date} to {max_date}")
    print(f"Attempting to start predictions from: {start_date}")

    start_year, start_month, start_day = start_date
    in_start_month = (kingdom_data['Year'] == start_year) & (kingdom_data['Month'] == start_month)
    start_mask = in_start_month & (kingdom_data['Day'] == start_day)

    if not start_mask.any():
        print(f"Error: Start date {start_date} not found for {kingdom}. Checking nearby dates...")

        days_available = kingdom_data.loc[in_start_month, 'Day']
        if len(days_available) == 0:
            print(f"No dates found in month {start_year}-{start_month} for {kingdom}")
            return pd.DataFrame()

        # First day (in date order) with the smallest distance to the requested day.
        closest_day = min(days_available, key=lambda day: abs(day - start_day))
        print(f"Found closest day: {start_year}-{start_month}-{closest_day}")
        start_day = closest_day
        start_date = (start_year, start_month, start_day)
        start_mask = in_start_month & (kingdom_data['Day'] == start_day)

    if not start_mask.any():
        print(f"Error: Still could not find suitable start date for {kingdom}")
        return pd.DataFrame()

    # Positional lookup: independent of whether index labels are unique.
    position = int(np.flatnonzero(start_mask.to_numpy())[0])

    if position < seq_length - 1:
        print(f"Warning: Not enough history before {start_date} for {kingdom}. Using available data.")
    sequence_start = max(0, position - seq_length + 1)
    print(f"Initial sequence has {position + 1 - sequence_start} days of data")

    # Keep extra rows in front of the input window so that recomputed rolling
    # features of the window's first rows see the same history as in `data`.
    context_start = max(0, sequence_start - max(0, int(feature_history)))
    working_data = kingdom_data.iloc[context_start:position + 1].copy()

    results = []
    round_idx = 0
    while len(results) < num_predictions:
        round_idx += 1
        print(f"Prediction round {round_idx} for {kingdom}")

        # Ensure working_data has all the engineered features
        working_data = add_engineered_features(working_data)

        latest_data = working_data.tail(seq_length)

        if len(latest_data) < seq_length:
            print(f"Warning: Using {len(latest_data)} days instead of {seq_length} days for sequence.")

        latest_min_date, latest_max_date = _date_span(latest_data)
        print(f"  Input sequence covers: {latest_min_date} to {latest_max_date}")

        # Scale whole columns at once instead of one cell at a time.
        scaled_inputs = latest_data[FINAL_INPUT_FEATURES].astype(np.float64)
        for feature in FINAL_INPUT_FEATURES:
            if feature in ('Year', 'Month', 'Day', 'kingdom_ID'):
                continue
            scaler = scalers[feature]
            if scaler is not None:
                column = scaled_inputs[feature].to_numpy().reshape(-1, 1)
                scaled_inputs[feature] = scaler.transform(column).ravel()

        X_seq = scaled_inputs.to_numpy(dtype=np.float32)[np.newaxis, :, :]
        print(f"  Input sequence shape: {X_seq.shape}")

        y_pred = model.predict(X_seq, verbose=0)
        print(f"  Prediction output shape: {y_pred.shape}")

        model_pred_length = y_pred.shape[1]
        if pred_length is None:
            pred_length = model_pred_length
            print(f"  Using model's output length: {pred_length} days")

        if model_pred_length < 1:
            # Nothing would ever be appended, so the loop could not terminate.
            print("  Error: Model returned no time steps. Stopping.")
            break

        # Promote half-precision output before unscaling to avoid float16 rounding.
        y_pred = np.asarray(y_pred)
        y_pred = y_pred.astype(np.result_type(y_pred.dtype, np.float32))

        y_pred_unscaled = np.zeros_like(y_pred)
        for i, feature in enumerate(OUTPUT_FEATURES):
            scaler = scalers[feature]
            step_values = y_pred[:, :, i]
            if scaler is not None:
                # Scalers were fitted on a single column: pass (n, 1), restore the layout after.
                step_values = scaler.inverse_transform(step_values.reshape(-1, 1)).reshape(step_values.shape)
            y_pred_unscaled[:, :, i] = step_values

        last_row = working_data.iloc[-1]
        last_date = (int(last_row['Year']), int(last_row['Month']), int(last_row['Day']))
        print(f"  Last date in working data: {last_date}")

        pred_dates = []
        current_year, current_month, current_day = last_date

        for _ in range(model_pred_length):
            current_day += 1

            # calendar.monthrange applies the same Gregorian rules for any year,
            # including the single-digit years used in this dataset.
            days_in_month = calendar.monthrange(current_year, current_month)[1]
            if current_day > days_in_month:
                current_day = 1
                current_month += 1
                if current_month > 12:
                    current_month = 1
                    current_year += 1

            pred_dates.append((current_year, current_month, current_day))

        print(f"  Generated {len(pred_dates)} prediction dates from {pred_dates[0]} to {pred_dates[-1]}")

        remaining_preds = num_predictions - len(results)
        print(f"  Remaining predictions needed: {remaining_preds}")

        will_crop = (remaining_preds < model_pred_length)
        if will_crop:
            print(f"  Will use only the first {remaining_preds} of {model_pred_length} predictions from this batch")

        use_count = min(model_pred_length, remaining_preds)

        new_rows = []
        for i in range(use_count):
            year, month, day = pred_dates[i]

            new_row = {
                'kingdom': kingdom,
                'kingdom_ID': k_id,
                'Year': int(year),
                'Month': int(month),
                'Day': int(day),
                'latitude': lat,
                'longitude': lon
            }

            for j, feature in enumerate(OUTPUT_FEATURES):
                new_row[feature] = y_pred_unscaled[0, i, j]

            new_rows.append(new_row)

        results.extend(new_rows)
        if new_rows:
            # One concat per round instead of one per predicted day.
            working_data = pd.concat([working_data, pd.DataFrame(new_rows)], ignore_index=True)

        print(f"  Added {use_count} new predictions")
        print(f"  Total predictions so far: {len(results)} / {num_predictions}")

        if len(results) >= num_predictions:
            print(f"  Reached target number of predictions ({num_predictions}). Stopping.")
            break

    results_df = pd.DataFrame(results)
    print(f"Generated {len(results_df)} predictions for {kingdom}")

    if len(results_df) > num_predictions:
        print(f"Warning: Generated {len(results_df)} predictions, but only {num_predictions} were requested.")
        print(f"Trimming to exact requested number.")
        results_df = results_df.head(num_predictions)
    elif len(results_df) < num_predictions:
        print(f"Warning: Only generated {len(results_df)} predictions, but {num_predictions} were requested.")

    if len(results_df) > 0:
        min_pred_date, max_pred_date = _date_span(results_df)
        print(f"Predictions cover {min_pred_date} to {max_pred_date}")

    return results_df


In [74]:

# Date boundaries are 'Year-Month-Day' strings; the surrounding parentheses do not create tuples.
# TRAINING_DATE_END is the inclusive last date that prepare_data keeps.
TRAINING_DATE_END = ('8-12-31')
SUBMISSION_DATE_START = ('9-1-1')
# NOTE: "SUMMISSION" is a misspelling of "SUBMISSION", but the prediction cell reads the
# constant under exactly this name, so both places have to be renamed together.
SUMMISSION_DATE_END = ('9-5-31')

# Upper bound on how many kingdoms prepare_data keeps (first N names in sorted order).
NUM_KINGDOMS = 30

In [75]:
# Enable memory growth on every GPU TensorFlow lists, or report that the CPU is used.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("Using CPU")
else:
    print("Using GPU")
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

Using GPU


In [76]:
# Base weather variables: used as model inputs and as the forecast targets.
_BASE_WEATHER_FEATURES = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']
# Rolling window sizes whose min/max columns are fed to the model.
_ROLLING_WINDOWS = (3, 7, 14)

# Define the best features: the base variables followed by, for each window,
# each variable's rolling minimum and then rolling maximum column.
BEST_FEATURES = list(_BASE_WEATHER_FEATURES) + [
    f'{feature}_roll{stat}_{window}'
    for window in _ROLLING_WINDOWS
    for feature in _BASE_WEATHER_FEATURES
    for stat in ('min', 'max')
]

# Define the final input features
FINAL_INPUT_FEATURES = ['Year', 'Month', 'Day', 'kingdom_ID', 'latitude', 'longitude'] + BEST_FEATURES

OUTPUT_FEATURES = list(_BASE_WEATHER_FEATURES)

In [None]:
# Main execution code for training the model with best features
# Relies on `df` from the loading cell and on the constants and feature lists defined in earlier cells.
print("Adding engineered features to the dataset...")
engineered_data = add_engineered_features(df)

# Prepare data with best features
# prepare_data keeps rows up to TRAINING_DATE_END and rebuilds kingdom_ID from the sorted kingdom names.
print("Preparing data with best features...")
data, kingdoms, kingdom_info = prepare_data(engineered_data)
# `scalers` maps every input feature to its fitted MinMaxScaler (None for the unscaled columns).
scaled_data, scalers = scale_features(data, FINAL_INPUT_FEATURES)
# X_train / y_train are the float32 input-window and target-window arrays.
X_train, y_train = create_sequences_optimized(scaled_data, FINAL_INPUT_FEATURES, OUTPUT_FEATURES)

Adding engineered features to the dataset...
Preparing data with best features...
Data filtered until date: Year 8, Month 12, Day 31
Found 30 kingdoms in the data


In [None]:
# Train the final model with full dataset and original parameters
# train_global_model returns (None, None) when no sequences exist; the prediction cell checks `model is None`.
print("Training model with best features...")
model, history = train_global_model(X_train, y_train, scalers)

In [None]:
# True: the prediction cell forecasts SUBMISSION_DATE_START through SUMMISSION_DATE_END.
# False: it forecasts from the day after TRAINING_DATE_END up to the day before SUBMISSION_DATE_START.
submission_mode = True

In [None]:
# Overrides the submission window set in the earlier configuration cell (the end date there was '9-5-31').
SUBMISSION_DATE_START = ('9-1-1')
SUMMISSION_DATE_END = ('9-1-31')

In [None]:
# Now generate predictions using the model with best features
if model is None:
    print("Error: Model training failed. Exiting.")
else:
    submission_start = tuple(map(int, str(SUBMISSION_DATE_START).split('-')))
    submission_end = tuple(map(int, str(SUMMISSION_DATE_END).split('-')))

    if isinstance(TRAINING_DATE_END, str):
        training_end = tuple(map(int, TRAINING_DATE_END.split('-')))
    else:
        training_end = TRAINING_DATE_END

    if submission_mode:
        start_date = submission_start
        end_date = submission_end
        print(f"Submission mode enabled: Predicting from {start_date} to {end_date}")
    else:
        year, month, day = training_end
        days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        day += 1
        if day > days_in_month[month]:
            day = 1
            month += 1
            if month > 12:
                month = 1
                year += 1

        start_date = (year, month, day)

        year, month, day = submission_start
        day -= 1
        if day == 0:
            month -= 1
            if month == 0:
                month = 12
                year -= 1
            day = days_in_month[month]

        end_date = (year, month, day)
        print(f"Testing mode: Predicting from {start_date} to {end_date}")

    def date_to_days(date_tuple):
        year, month, day = date_tuple
        days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        days = day
        for m in range(1, month):
            days += days_in_month[m]
        days += 365 * year

        return days

    start_days = date_to_days(start_date)
    end_days = date_to_days(end_date)
    num_days = end_days - start_days + 1

    # Convert start_date and end_date to numeric format for easier filtering
    start_date_num = start_date[0] * 10000 + start_date[1] * 100 + start_date[2]
    end_date_num = end_date[0] * 10000 + end_date[1] * 100 + end_date[2]

    print(f"Predicting {num_days} days from {start_date} to {end_date}")
    print(f"Date range in numeric format: {start_date_num} to {end_date_num}")

    all_predictions = []

    for kingdom in kingdoms:
        if submission_mode:
            last_date = training_end
        else:
            year, month, day = start_date
            day -= 1
            if day == 0:
                month -= 1
                if month == 0:
                    month = 12
                    year -= 1
                days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
                day = days_in_month[month]

            last_date = (year, month, day)

        print(f"Using latest data point for {kingdom}: Year {last_date[0]}, Month {last_date[1]}, Day {last_date[2]}")

        # Use the new recursive forecast function with features
        kingdom_predictions = recursive_forecast_with_features(
            model, data, kingdom, last_date, num_days, kingdom_info, scalers)

        if not kingdom_predictions.empty:
            # Create a numeric date column for easier filtering
            kingdom_predictions['date_num'] = kingdom_predictions['Year'] * 10000 + kingdom_predictions['Month'] * 100 + kingdom_predictions['Day']

            # Filter predictions to match the target date range
            filtered_predictions = kingdom_predictions[
                (kingdom_predictions['date_num'] >= start_date_num) &
                (kingdom_predictions['date_num'] <= end_date_num)
            ]

            print(f"Generated {len(kingdom_predictions)} predictions for {kingdom}")
            print(f"After filtering for target date range: {len(filtered_predictions)} predictions")

            # Drop the temporary date_num column before saving
            filtered_predictions = filtered_predictions.drop(columns=['date_num'])

            all_predictions.append(filtered_predictions)
        else:
            print(f"No predictions generated for {kingdom}")

    if all_predictions:
        final_predictions = pd.concat(all_predictions, ignore_index=True)

        # Verify we have predictions
        print(f"Total predictions: {len(final_predictions)}")
        if len(final_predictions) == 0:
            print("WARNING: No predictions survived filtering! Check your date ranges and filtering logic.")
        else:
            # Print date range of predictions
            min_date = (
                int(final_predictions['Year'].min()),
                int(final_predictions['Month'].min()),
                int(final_predictions['Day'].min())
            )
            max_date = (
                int(final_predictions['Year'].max()),
                int(final_predictions['Month'].max()),
                int(final_predictions['Day'].max())
            )
            print(f"Final predictions cover date range: {min_date} to {max_date}")

            # Save the predictions
            output_predictions_path = "kingdom_weather_predictions_submission.csv" if submission_mode else "kingdom_weather_predictions_testing.csv"
            print(f"Saving predictions to {output_predictions_path}")
            final_predictions.to_csv(output_predictions_path, index=False)
            print("Prediction process completed!")
    else:
        print("No predictions were generated for any kingdom.")

In [71]:

# Evaluation phase
# Loads the predictions written by the prediction step, selects the rows of
# `df` that fall inside the prediction window, and reports RMSE / SMAPE per
# kingdom and output parameter (CSV of details, PNG summary table, TXT totals).

# Month lengths indexed by month number (index 0 unused); February is fixed at
# 28 days, matching the date arithmetic used by the rest of the notebook.
days_in_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]


def _day_after(date_tuple):
    """Return the (year, month, day) tuple that follows `date_tuple`."""
    year, month, day = date_tuple
    day += 1
    if day > days_in_month[month]:
        day = 1
        month += 1
        if month > 12:
            month = 1
            year += 1
    return year, month, day


def _day_before(date_tuple):
    """Return the (year, month, day) tuple that precedes `date_tuple`."""
    year, month, day = date_tuple
    day -= 1
    if day == 0:
        month -= 1
        if month == 0:
            month = 12
            year -= 1
        day = days_in_month[month]
    return year, month, day


if submission_mode:
    predictions_file = 'kingdom_weather_predictions_submission.csv'
    mode_title = "Submission"
else:
    predictions_file = 'kingdom_weather_predictions_testing.csv'
    mode_title = "Testing"

try:
    predictions = pd.read_csv(predictions_file)
    print(f"Loaded {len(predictions)} prediction records")
except FileNotFoundError:
    print(f"Error: {predictions_file} not found.")
    # Re-raise instead of calling exit(): inside a notebook, exit() targets the
    # kernel session rather than cleanly aborting this cell, so the code below
    # could otherwise continue with a missing or stale `predictions` frame.
    raise

if submission_mode:
    start_year, start_month, start_day = map(int, SUBMISSION_DATE_START.split('-'))
    # NOTE(review): "SUMMISSION" looks like a typo of "SUBMISSION". The name is
    # kept as-is because its definition lives outside this cell - confirm and
    # rename both places together.
    end_year, end_month, end_day = map(int, SUMMISSION_DATE_END.split('-'))
else:
    # Testing window = the day after training ends .. the day before the
    # submission period starts.
    training_end = tuple(map(int, TRAINING_DATE_END.split('-')))
    submission_start = tuple(map(int, SUBMISSION_DATE_START.split('-')))

    start_year, start_month, start_day = _day_after(training_end)
    end_year, end_month, end_day = _day_before(submission_start)

print(f"Analyzing prediction period: {start_year}-{start_month}-{start_day} to {end_year}-{end_month}-{end_day}")

# Build the sortable YYYYMMDD-style key on a *new* frame so the shared `df`
# is not silently given an extra column by running this cell.
historical_data = df.assign(
    date_num=df['Year'] * 10000 + df['Month'] * 100 + df['Day']
)
predictions['date_num'] = predictions['Year'] * 10000 + predictions['Month'] * 100 + predictions['Day']

start_date_num = start_year * 10000 + start_month * 100 + start_day
end_date_num = end_year * 10000 + end_month * 100 + end_day

actual_in_pred_period = historical_data[
    (historical_data['date_num'] >= start_date_num) &
    (historical_data['date_num'] <= end_date_num)
]

if len(actual_in_pred_period) == 0:
    print("Warning: No actual data available for the prediction period.")
    print("Cannot calculate performance metrics without actual data.")
else:
    print(f"Found {len(actual_in_pred_period)} actual data points in the prediction period.")
    print("\nCalculating overall metrics for all parameters and ALL kingdoms...")
    metrics_data = []

    all_kingdoms = sorted(predictions['kingdom'].unique())
    print(f"Calculating metrics for all {len(all_kingdoms)} kingdoms")

    for kingdom in all_kingdoms:
        kingdom_pred = predictions[predictions['kingdom'] == kingdom].sort_values('date_num')
        kingdom_actual = actual_in_pred_period[actual_in_pred_period['kingdom'] == kingdom].sort_values('date_num')

        if len(kingdom_actual) > 0:
            for param in OUTPUT_FEATURES:
                # Inner join on the date key: only dates present in both the
                # actual data and the predictions are scored.
                merged_data = pd.merge(
                    kingdom_actual[['date_num', param]],
                    kingdom_pred[['date_num', param]],
                    on='date_num',
                    suffixes=('_actual', '_pred')
                )

                if len(merged_data) > 0:
                    rmse = np.sqrt(mean_squared_error(
                        merged_data[f'{param}_actual'],
                        merged_data[f'{param}_pred']
                    ))

                    y_true = merged_data[f'{param}_actual'].values
                    y_pred = merged_data[f'{param}_pred'].values
                    smape_value = smape(y_true, y_pred)

                    metrics_data.append({
                        'Kingdom': kingdom,
                        'Parameter': param,
                        'RMSE': rmse,
                        'SMAPE': smape_value,
                        'Points': len(merged_data)
                    })

    if metrics_data:
        metrics_df = pd.DataFrame(metrics_data)
        metrics_file = f'prediction_metrics_all_{mode_title.lower()}.csv'
        metrics_df.to_csv(metrics_file, index=False)
        print(f"Detailed metrics saved to {metrics_file}")

        # The figure only hosts a rendered table, so the axes are hidden.
        plt.figure(figsize=(12, 8))
        plt.axis('off')

        summary = metrics_df.groupby('Parameter').agg({
            'RMSE': ['mean', 'min', 'max', 'std'],
            'SMAPE': ['mean', 'min', 'max', 'std'],
            'Points': 'sum'
        })

        # Flatten the (metric, statistic) MultiIndex into names like "RMSE_mean".
        summary.columns = [f'{col[0]}_{col[1]}' for col in summary.columns]
        summary = summary.reset_index()

        table_data = []
        table_data.append(['Parameter', 'Avg RMSE', 'Min RMSE', 'Max RMSE', 'Std RMSE',
                          'Avg SMAPE(%)', 'Min SMAPE(%)', 'Max SMAPE(%)', 'Std SMAPE(%)', 'Data Points'])

        for _, row in summary.iterrows():
            table_data.append([
                row['Parameter'],
                f"{row['RMSE_mean']:.2f}",
                f"{row['RMSE_min']:.2f}",
                f"{row['RMSE_max']:.2f}",
                f"{row['RMSE_std']:.2f}",
                f"{row['SMAPE_mean']:.2f}%",
                f"{row['SMAPE_min']:.2f}%",
                f"{row['SMAPE_max']:.2f}%",
                f"{row['SMAPE_std']:.2f}%",
                f"{int(row['Points_sum'])}"
            ])

        table = plt.table(
            cellText=table_data,
            colWidths=[0.12, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09, 0.09],
            loc='center',
            cellLoc='center'
        )

        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)

        plt.title(f"Prediction Performance Metrics - ALL Kingdoms - {mode_title} Period", fontsize=16)
        plt.tight_layout()

        summary_file = f'prediction_metrics_summary_{mode_title.lower()}.png'
        plt.savefig(summary_file, bbox_inches='tight', dpi=300)
        plt.close()
        print(f"Summary metrics visualization saved to {summary_file}")

        # Unweighted means over every (kingdom, parameter) row of metrics_df.
        overall_rmse = metrics_df['RMSE'].mean()
        overall_smape = metrics_df['SMAPE'].mean()

        with open(f'overall_metrics_{mode_title.lower()}.txt', 'w') as f:
            f.write(f"Overall Metrics for {mode_title} Period\n")
            f.write(f"Total Kingdoms: {len(all_kingdoms)}\n")
            f.write(f"Total Weather Parameters: {len(metrics_df['Parameter'].unique())}\n")
            f.write(f"Total Data Points: {metrics_df['Points'].sum()}\n")
            f.write(f"Overall Average RMSE: {overall_rmse:.4f}\n")
            f.write(f"Overall Average SMAPE: {overall_smape:.4f}%\n")

        print(f"Overall RMSE across all parameters: {overall_rmse:.4f}")
        print(f"Overall SMAPE across all parameters: {overall_smape:.4f}%")
    else:
        print("No metrics data available to create summary table.")

print("Performance evaluation complete!")


Loaded 0 prediction records
Analyzing prediction period: 9-1-1 to 9-5-31
Cannot calculate performance metrics without actual data.
Performance evaluation complete!


In [None]:
# Post-processing
# Reads the saved testing predictions, reports negative / zero counts for each
# numeric column, floors Rain_Amount at 0, and writes results_v2_updated.csv.
testing_predictions_file = 'kingdom_weather_predictions_testing.csv'

try:
    predictions = pd.read_csv(testing_predictions_file)
except FileNotFoundError:
    # The message is built from the same variable that is read above, so it
    # always names the file that was actually looked for.
    print(f"Warning: '{testing_predictions_file}' not found. Skipping post-processing.")
else:
    results = predictions.sort_values(by=['Month', 'Day', 'kingdom_ID'])

    numeric_columns = [
        col for col in results.columns
        if pd.api.types.is_numeric_dtype(results[col])
    ]

    # Check negative values
    for col in numeric_columns:
        negative_count = (results[col] < 0).sum()
        print(f"Column '{col}': {negative_count} negative values")

    # Make negative Rain_Amount values equal to 0 (vectorised floor at zero)
    results['Rain_Amount'] = results['Rain_Amount'].clip(lower=0)

    # Count zero values
    for col in numeric_columns:
        zero_count = (results[col] == 0).sum()
        print(f"Column '{col}': {zero_count} zero values")

    results.to_csv('results_v2_updated.csv', index=False)

Column 'kingdom_ID': 0 negative values
Column 'Year': 0 negative values
Column 'Month': 0 negative values
Column 'Day': 0 negative values
Column 'latitude': 0 negative values
Column 'longitude': 4590 negative values
Column 'Avg_Temperature': 0 negative values
Column 'Radiation': 350 negative values
Column 'Rain_Amount': 1402 negative values
Column 'Wind_Speed': 2 negative values
Column 'Wind_Direction': 7 negative values
Column 'kingdom_ID': 0 zero values
Column 'Year': 0 zero values
Column 'Month': 0 zero values
Column 'Day': 0 zero values
Column 'latitude': 0 zero values
Column 'longitude': 0 zero values
Column 'Avg_Temperature': 0 zero values
Column 'Radiation': 0 zero values
Column 'Rain_Amount': 1402 zero values
Column 'Wind_Speed': 0 zero values
Column 'Wind_Direction': 0 zero values
