In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('weather_features.csv', parse_dates=['date'])

In [3]:
list(df.columns)

['date',
 'latitude',
 'longitude',
 't2m_min_mean',
 't2m_min_std',
 't2m_min_min',
 't2m_min_max',
 't2m_min_first',
 't2m_min_last',
 't2m_min_trend',
 't2m_min_recent_3d',
 't2m_max_mean',
 't2m_max_std',
 't2m_max_min',
 't2m_max_max',
 't2m_max_first',
 't2m_max_last',
 't2m_max_trend',
 't2m_max_recent_3d',
 't2m_mean_mean',
 't2m_mean_std',
 't2m_mean_min',
 't2m_mean_max',
 't2m_mean_first',
 't2m_mean_last',
 't2m_mean_trend',
 't2m_mean_recent_3d',
 'd2m_min_mean',
 'd2m_min_std',
 'd2m_min_min',
 'd2m_min_max',
 'd2m_min_first',
 'd2m_min_last',
 'd2m_min_trend',
 'd2m_min_recent_3d',
 'd2m_max_mean',
 'd2m_max_std',
 'd2m_max_min',
 'd2m_max_max',
 'd2m_max_first',
 'd2m_max_last',
 'd2m_max_trend',
 'd2m_max_recent_3d',
 'd2m_mean_mean',
 'd2m_mean_std',
 'd2m_mean_min',
 'd2m_mean_max',
 'd2m_mean_first',
 'd2m_mean_last',
 'd2m_mean_trend',
 'd2m_mean_recent_3d',
 'msl_min_mean',
 'msl_min_std',
 'msl_min_min',
 'msl_min_max',
 'msl_min_first',
 'msl_min_last',
 'msl_mi

In [4]:
# Features and target
X = df.drop(columns=['t2m_min_next', 't2m_max_next', 't2m_mean_next'])
y = df[['t2m_min_next', 't2m_max_next', 't2m_mean_next']]

# Ensure date column exists and is datetime
X['date'] = pd.to_datetime(X['date'])

# Sort by date (CRITICAL!)
X = X.sort_values('date').reset_index(drop=True)
y = y.loc[X.index].reset_index(drop=True)  # Align targets

# Define split point by date
train_size = 0.8
split_index = int(len(X) * train_size)
split_date = X['date'].iloc[split_index]

print(f"Split date: {split_date}")
print(f"Train period: {X['date'].min()} to {X['date'].iloc[split_index-1]}")
print(f"Test period: {split_date} to {X['date'].max()}")

# Split data
X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

print(f"\nTraining samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")

# IMPORTANT: Drop date, latitude, longitude before training
# (Keep them for analysis but don't train on them)
metadata_cols = ['date', 'latitude', 'longitude']
X_train_features = X_train.drop(columns=metadata_cols)
X_test_features = X_test.drop(columns=metadata_cols)

print(f"\nFeature columns for training: {X_train_features.shape[1]}")

Split date: 2025-05-26 00:00:00
Train period: 2024-01-08 00:00:00 to 2025-05-26 00:00:00
Test period: 2025-05-26 00:00:00 to 2025-09-29 00:00:00

Training samples: 1,143,792
Test samples: 285,948

Feature columns for training: 185


In [14]:
# -----------------------------------------------------------------
# ADD THIS CODE TO THE END OF YOUR SCRIPT
# -----------------------------------------------------------------

print("\n--- Saving processed data to CSV files ---")

# Define the output directory (optional, but good practice)
output_dir = "processed_data/"

# Create the directory if it doesn't exist
import os
os.makedirs(output_dir, exist_ok=True)

# Save the training and testing sets
# We use index=False to avoid saving the DataFrame index as a column
X_train_features.to_csv(os.path.join(output_dir, "train_features.csv"), index=False)
y_train.to_csv(os.path.join(output_dir, "train_targets.csv"), index=False)
X_test_features.to_csv(os.path.join(output_dir, "test_features.csv"), index=False)
y_test.to_csv(os.path.join(output_dir, "test_targets.csv"), index=False)

print(f"Successfully saved 4 files to the '{output_dir}' directory:")
print(f"1. train_features.csv ({X_train_features.shape[0]:,} rows, {X_train_features.shape[1]} columns)")
print(f"2. train_targets.csv ({y_train.shape[0]:,} rows, {y_train.shape[1]} columns)")
print(f"3. test_features.csv ({X_test_features.shape[0]:,} rows, {X_test_features.shape[1]} columns)")
print(f"4. test_targets.csv ({y_test.shape[0]:,} rows, {y_test.shape[1]} columns)")

# (Optional) Save the data with metadata for later analysis
# X_train.to_csv(os.path.join(output_dir, "X_train_with_metadata.csv"), index=False)
# X_test.to_csv(os.path.join(output_dir, "X_test_with_metadata.csv"), index=False)


--- Saving processed data to CSV files ---
Successfully saved 4 files to the 'processed_data/' directory:
1. train_features.csv (1,143,792 rows, 185 columns)
2. train_targets.csv (1,143,792 rows, 3 columns)
3. test_features.csv (285,948 rows, 185 columns)
4. test_targets.csv (285,948 rows, 3 columns)
