In [None]:
# Import libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('..')

from src.config import (
    SELECTED_DATA_PATH, SEQUENCES_DIR, SCALER_PATH,
    TARGET_COLUMN, DATE_COLUMN,
    INPUT_SEQ_LEN, OUTPUT_SEQ_LEN,
    TRAIN_RATIO, VAL_RATIO, TEST_RATIO,
    RANDOM_SEED, set_seed
)
from src.dataset import create_sequences, train_val_test_split
from src.utils import save_json, save_numpy, save_pickle

# Apply the configured random seed before any other work
set_seed(RANDOM_SEED)

# Confirm the imports succeeded and echo the window sizes read from src.config
for message in (
    "Libraries imported successfully!",
    f"Input sequence length: {INPUT_SEQ_LEN}",
    f"Output sequence length: {OUTPUT_SEQ_LEN}",
):
    print(message)

## 5.1 Load Selected Features Data

In [None]:
# Read the selected-features CSV; DATE_COLUMN is parsed into datetimes on load
df = pd.read_csv(
    SELECTED_DATA_PATH,
    parse_dates=[DATE_COLUMN],
)
print(f"Loaded data shape: {df.shape}")

# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# Every column except the timestamp column is treated as a model feature.
# A boolean mask over the column index keeps the original column order.
is_feature = df.columns != DATE_COLUMN
feature_columns = df.columns[is_feature].tolist()

print(f"Feature columns: {len(feature_columns)}")
print(feature_columns)

In [None]:
# Find target column index.
# The position is taken within feature_columns (not df.columns), so it matches
# the column order of any array built from df[feature_columns].
# list.index raises ValueError if TARGET_COLUMN is not among the features.
target_idx = feature_columns.index(TARGET_COLUMN)
print(f"Target column '{TARGET_COLUMN}' index: {target_idx}")

## 5.2 Data Scaling

In [None]:
# Pull the feature columns out as a plain NumPy array (timestamp column excluded);
# column order follows feature_columns, so target_idx indexes its second axis.
data = df[feature_columns].to_numpy()
print(f"Data shape: {data.shape}")

In [None]:
# Chronological split boundaries expressed as row offsets into `data`:
#   train = [0, train_end), val = [train_end, val_end), test = [val_end, n_samples)
n_samples = data.shape[0]
train_end = int(TRAIN_RATIO * n_samples)
val_end = int((TRAIN_RATIO + VAL_RATIO) * n_samples)

split_report = (
    f"Total samples: {n_samples:,}",
    f"Train: 0 to {train_end:,}",
    f"Val: {train_end:,} to {val_end:,}",
    f"Test: {val_end:,} to {n_samples:,}",
)
for line in split_report:
    print(line)

In [None]:
# Learn per-feature min/max from the training rows only, so statistics from the
# validation/test rows do not influence the scaling.
# NOTE(review): train_end is a row offset into the raw data, while the later
# train/val/test split operates on sequences — confirm the two boundaries agree.
train_rows = data[:train_end]
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_rows)

# Apply the train-fitted scaling to every row; rows outside the training range
# can therefore fall outside [0, 1].
data_scaled = scaler.transform(data)

print(f"Data scaled. Min: {data_scaled.min():.4f}, Max: {data_scaled.max():.4f}")

In [None]:
# Side-by-side histograms of the target column before and after scaling
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (axes[0], data, 'Traffic Volume - Before Scaling'),
    (axes[1], data_scaled, 'Traffic Volume - After Scaling'),
)
for ax, values, title in panels:
    ax.hist(values[:, target_idx], bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

## 5.3 Create Sequences

In [None]:
# Build the model inputs (X) and targets (y) from the scaled array, using the
# window lengths configured in src.config.
sequence_args = dict(
    data=data_scaled,
    target_idx=target_idx,
    input_seq_len=INPUT_SEQ_LEN,
    output_seq_len=OUTPUT_SEQ_LEN,
)
X, y = create_sequences(**sequence_args)

print(f"\nX shape: {X.shape}  (samples, input_seq_len, n_features)")
print(f"y shape: {y.shape}  (samples, output_seq_len)")

In [None]:
# Visualize a sample sequence.
# Clamp the index to the last available sequence so this cell still runs when
# fewer than 1001 sequences exist (e.g. a shortened dataset or longer windows);
# a hard-coded 1000 would raise IndexError in that case.
sample_idx = min(1000, len(X) - 1)

# Time axes and values shared by both panels
input_steps = range(INPUT_SEQ_LEN)
target_steps = range(INPUT_SEQ_LEN, INPUT_SEQ_LEN + OUTPUT_SEQ_LEN)
input_values = X[sample_idx, :, target_idx]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
input_ax, full_ax = axes

# Left panel: input window of the target column only
input_ax.plot(input_steps, input_values, 'b-o', label='Input')
input_ax.set_xlabel('Time Step')
input_ax.set_ylabel('Scaled Value')
input_ax.set_title(f'Sample {sample_idx}: Input Sequence ({INPUT_SEQ_LEN} steps)')
input_ax.legend()

# Right panel: input window followed by the target window
full_ax.plot(input_steps, input_values, 'b-o', label='Input')
full_ax.plot(target_steps, y[sample_idx], 'r-o', label='Target')
# Dashed line marks the boundary between the last input step and the first target step
full_ax.axvline(x=INPUT_SEQ_LEN - 0.5, color='gray', linestyle='--')
full_ax.set_xlabel('Time Step')
full_ax.set_ylabel('Scaled Value')
full_ax.set_title(f'Sample {sample_idx}: Full Sequence (Input + Target)')
full_ax.legend()

plt.tight_layout()
plt.show()

## 5.4 Train/Validation/Test Split

In [None]:
# Partition the sequences into train / validation / test sets using the
# configured ratios (the helper is expected to keep chronological order —
# see src.dataset.train_val_test_split).
splits = train_val_test_split(X, y, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
X_train, y_train, X_val, y_val, X_test, y_test = splits

In [None]:
# Report the shape of every split between two horizontal rules
rule = "=" * 50
shape_lines = [
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_val:   {X_val.shape}",
    f"y_val:   {y_val.shape}",
    f"X_test:  {X_test.shape}",
    f"y_test:  {y_test.shape}",
]

print("\nData Shapes:")
print(rule)
print("\n".join(shape_lines))
print(rule)

## 5.5 Save Data

In [None]:
# Persist everything the training notebook needs: the six sequence arrays,
# the fitted scaler, and a small JSON metadata file.
# (`os` and `save_json` are imported in the top import cell.)

# Create sequences directory if not exists
os.makedirs(SEQUENCES_DIR, exist_ok=True)

# Save sequences — one .npy file per split, written in the listed order
arrays_to_save = {
    'X_train.npy': X_train,
    'y_train.npy': y_train,
    'X_val.npy': X_val,
    'y_val.npy': y_val,
    'X_test.npy': X_test,
    'y_test.npy': y_test,
}
for filename, array in arrays_to_save.items():
    save_numpy(array, os.path.join(SEQUENCES_DIR, filename))

# Save the train-fitted scaler so predictions can be mapped back to original units
save_pickle(scaler, SCALER_PATH)

# Save metadata describing the sequence layout and split sizes
metadata = {
    'input_seq_len': INPUT_SEQ_LEN,
    'output_seq_len': OUTPUT_SEQ_LEN,
    'n_features': X_train.shape[2],
    'target_idx': target_idx,
    'feature_columns': feature_columns,
    'train_samples': len(X_train),
    'val_samples': len(X_val),
    'test_samples': len(X_test)
}
save_json(metadata, os.path.join(SEQUENCES_DIR, 'metadata.json'))

print("\nAll data saved successfully!")

## 5.6 Verify Saved Data

In [None]:
# Round-trip check: reload the training arrays from disk and compare them
# with the in-memory versions.
x_train_path = os.path.join(SEQUENCES_DIR, 'X_train.npy')
y_train_path = os.path.join(SEQUENCES_DIR, 'y_train.npy')
X_train_loaded = np.load(x_train_path)
y_train_loaded = np.load(y_train_path)

print(f"Loaded X_train shape: {X_train_loaded.shape}")
print(f"Loaded y_train shape: {y_train_loaded.shape}")

# `and` short-circuits, so y is only compared when X already matches
arrays_match = np.allclose(X_train, X_train_loaded) and np.allclose(y_train, y_train_loaded)
print(f"\nData matches: {arrays_match}")

## Summary

**Data Preparation completed:**
1. ✅ Loaded selected features data
2. ✅ Scaled data using MinMaxScaler (fit on train only)
3. ✅ Created input/output sequences
4. ✅ Split into train/val/test (time-based)
5. ✅ Saved sequences as numpy arrays
6. ✅ Saved scaler for inverse transform

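**Data Summary:**
- Input sequence length: 24 timesteps (`INPUT_SEQ_LEN`; the first cell prints the configured value)
- Output sequence length: 5 timesteps (`OUTPUT_SEQ_LEN`; the first cell prints the configured value)
- Number of features: depends on the selected feature set (saved as `n_features` in `metadata.json`)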

**Next step:** Model Training (06_Model_Training.ipynb)