# Speedband Prediction Model Training

This notebook trains an XGBoost model to predict the next speedband value for each link in the traffic network.

## Steps:
1. Setup and Data Upload
2. Data Loading and Analysis
3. Data Preprocessing
4. Feature Engineering
5. Train/Validation/Test Split
6. Model Training
7. Model Evaluation
8. Model Saving

In [None]:
# Install required packages
%pip install pandas numpy scikit-learn xgboost joblib pyarrow -q

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Tuple, Dict, Any
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")

## Step 1: Upload Data File

Upload the `correlated_traffic_data.parquet` file using the file uploader below, or mount Google Drive if your file is stored there.

In [None]:
# Pick where the parquet file comes from and set PARQUET_FILE accordingly.
from google.colab import drive, files

# True  -> read the file from a mounted Google Drive (recommended for large files)
# False -> upload the file from your machine through the Colab file picker
USE_GOOGLE_DRIVE = True

if USE_GOOGLE_DRIVE:
    drive.mount('/content/drive')

    # Update this path to your parquet file location in Google Drive
    PARQUET_FILE = '/content/drive/MyDrive/correlated_traffic_data.parquet'
else:
    # The upload result is keyed by file name; the first uploaded file is used
    uploaded = files.upload()
    if not uploaded:
        raise RuntimeError(
            "No file was uploaded; re-run this cell and select the parquet file."
        )
    PARQUET_FILE = list(uploaded.keys())[0]

print(f"Parquet file path: {PARQUET_FILE}")

## Step 2: Load and Analyze Data

In [None]:
# Read the parquet file into `df` and report its size and column names
print("Loading data...")
df = pd.read_parquet(PARQUET_FILE)

row_count = len(df)
column_names = list(df.columns)
print(f"Loaded {row_count:,} rows")
print(f"Columns: {column_names}")

# Preview (rendered as the cell output)
print("\nFirst few rows:")
df.head()

In [None]:
# Basic data info: shape, dtypes, missing-value counts and summary statistics
separator = "=" * 80
print(separator)
print("Data Overview")
print(separator)

n_rows, n_cols = df.shape
# The multiplication sign was previously mis-encoded and printed as mojibake
print(f"Shape: {n_rows:,} rows × {n_cols} columns")

print("\nData types:")
print(df.dtypes)

print("\nMissing values:")
print(df.isnull().sum())

# describe() is the last expression so the notebook renders it as a table
print("\nBasic statistics:")
df.describe()

In [None]:
# Time series analysis: temporal coverage and per-link record counts
df['generated_at'] = pd.to_datetime(df['generated_at'])

observed_at = df['generated_at']
earliest, latest = observed_at.min(), observed_at.max()
n_unique_timestamps = observed_at.nunique()
link_ids = df['LinkID']

print("Time Series Analysis:")
print(f"Date range: {earliest} to {latest}")
print(f"Total time span: {latest - earliest}")
print(f"Unique timestamps: {n_unique_timestamps}")
print(f"Average records per timestamp: {len(df) / n_unique_timestamps:.1f}")
print(f"\nUnique links: {link_ids.nunique()}")
print(f"Records per link: {link_ids.value_counts().describe()}")

In [None]:
# Speedband distribution: counts per band, summary statistics and a bar chart
speedband_values = df['speedband']
band_counts = speedband_values.value_counts().sort_index()

print("Speedband Distribution:")
print(band_counts)
print("\nSpeedband statistics:")
print(f"Min: {speedband_values.min()}, Max: {speedband_values.max()}")
print(f"Mean: {speedband_values.mean():.2f}, Std: {speedband_values.std():.2f}")

# Bar chart of the counts printed above
fig, ax = plt.subplots(figsize=(10, 6))
band_counts.plot(kind='bar', ax=ax)
ax.set_title('Speedband Distribution')
ax.set_xlabel('Speedband')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Correlation analysis over every numeric column (needs at least two of them)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) >= 2:
    corr_matrix = df[numeric_cols].corr()
    print("Correlation Matrix:")
    print(corr_matrix)

    # Annotated heatmap of the full matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Rank the other columns by the absolute strength of their correlation
    # with speedband; the self-correlation entry is removed after sorting
    if 'speedband' in corr_matrix.columns:
        print("\nCorrelations with speedband:")
        ranked = corr_matrix['speedband'].sort_values(ascending=False, key=abs)
        for column_name, coefficient in ranked.drop('speedband').items():
            print(f"  {column_name}: {coefficient:.4f}")

## Step 3: Data Preprocessing

In [None]:
# Preprocessing: make sure timestamps are datetimes and order rows per link
banner = "=" * 80
print(banner)
print("Step 1: Data Preprocessing")
print(banner)

# No-op if the column was already converted by an earlier cell
df['generated_at'] = pd.to_datetime(df['generated_at'])

# Rows must be ordered by link, then time, with a fresh 0..n-1 index
print("Sorting by LinkID and timestamp...")
df = df.sort_values(by=['LinkID', 'generated_at'], ignore_index=True)

observed_at = df['generated_at']
print(f"Data shape: {df.shape}")
print(f"Date range: {observed_at.min()} to {observed_at.max()}")

## Step 4: Feature Engineering

In [None]:
# Feature engineering, part 1: calendar features taken from the timestamp
banner = "=" * 80
print(banner)
print("Step 2: Feature Engineering")
print(banner)

print("Creating time-based features...")
# Each new column has the same name as the datetime component it copies
for component in ('hour', 'minute'):
    df[component] = getattr(df['generated_at'].dt, component)

print("Creating lag features and link-specific features...")
print("(This may take a while for large datasets...)")

In [None]:
# Function to create features for each link
def create_link_features(group):
    """Build lag, rolling and history features plus the target for one link.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single link. Must contain the columns
        ``generated_at``, ``speedband`` and ``rainfall_mm``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by ``generated_at`` with a fresh 0..n-1
        index and these extra columns:

        * ``speedband_lag{1,2,3,5}`` - speedband 1/2/3/5 rows earlier.
        * ``speedband_rolling_{mean,std,min,max}_{3,5,10}`` - statistics of
          the previous rows' speedband over the given window.
        * ``speedband_changes_{3,5}`` - number of speedband changes among the
          previous rows inside the window.
        * ``speedband_diff`` - most recent change between previous rows.
        * ``link_avg_speedband`` / ``link_std_speedband`` - expanding mean/std
          of all previous rows.
        * ``rainfall_rolling_mean_{3,5}`` - rolling mean of previous rainfall.
        * ``target`` - speedband of the next row (NaN on the last row).

    Notes
    -----
    Every feature is built from ``shift(1)`` or older values, i.e. strictly
    from rows before the current one. Lag and history columns are NaN on the
    first rows of the link; filling them is left to the caller.
    """
    group = group.sort_values('generated_at').reset_index(drop=True)

    speedband = group['speedband']
    # Speedband as of the previous row; the basis of all rolling/history features
    prev_speedband = speedband.shift(1)

    # Lag features (PRIMARY FEATURES)
    group['speedband_lag1'] = prev_speedband
    group['speedband_lag2'] = speedband.shift(2)
    group['speedband_lag3'] = speedband.shift(3)
    group['speedband_lag5'] = speedband.shift(5)

    # Rolling statistics over windows
    for window in [3, 5, 10]:
        rolling = prev_speedband.rolling(window=window, min_periods=1)
        group[f'speedband_rolling_mean_{window}'] = rolling.mean()
        # std of a single observation is NaN; treat it as "no spread"
        group[f'speedband_rolling_std_{window}'] = rolling.std().fillna(0)
        group[f'speedband_rolling_min_{window}'] = rolling.min()
        group[f'speedband_rolling_max_{window}'] = rolling.max()

    # Change between consecutive previous rows. The leading NaNs (no earlier
    # row to compare with) are filled with 0 BEFORE testing for a change:
    # `NaN != 0` is True, so testing first would count them as changes.
    prev_diff = prev_speedband.diff().fillna(0)
    changed = prev_diff.ne(0).astype(int)

    # Number of changes in rolling window
    group['speedband_changes_3'] = changed.rolling(window=3, min_periods=1).sum()
    group['speedband_changes_5'] = changed.rolling(window=5, min_periods=1).sum()

    # Speedband change rate
    group['speedband_diff'] = prev_diff

    # Link-specific features
    # Historical average (using all previous data)
    history = prev_speedband.expanding()
    group['link_avg_speedband'] = history.mean()
    group['link_std_speedband'] = history.std().fillna(0)

    # Rolling average of rainfall
    prev_rainfall = group['rainfall_mm'].shift(1)
    group['rainfall_rolling_mean_3'] = prev_rainfall.rolling(window=3, min_periods=1).mean()
    group['rainfall_rolling_mean_5'] = prev_rainfall.rolling(window=5, min_periods=1).mean()

    # Target variable: next speedband value
    group['target'] = speedband.shift(-1)

    return group

# Apply feature engineering to all links
# Iterating the groups (rather than groupby.apply) hands each full per-link
# frame, including the LinkID column, to create_link_features. groupby.apply
# operating on the grouping column is deprecated in pandas 2.2.
print("Processing links (this may take a while)...")
link_frames = [
    create_link_features(link_rows)
    for _, link_rows in df.groupby('LinkID')
]
# Groups are visited in sorted LinkID order; rebuild one frame with a fresh index
df = pd.concat(link_frames, ignore_index=True)
print("Feature engineering complete!")

In [None]:
# Impute the NaNs that lag / rolling / link-history features have in the first
# rows of each link.
# NOTE(review): the median is taken over the whole frame, before the
# train/validation/test split - confirm this is acceptable.
history_tags = ('lag', 'rolling', 'link_')
lag_cols = [name for name in df.columns if any(tag in name for tag in history_tags)]
for name in lag_cols:
    column = df[name]
    is_numeric = column.dtype in [np.float64, np.int64]
    # Numeric columns get their median; anything else falls back to 0
    df[name] = column.fillna(column.median() if is_numeric else 0)

# Integer codes for LinkID so XGBoost can consume it as a numeric feature
print("Encoding LinkID...")
df['LinkID_encoded'] = df['LinkID'].astype('category').cat.codes

# Boolean incident flag -> 0/1
df['has_incident'] = df['has_incident'].astype(int)

# The last row of each link has no "next" speedband, hence no target
print("Dropping rows with missing target...")
initial_rows = len(df)
df = df[df['target'].notna()].reset_index(drop=True)
print(f"Dropped {initial_rows - len(df):,} rows with missing target")

print(f"Final feature matrix shape: {df.shape}")
non_feature_cols = {'LinkID', 'generated_at', 'speedband', 'target'}
feature_cols = [c for c in df.columns if c not in non_feature_cols]
print(f"Features created: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # Show first 10

## Step 5: Train/Validation/Test Split

In [None]:
# Chronological split done separately for every link:
# earliest 70% of its rows -> train, next 15% -> validation, final 15% -> test
banner = "=" * 80
print(banner)
print("Step 3: Train/Validation/Test Split")
print(banner)

train_dfs, val_dfs, test_dfs = [], [], []

print("Splitting by link (time-based split)...")
for _, link_rows in df.groupby('LinkID'):
    ordered = link_rows.sort_values('generated_at').reset_index(drop=True)
    n_rows = len(ordered)

    # Cut points are floored, so any remainder rows end up in the later splits
    train_cut = int(n_rows * 0.70)
    val_cut = int(n_rows * 0.85)

    slices = (
        ordered.iloc[:train_cut],
        ordered.iloc[train_cut:val_cut],
        ordered.iloc[val_cut:],
    )
    for bucket, chunk in zip((train_dfs, val_dfs, test_dfs), slices):
        bucket.append(chunk)

train_df, val_df, test_df = (
    pd.concat(chunks, ignore_index=True)
    for chunks in (train_dfs, val_dfs, test_dfs)
)

for label, subset in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{label} set: {len(subset):,} rows ({len(subset)/len(df)*100:.1f}%)")

In [None]:
# Build (features, target) pairs for each split.
# Identifier, timestamp, raw speedband and the target itself are not features.
exclude_cols = ['LinkID', 'generated_at', 'speedband', 'target']
feature_cols = [name for name in df.columns if name not in exclude_cols]

(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split[feature_cols], split['target'])
    for split in (train_df, val_df, test_df)
)

print(f"Feature matrix shape: {X_train.shape}")
print(f"Number of features: {len(feature_cols)}")

## Step 6: Model Training

In [None]:
# Train XGBoost model
print("=" * 80)
print("Step 4: Model Training")
print("=" * 80)

print("Training XGBoost regressor...")

# XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    # Early stopping is configured on the estimator: passing it to fit() is
    # deprecated since XGBoost 1.6 and rejected by recent releases, which is
    # what the unpinned install at the top of this notebook pulls in.
    'early_stopping_rounds': 20,
}

model = xgb.XGBRegressor(**params)

# Train with early stopping: training halts once the validation metric has not
# improved for `early_stopping_rounds` consecutive boosting rounds
print("Training with early stopping on validation set...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

print(f"\nBest iteration: {model.best_iteration}")
print(f"Best score: {model.best_score:.4f}")

## Step 7: Model Evaluation

In [None]:
# Section banner for the evaluation output
banner = "=" * 80
print(banner)
print("Step 6: Model Evaluation")
print(banner)

def evaluate_model(model, X, y, dataset_name, clip_range=(0, 8)):
    """Score ``model`` on one dataset, print the metrics and return them.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix passed unchanged to ``model.predict``.
    y : array-like
        True target values, one per row of ``X``.
    dataset_name : str
        Label used in the printed header (e.g. "Training").
    clip_range : tuple of (low, high), optional
        Predictions are clipped to this closed interval before scoring.
        Defaults to ``(0, 8)``.

    Returns
    -------
    metrics : dict
        ``MAE``, ``RMSE``, ``MAPE`` (percent) and ``Accuracy`` (percent of
        rounded predictions that equal the target exactly).
    y_pred : numpy.ndarray
        The clipped, unrounded predictions.

    Raises
    ------
    ValueError
        If ``y`` is empty or its length differs from the number of predictions.

    Notes
    -----
    MAPE is averaged over rows whose true value is non-zero; a zero target has
    no defined percentage error and would otherwise dominate the mean. If every
    target is zero, MAPE is NaN.
    """
    print(f"\nEvaluating on {dataset_name} set...")

    # Plain positional arrays, so pandas index labels cannot affect the arithmetic
    y_true = np.asarray(y, dtype=float).ravel()

    # Predict, then clip predictions to the valid range
    low, high = clip_range
    y_pred = np.clip(np.asarray(model.predict(X), dtype=float).ravel(), low, high)

    if y_true.size == 0:
        raise ValueError("Cannot evaluate on an empty dataset.")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Got {y_pred.size} predictions for {y_true.size} targets."
        )

    # Calculate metrics
    errors = y_true - y_pred
    abs_errors = np.abs(errors)
    mae = abs_errors.mean()
    rmse = np.sqrt(np.mean(errors ** 2))

    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(abs_errors[nonzero] / np.abs(y_true[nonzero])) * 100
    else:
        mape = float('nan')

    y_pred_rounded = np.round(y_pred).astype(int)
    accuracy = (y_pred_rounded == y_true).mean() * 100

    metrics = {
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape,
        'Accuracy': accuracy
    }

    print(f"  MAE: {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAPE: {mape:.2f}%")
    print(f"  Accuracy (exact match): {accuracy:.2f}%")

    return metrics, y_pred

# Score the model on each split, in train -> validation -> test order
(train_metrics, y_train_pred), (val_metrics, y_val_pred), (test_metrics, y_test_pred) = (
    evaluate_model(model, features, labels, split_name)
    for features, labels, split_name in (
        (X_train, y_train, "Training"),
        (X_val, y_val, "Validation"),
        (X_test, y_test, "Test"),
    )
)

In [None]:
# Predicted vs actual speedband, one scatter panel per split
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

panels = [
    ('Training', y_train, y_train_pred),
    ('Validation', y_val, y_val_pred),
    ('Test', y_test, y_test_pred),
]
for ax, (name, y_true, y_pred) in zip(axes, panels):
    ax.scatter(y_true, y_pred, alpha=0.1, s=1)
    # Dashed diagonal marks where predicted equals actual
    ax.plot([0, 8], [0, 8], 'r--', lw=2)
    ax.set(
        xlabel='Actual Speedband',
        ylabel='Predicted Speedband',
        title=f'{name} Set Predictions',
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance: list and plot the (up to) 20 highest-scoring features
print("\nTop 20 Most Important Features:")
print("-" * 60)

importance = model.feature_importances_
# Positions of the largest scores, highest first
indices = np.argsort(importance)[::-1][:20]
top_features = [feature_cols[pos] for pos in indices]
top_importance = [importance[pos] for pos in indices]

for rank, (feature_name, score) in enumerate(zip(top_features, top_importance), start=1):
    print(f"{rank:2d}. {feature_name:40s} {score:.4f}")

# Horizontal bar chart with the most important feature at the top
fig, ax = plt.subplots(figsize=(10, 8))
bar_positions = list(range(len(top_features)))
ax.barh(bar_positions, top_importance)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features)
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Feature Importances')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Step 8: Save Model

In [None]:
# Persist the fitted model and the ordered list of feature names
banner = "=" * 80
print(banner)
print("Step 5: Saving Model")
print(banner)

# Output locations (the directory is created if it does not exist yet)
MODEL_DIR = '/content/models'
MODEL_FILE = os.path.join(MODEL_DIR, 'speedband_model.joblib')
FEATURE_NAMES_FILE = os.path.join(MODEL_DIR, 'feature_names.txt')
os.makedirs(MODEL_DIR, exist_ok=True)

print(f"Saving model to {MODEL_FILE}...")
joblib.dump(model, MODEL_FILE)

# One feature name per line, in the column order used for training
print(f"Saving feature names to {FEATURE_NAMES_FILE}...")
with open(FEATURE_NAMES_FILE, 'w') as f:
    f.writelines(f"{name}\n" for name in feature_cols)

print("Model saved successfully!")

In [None]:
# Optional: send the saved artifacts to the local machine via the browser
from google.colab import files

# Model first, then the feature-name list
for artifact_path in (MODEL_FILE, FEATURE_NAMES_FILE):
    files.download(artifact_path)

print("Files downloaded!")

## Training Summary

In [None]:
# Final recap: headline metrics per split and where the model was written
rule = "=" * 80
print(rule)
print("Training Summary")
print(rule)

for split_label, split_metrics in (
    ("Training", train_metrics),
    ("Validation", val_metrics),
    ("Test", test_metrics),
):
    print(f"{split_label} MAE: {split_metrics['MAE']:.4f}, Accuracy: {split_metrics['Accuracy']:.2f}%")

print(f"\nModel saved to: {MODEL_FILE}")
print(f"End time: {datetime.now()}")
print(rule)