In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from cmdstanpy import CmdStanModel
import matplotlib.pyplot as plt
import os
import arviz as az

In [None]:
# Fix the global NumPy seed so the simulated draws further down are repeatable
np.random.seed(42)

# Read the already-preprocessed train and test splits (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_train_data.csv', 'processed_test_data.csv')
)

# An empty training set is fatal; an empty test set only disables the evaluation cells
if not len(train_data):
    raise ValueError("Train data is empty. Check preprocessing step.")
if not len(test_data):
    print("Warning: Test data is empty. Predictions will be skipped.")

In [None]:
# Name the target and the categorical predictors up front
target = 'passenger_count'
categorical_cols = ['season', 'is_raining', 'day_of_week', 'month']

# Every remaining column (other than 'date' and the target) is used as a numeric predictor
non_numeric = {'date', target, *categorical_cols}
numerical_cols = [col for col in train_data.columns if col not in non_numeric]

# Numeric predictors first, categorical predictors last
features = numerical_cols + categorical_cols

In [None]:
# Prepare training and test sets
X_train = train_data[features]
y_train = train_data[target]

# When the test split has no rows, fall back to empty containers that still
# have the training layout so the later `len(...) > 0` guards keep working.
has_test_rows = len(test_data) > 0
X_test = test_data[features] if has_test_rows else pd.DataFrame(columns=X_train.columns)
# A bare pd.Series() has a pandas-version-dependent default dtype (and warns on
# pandas 1.x), so spell out the dtype and keep the target's name.
y_test = test_data[target] if has_test_rows else pd.Series(dtype=float, name=target)

In [None]:
# One-hot encode the categorical predictors (first level of each dropped) and
# forward the numeric predictors unchanged.
categorical_encoder = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_encoder, categorical_cols),
    ('num', 'passthrough', numerical_cols),
])

# Fit the encoding on the training rows only, then reuse it for the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
if len(X_test) > 0:
    X_test_transformed = preprocessor.transform(X_test)
else:
    # Empty placeholder; downstream cells only check its length
    X_test_transformed = np.array([])

In [None]:
# Column labels of the transformed matrix, in the transformer order declared
# above: encoded categorical columns first, then the passthrough numeric ones.
fitted_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = fitted_encoder.get_feature_names_out(categorical_cols)
feature_names = [*cat_feature_names, *numerical_cols]

In [None]:
# Prior predictive checks for parameters: simulate the intercept, the
# coefficient vector and the noise scale, then report the range of each.
n_sim = 1000
n_coefficients = len(feature_names)

beta0_sim = np.random.normal(loc=0, scale=5, size=n_sim)
beta_sim = np.random.normal(loc=0, scale=5, size=(n_sim, n_coefficients))
# Absolute value of a scaled standard Cauchy, i.e. a half-Cauchy with scale 2.5
sigma_sim = np.abs(2.5 * np.random.standard_cauchy(size=n_sim))

print("Prior Parameter Checks:")
for label, draws in (('beta0', beta0_sim), ('beta', beta_sim), ('sigma', sigma_sim)):
    print(f"{label} range: [{draws.min():.2f}, {draws.max():.2f}]")

In [None]:
# Prior predictive checks for measurements
if len(X_train_transformed) > 0:
    # Evaluate at most n_sim training rows to keep the simulated array small
    X_prior = X_train_transformed[:n_sim]

    # Shape (n_sim, n_rows): row d holds the linear predictor of every
    # observation under prior draw d. The previous layout was (n_rows, n_sim)
    # but was indexed by draw, which paired sigma from draw i with the means of
    # observation i and raised IndexError when there were fewer than n_sim rows.
    mu_sim = beta0_sim[:, None] + beta_sim @ X_prior.T

    # Broadcast each draw's own noise scale across its row of means
    y_sim = np.random.normal(mu_sim, sigma_sim[:, None])
    print(f"Prior Predictive Measurements range: [{y_sim.min():.2f}, {y_sim.max():.2f}]")

    fig, ax = plt.subplots()
    ax.hist(y_sim.ravel(), bins=50, density=True)
    ax.set_title('Prior Predictive Distribution of Passenger Counts')
    ax.set_xlabel('Normalized Passenger Count')
    ax.set_ylabel('Density')
    fig.savefig('prior_predictive_measurements.png')
    # Close explicitly so the figure is not kept alive by the kernel
    plt.close(fig)

In [None]:
# Assemble the inputs for the Stan program's data block
n_test = len(X_test_transformed)
if n_test > 0:
    X_new = X_test_transformed
else:
    # No test rows: hand Stan a 0 x K matrix so the declared shape still matches
    X_new = np.zeros((0, len(feature_names)))

stan_data = dict(
    N=len(X_train_transformed),
    K=X_train_transformed.shape[1],
    X=X_train_transformed,
    y=y_train.values,
    N_new=n_test,
    X_new=X_new,
)

In [None]:
# Define Stan model with log-likelihood for arviz
# The program is a linear regression: normal likelihood, normal priors on the
# intercept and coefficients, Cauchy prior on the (positive) noise scale.
# Generated quantities provide test-set predictions and pointwise log-likelihood.
stan_code = """
data {
  int<lower=0> N;          // Number of training samples
  int<lower=0> K;          // Number of features
  matrix[N, K] X;          // Feature matrix
  vector[N] y;             // Target variable
  int<lower=0> N_new;      // Number of test samples
  matrix[N_new, K] X_new;  // Test feature matrix
}
parameters {
  real beta0;              // Intercept
  vector[K] beta;          // Feature coefficients
  real<lower=0> sigma;     // Noise SD
}
model {
  vector[N] mu;
  // Priors
  beta0 ~ normal(0, 5);
  beta ~ normal(0, 5);
  sigma ~ cauchy(0, 2.5);

  // Likelihood
  for (n in 1:N) {
    mu[n] = beta0 + dot_product(X[n], beta);
  }
  y ~ normal(mu, sigma);
}
generated quantities {
  vector[N_new] y_pred;
  vector[N] log_lik;
  for (n in 1:N_new) {
    y_pred[n] = normal_rng(dot_product(X_new[n], beta) + beta0, sigma);
  }
  for (n in 1:N) {
    log_lik[n] = normal_lpdf(y[n] | dot_product(X[n], beta) + beta0, sigma);
  }
}
"""

# Save Stan model
# Only touch the file when its content actually changes: rewriting it on every
# run refreshes the modification time, which makes the source look newer than
# the compiled executable and forces a full recompile each time.
stan_path = 'linear_regression.stan'
existing_code = None
if os.path.exists(stan_path):
    with open(stan_path) as stan_file:
        existing_code = stan_file.read()
if existing_code != stan_code:
    with open(stan_path, 'w') as stan_file:
        stan_file.write(stan_code)

In [None]:
# Build the model from the saved Stan file, then run the sampler on stan_data
model = CmdStanModel(stan_file='linear_regression.stan')

# Sampler configuration: 4 chains, 500 warmup + 1000 kept iterations each, fixed seed
sampler_settings = dict(chains=4, iter_sampling=1000, iter_warmup=500, seed=42)
fit = model.sample(data=stan_data, **sampler_settings)

In [None]:
# Check sampling diagnostics
summary = fit.summary()

# Print available columns in the summary DataFrame
print("Summary DataFrame columns:", summary.columns.tolist())

# Largest R-hat across every row of the summary (NaN entries are skipped by max)
rhat = summary['R_hat'].max()

# The effective-sample-size column is not named the same in every CmdStan
# release: 'N_Eff' in some, 'ESS_bulk' / 'ESS_tail' in others. Use whichever
# is present so the ESS check is not silently skipped.
ess_cols = [col for col in ('N_Eff', 'ESS_bulk', 'ESS_tail') if col in summary.columns]
if ess_cols:
    # Smallest effective sample size over all rows and all available ESS columns
    n_eff = summary[ess_cols].min().min()
    ess_label = '/'.join(ess_cols)
    print(f"Sampling Diagnostics: Max R-hat = {rhat:.4f}, Min {ess_label} = {n_eff:.0f}")
    if rhat > 1.1 or n_eff < 100:
        print("Warning: Sampling issues detected. Consider increasing iter_sampling or adapt_delta.")
else:
    print(f"Sampling Diagnostics: Max R-hat = {rhat:.4f}. N_Eff column not found in summary.")
    if rhat > 1.1:
         print("Warning: High R-hat detected. Sampling issues likely. Consider increasing iter_sampling or adapt_delta.")

In [None]:
# Posterior predictive analysis: point-prediction accuracy and interval coverage
if len(X_test_transformed) > 0:
    # Draws of y_pred; the per-row average over draws is the point prediction
    y_pred_samples = fit.stan_variable('y_pred')
    y_pred = y_pred_samples.mean(axis=0)

    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print("Posterior Predictive Metrics:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Check consistency: share of test targets inside the central 95% interval
    credible_intervals = np.percentile(y_pred_samples, [2.5, 97.5], axis=0)
    ci_lower, ci_upper = credible_intervals
    covered = (y_test >= ci_lower) & (y_test <= ci_upper)
    within_ci = np.mean(covered)
    print(f"Proportion of test data within 95% credible intervals: {within_ci:.2f}")
    if within_ci < 0.9:
        print("Warning: Less than 90% of test data within credible intervals. Model may miss non-linear effects or key features.")

In [None]:
# Posterior predictive analysis
if len(X_test_transformed) > 0:
    # Fetch the y_pred draws once; the per-row mean is the point prediction
    y_pred_samples = fit.stan_variable('y_pred')
    y_pred = y_pred_samples.mean(axis=0)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Posterior Predictive Metrics (Model 1):")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")
    # Check consistency: share of test targets inside the central 95% interval
    credible_intervals = np.percentile(y_pred_samples, [2.5, 97.5], axis=0)
    within_ci = np.mean((y_test >= credible_intervals[0]) & (y_test <= credible_intervals[1]))
    print(f"Proportion of test data within 95% credible intervals: {within_ci:.2f}")
    if within_ci < 0.9:
        print("Warning: Less than 90% of test data within credible intervals. Consider non-linear effects or additional features.")

    # Shared x-axis for both figures: the 'date' column when present, else the row index
    has_dates = 'date' in test_data.columns
    if has_dates:
        # The CSV was read without date parsing, so 'date' is presumably still
        # text; plotting text gives one categorical tick per row. Parse it so
        # matplotlib draws a real time axis, and fall back to the raw values if
        # any entry cannot be parsed.
        x_values = pd.to_datetime(test_data['date'], errors='coerce')
        if x_values.isna().any():
            x_values = test_data['date']
        x_label = 'Date'
    else:
        x_values, x_label = y_test.index, 'Index'

    # Visualization: Actual vs. Predicted Passenger Counts
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(x_values, y_test, label='Actual', color='blue')
    ax.plot(x_values, y_pred, label='Predicted', color='red', alpha=0.7)
    ax.set_title('Actual vs. Predicted Passenger Counts (Model 1)')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Normalized Passenger Count')
    ax.legend()
    if has_dates:
        # Date labels are long; tilt them as the original date branch did
        ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig('actual_vs_predicted_model1.png')
    plt.close(fig)

    # Visualization: Residuals Plot
    residuals = y_test - y_pred
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(x_values, residuals, color='purple', alpha=0.5)
    # Zero line as the reference for over- vs. under-prediction
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)
    ax.set_title('Residuals of Predicted vs. Actual Passenger Counts (Model 1)')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Residuals')
    fig.tight_layout()
    fig.savefig('residuals_model1.png')
    plt.close(fig)

In [None]:
# Parameter marginal distributions: posterior mean, SD and central 95% interval
def _summary_line(label, draws):
    """Format one parameter's posterior mean, SD and 2.5%/97.5% percentiles."""
    lower, upper = np.percentile(draws, [2.5, 97.5])
    return (
        f"{label}: Mean = {draws.mean():.4f}, SD = {draws.std():.4f}, "
        f"95% CI = [{lower:.4f}, {upper:.4f}]"
    )


beta_samples = fit.stan_variable('beta')
beta0_samples = fit.stan_variable('beta0')
sigma_samples = fit.stan_variable('sigma')

print("\nParameter Summaries:")
# One line per coefficient, labelled with its feature name (column i of beta)
for col_idx, feature in enumerate(feature_names):
    print(_summary_line(feature, beta_samples[:, col_idx]))
print(_summary_line('beta0', beta0_samples))
print(_summary_line('sigma', sigma_samples))

In [None]:
# Plot parameter histograms
# Only the first 4 coefficients are shown, for brevity. They are laid out on a
# 2 x 2 grid; the earlier 10 x 4 grid squeezed the 4 panels into the top tenth
# of the figure.
shown_names = feature_names[:4]
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for i, ax in enumerate(axes.ravel()):
    if i >= len(shown_names):
        # Fewer than 4 coefficients: hide the unused panels instead of failing
        ax.set_visible(False)
        continue
    ax.hist(beta_samples[:, i], bins=30, density=True)
    ax.set_title(f'Posterior: {shown_names[i]}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
fig.tight_layout()
fig.savefig('parameter_histograms.png')
plt.show()

In [None]:
# Compute information criteria with arviz using from_cmdstanpy
# 'log_lik' is the pointwise log-likelihood vector from generated quantities
idata = az.from_cmdstanpy(posterior=fit, log_likelihood='log_lik')
waic = az.waic(idata)
# Request pointwise results explicitly: `pareto_k` is only part of the returned
# object when pointwise output is enabled, so relying on the configured default
# can make the check below fail with an AttributeError.
loo = az.loo(idata, pointwise=True)
print(f"\nInformation Criteria (Model 1):")
print(f"WAIC: {waic.elpd_waic} (+/- {waic.se})")
print(f"PSIS-LOO: {loo.elpd_loo} (+/- {loo.se})")
# Vectorised check instead of iterating the array element by element
if np.any(np.asarray(loo.pareto_k) > 0.7):
    print("Warning: High Pareto k values detected. Results may be unreliable.")