In [23]:
import pandas as pd
from statsmodels.tsa.api import VAR
import numpy as np
import warnings

# Suppress pandas warnings
warnings.filterwarnings('ignore', message='Could not infer format')
warnings.filterwarnings('ignore', message='A date index has been provided')

In [29]:
import pandas as pd
from statsmodels.tsa.api import VAR
import numpy as np
import warnings

# Suppress pandas warnings
warnings.filterwarnings('ignore', message='Could not infer format')
warnings.filterwarnings('ignore', message='A date index has been provided')

try:
    # 1. Load the data
    print("Loading data...")
    df = pd.read_csv(
        'daily_trans_report.csv',
        header=None,
        names=[
            'date',
            'Local_Route',
            'Light_Rail',
            'Peak_Service',
            'Rapid_Route',
            'School',
            'Other'
        ]
    )

    print("Raw data shape:", df.shape)
    print("First few rows:")
    print(df.head())

    # 2. Parse dates with explicit format handling
    print("\nParsing dates...")
    date_col = df['date'].astype(str)

    # Try common date formats
    date_formats = [
        '%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%m/%d/%Y',
        '%d/%m/%y', '%d-%m-%y', '%y-%m-%d', '%m/%d/%y'
    ]

    parsed_dates = None
    for fmt in date_formats:
        try:
            parsed_dates = pd.to_datetime(date_col, format=fmt, errors='raise')
            print(f"✓ Parsed dates using format: {fmt}")
            break
        except:
            continue

    # Fallback to automatic parsing if needed
    if parsed_dates is None:
        print("Trying automatic date parsing...")
        parsed_dates = pd.to_datetime(date_col, errors='coerce', dayfirst=True)

    df['date'] = parsed_dates

    # Check for failed date parsing
    if df['date'].isna().sum() > 0:
        print(f"Warning: {df['date'].isna().sum()} dates could not be parsed")
        df = df.dropna(subset=['date'])

    # 3. Prepare numeric data
    print(f"\nPreparing numeric data...")
    numeric_cols = ['Local_Route', 'Light_Rail', 'Peak_Service', 'Rapid_Route', 'School']

    # Convert to numeric
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Set date as index
    df = df.set_index('date')[numeric_cols]

    # Sort by date to ensure monotonic index
    df = df.sort_index()

    # Remove duplicates (keep first occurrence)
    df = df[~df.index.duplicated(keep='first')]

    print(f"Data types after conversion:")
    print(df.dtypes)
    print(f"\nMissing values:")
    print(df.isnull().sum())

    # 4. Clean the data
    print(f"\nCleaning data...")
    initial_shape = df.shape

    # Remove rows with missing values
    df_clean = df.dropna()

    # Remove rows with infinite values
    df_clean = df_clean.replace([np.inf, -np.inf], np.nan).dropna()

    print(f"Shape before cleaning: {initial_shape}")
    print(f"Shape after cleaning: {df_clean.shape}")
    print(f"Removed {initial_shape[0] - df_clean.shape[0]} rows")

    if len(df_clean) < 10:
        raise ValueError(f"Insufficient data: only {len(df_clean)} valid rows remaining")

    # 5. Display data summary
    print(f"\nFinal dataset summary:")
    print(f"Date range: {df_clean.index.min().strftime('%Y-%m-%d')} to {df_clean.index.max().strftime('%Y-%m-%d')}")
    print(f"Total observations: {len(df_clean)}")
    print(f"\nSample data:")
    print(df_clean.head())
    print(f"\nDescriptive statistics:")
    print(df_clean.describe().round(2))

    # 6. Fit VAR model
    print(f"\n{'='*50}")
    print("FITTING VAR MODEL")
    print('='*50)

    # Create VAR model
    model = VAR(df_clean.values)  # Use values to avoid date index issues

    # Determine max lags
    max_lags = min(12, len(df_clean) // 5)
    print(f"Maximum lags to consider: {max_lags}")

    # Fit model with lag selection
    try:
        results = model.fit(maxlags=max_lags, ic='aic')
        selection_method = "AIC"
    except:
        print("AIC selection failed, trying BIC...")
        try:
            results = model.fit(maxlags=max_lags, ic='bic')
            selection_method = "BIC"
        except:
            print("Automatic selection failed, using 2 lags...")
            results = model.fit(maxlags=2)
            selection_method = "Fixed"

    print(f"✓ Model fitted successfully!")
    print(f"Selection method: {selection_method}")
    print(f"Selected lags: {results.k_ar}")
    print(f"Observations used: {results.nobs}")
    print(f"AIC: {results.aic:.2f}")
    print(f"BIC: {results.bic:.2f}")

    # 7. Generate forecasts
    print(f"\n{'='*50}")
    print("GENERATING FORECASTS")
    print('='*50)

    # Prepare forecast input
    lag_order = results.k_ar
    forecast_input = df_clean.values[-lag_order:]

    print(f"Using last {lag_order} observations for forecasting")

    # Generate 7-day forecast
    forecast_steps = 7
    fc_values = results.forecast(y=forecast_input, steps=forecast_steps)

    # Create forecast dates
    last_date = df_clean.index[-1]
    forecast_dates = pd.date_range(
        start=last_date + pd.Timedelta(days=1),
        periods=forecast_steps,
        freq='D'
    )

    # Create forecast DataFrame
    forecast_df = pd.DataFrame(
        fc_values,
        index=forecast_dates,
        columns=df_clean.columns
    )

    # Ensure non-negative forecasts (assuming these represent counts)
    forecast_df = forecast_df.clip(lower=0)

    # 8. Display results
    print(f"\n{'='*60}")
    print("FORECAST RESULTS - NEXT 7 DAYS")
    print('='*60)

    # Format the forecast nicely
    forecast_display = forecast_df.copy()
    for col in forecast_display.columns:
        forecast_display[col] = forecast_display[col].round(1)

    print(forecast_display.to_string())

    # Summary statistics
    print(f"\n{'='*50}")
    print("FORECAST SUMMARY")
    print('='*50)

    total_forecast = forecast_df.sum(axis=1)
    print(f"Daily totals:")
    for date, total in total_forecast.items():
        print(f"  {date.strftime('%Y-%m-%d')}: {total:.1f}")

    print(f"\nAverage daily forecast by category:")
    for col in forecast_df.columns:
        avg_forecast = forecast_df[col].mean()
        print(f"  {col}: {avg_forecast:.1f}")

    print(f"\nTotal 7-day forecast: {forecast_df.sum().sum():.1f}")

    # Model diagnostics
    print(f"\n{'='*50}")
    print("MODEL DIAGNOSTICS")
    print('='*50)
    print(f"Model: VAR({results.k_ar})")
    print(f"Variables: {len(df_clean.columns)}")
    print(f"Sample period: {df_clean.index.min().strftime('%Y-%m-%d')} to {df_clean.index.max().strftime('%Y-%m-%d')}")
    print(f"Training observations: {results.nobs}")
    print(f"Forecast horizon: {forecast_steps} days")

except FileNotFoundError:
    print("❌ Error: 'data.csv' file not found.")
    print("Please check that the file exists in the current directory.")

except pd.errors.EmptyDataError:
    print("❌ Error: The CSV file is empty.")

except pd.errors.ParserError as e:
    print(f"❌ Error parsing CSV: {e}")
    print("Please check the file format.")

except ValueError as e:
    print(f"❌ Data Error: {e}")

except Exception as e:
    print(f"❌ Unexpected error: {type(e).__name__}: {e}")
    import traceback
    traceback.print_exc()

finally:
    # Reset warnings
    warnings.resetwarnings()

print(f"\n{'='*50}")
print("ANALYSIS COMPLETE")
print('='*50)

Loading data...
Raw data shape: (1919, 7)
First few rows:
         date  Local_Route  Light_Rail  Peak_Service  Rapid_Route  School  \
0        Date  Local Route  Light Rail  Peak Service  Rapid Route  School   
1  30/08/2024        16436       10705           225        19026    3925   
2  15/09/2023        15499       10671           267        18421    4519   
3  28/12/2021         1756        2352             0         3775       0   
4  11/01/2023        10536        8347           223        14072       0   

   Other  
0  Other  
1     59  
2     61  
3     13  
4     48  

Parsing dates...
Trying automatic date parsing...

Preparing numeric data...
Data types after conversion:
Local_Route     int64
Light_Rail      int64
Peak_Service    int64
Rapid_Route     int64
School          int64
dtype: object

Missing values:
Local_Route     0
Light_Rail      0
Peak_Service    0
Rapid_Route     0
School          0
dtype: int64

Cleaning data...
Shape before cleaning: (1918, 5)
Shape after