In [18]:
# Bus Demand Forecasting - Baseline Model with Holiday Features

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import holidays

# Load Data
# 'doj' (and 'doi' for transactions) are parsed as datetimes at read time.
train = pd.read_csv('train.csv', parse_dates=['doj'])
test = pd.read_csv('test.csv', parse_dates=['doj'])
transactions = pd.read_csv('transactions.csv', parse_dates=['doj', 'doi'])
submission = pd.read_csv('submission_file.csv')

# Keep only the transaction rows whose 'dbd' value equals 15
transactions_15dbd = transactions.loc[transactions['dbd'].eq(15)]

# Left-join that snapshot onto train and test so every train/test row is kept;
# rows without a matching transaction get NaN in the joined columns.
route_day_keys = ['doj', 'srcid', 'destid']
train_df = train.merge(transactions_15dbd, on=route_day_keys, how='left')
test_df = test.merge(transactions_15dbd, on=route_day_keys, how='left')

# ---------------------- Feature Engineering ----------------------

# Date-based features
def add_date_features(df):
    """Add calendar features derived from the ``doj`` column.

    The frame is modified in place and also returned. Columns written, in
    order: ``day_of_week`` (Monday=0 ... Sunday=6), ``is_weekend`` (1 for
    Saturday/Sunday), ``month``, ``is_month_start`` and ``is_month_end``
    (0/1 flags).

    Args:
        df: DataFrame with a datetime-typed ``doj`` column.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    journey = df['doj'].dt
    weekday = journey.dayofweek

    calendar_columns = {
        'day_of_week': weekday,
        # Weekdays run 0..6, so ">= 5" selects exactly Saturday and Sunday.
        'is_weekend': (weekday >= 5).astype(int),
        'month': journey.month,
        'is_month_start': journey.is_month_start.astype(int),
        'is_month_end': journey.is_month_end.astype(int),
    }
    for name, values in calendar_columns.items():
        df[name] = values
    return df

# Holiday-based features
def _days_to_nearest_holiday(dates, holiday_dates, cap, forward):
    """Return whole days from each date to the nearest holiday in one direction.

    Args:
        dates: ``datetime64[ns]`` array of query dates (may contain NaT).
        holiday_dates: sorted ``datetime64[ns]`` array of holiday dates.
        cap: value used when no holiday exists in that direction, and upper
            bound for the returned gaps.
        forward: True to look at holidays on/after each date, False to look
            at holidays on/before each date.

    Returns:
        int64 array, same length as ``dates``.
    """
    gaps = np.full(dates.shape, cap, dtype=np.int64)
    if holiday_dates.size == 0:
        return gaps

    # NaT rows keep the cap, matching a per-row search that finds no holiday.
    valid = ~np.isnat(dates)
    queries = dates[valid]

    if forward:
        # First holiday >= query.
        pos = np.searchsorted(holiday_dates, queries, side='left')
        found = pos < holiday_dates.size
        nearest = holiday_dates[np.minimum(pos, holiday_dates.size - 1)]
        delta = nearest - queries
    else:
        # Last holiday <= query.
        pos = np.searchsorted(holiday_dates, queries, side='right') - 1
        found = pos >= 0
        nearest = holiday_dates[np.maximum(pos, 0)]
        delta = queries - nearest

    whole_days = delta // np.timedelta64(1, 'D')
    gaps[valid] = np.where(found, np.minimum(whole_days, cap), cap)
    return gaps


def add_holiday_features(df, years=(2024, 2025), max_gap=100):
    """Add Indian public-holiday features derived from the ``doj`` column.

    The frame is modified in place and also returned. Columns written:

    * ``is_holiday``: 1 when ``doj`` equals a holiday date, else 0.
    * ``days_to_next_holiday``: whole days until the first holiday on or
      after ``doj``, capped at ``max_gap`` (also used when none exists).
    * ``days_from_last_holiday``: whole days since the last holiday on or
      before ``doj``, capped the same way.
    * ``day_of_week``: Monday=0 ... Sunday=6 (recomputed so this function
      works on its own).
    * ``is_long_weekend``: 1 for a Monday exactly one day after a holiday or
      a Friday exactly one day before a holiday, else 0.

    Args:
        df: DataFrame with a datetime-typed ``doj`` column.
        years: iterable of calendar years to load holidays for. Dates outside
            these years only see holidays inside them, so gaps near the
            edges saturate at ``max_gap``.
        max_gap: cap for the two distance features.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    indian_holidays = holidays.India(years=list(years))

    # Convert holiday dates to pandas Timestamps for comparison
    holiday_timestamps = pd.to_datetime(list(indian_holidays.keys()))

    df['is_holiday'] = df['doj'].isin(holiday_timestamps).astype(int)

    # One sorted lookup table + binary search replaces scanning every holiday
    # for every row in Python.
    holiday_values = holiday_timestamps.sort_values().to_numpy(dtype='datetime64[ns]')
    doj_values = df['doj'].to_numpy(dtype='datetime64[ns]')

    df['days_to_next_holiday'] = _days_to_nearest_holiday(
        doj_values, holiday_values, max_gap, forward=True
    )
    df['days_from_last_holiday'] = _days_to_nearest_holiday(
        doj_values, holiday_values, max_gap, forward=False
    )

    df['day_of_week'] = df['doj'].dt.weekday  # Required again here

    # NOTE(review): as written this flags a Monday following a Sunday holiday
    # and a Friday preceding a Saturday holiday -- confirm that is the
    # intended definition of a long weekend.
    monday_after_holiday = (df['day_of_week'] == 0) & (df['days_from_last_holiday'] == 1)
    friday_before_holiday = (df['day_of_week'] == 4) & (df['days_to_next_holiday'] == 1)
    df['is_long_weekend'] = (monday_after_holiday | friday_before_holiday).astype(int)

    return df

# Apply feature functions
# Each frame receives the calendar columns first, then the holiday columns.
train_df = add_holiday_features(add_date_features(train_df))
test_df = add_holiday_features(add_date_features(test_df))

for frame in (train_df, test_df):
    # Seat to search ratio; the denominator is the search count plus one.
    frame['seat_to_search'] = frame['cumsum_seatcount'].div(
        frame['cumsum_searchcount'].add(1)
    )

    # Fill NA values (every remaining NaN in the frame becomes 0)
    frame.fillna(0, inplace=True)

# ---------------------- Feature Selection ----------------------

# Columns fed to the model, in the order the model will see them.
features = [
    'cumsum_seatcount', 'cumsum_searchcount', 'seat_to_search',
    'srcid_tier', 'destid_tier',
    'day_of_week', 'is_weekend', 'month', 'is_month_start', 'is_month_end',
    'is_holiday', 'days_to_next_holiday', 'days_from_last_holiday', 'is_long_weekend'
]

# Encode categorical tier features
# The integer codes must mean the same thing in train and test. Encoding each
# frame on its own assigns codes from that frame's own set of values, so a
# value missing from one frame shifts every later code. Build one categorical
# dtype from both frames and apply it to each.
for col in ['srcid_tier', 'destid_tier']:
    shared_dtype = (
        pd.concat([train_df[col], test_df[col]], ignore_index=True)
        .astype('category')
        .dtype
    )
    train_df[col] = train_df[col].astype(shared_dtype).cat.codes
    test_df[col] = test_df[col].astype(shared_dtype).cat.codes

# Prepare training data
X = train_df[features]
y = train_df['final_seatcount']

# Train-Test Split for validation
# 20% of the rows are held out; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model Training
xgb_params = dict(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Validation: root-mean-squared error on the held-out rows
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse:.2f}")

# Final Prediction
test_features = test_df[features]
test_preds = model.predict(test_features)

# Submission
# Predictions are assigned positionally.
# NOTE(review): this presumes submission rows are in the same order as test_df -- confirm.
submission['final_seatcount'] = test_preds
submission.to_csv('submission.csv', index=False)
print("Submission file created with holiday features.")

Validation RMSE: 697.84
Submission file created with holiday features.


In [None]:
# List the entries of the current working directory.
import os
os.listdir()

['.config', 'test.csv', 'train.csv', 'transactions.csv', 'sample_data']