In [5]:
# ================== Imports ==================
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")


In [13]:
# ================== Data Loading ==================
train = pd.read_csv("/kaggle/input/redbus-dataset/train-zip/train/train.csv", parse_dates=["doj"])
transactions = pd.read_csv("/kaggle/input/redbus-dataset/train-zip/train/transactions.csv", parse_dates=["doj", "doi"])
test = pd.read_csv("/kaggle/input/redbus-dataset/test.csv", parse_dates=["doj"])
holidays_df = pd.read_csv("/kaggle/input/holiday-23-25/holiday_dates.csv")  # Has columns: start_date, end_date, holiday

# ================== Holiday Preprocessing ==================
# Define date parser to ensure correct conversion
date_parser = lambda x: pd.to_datetime(x, format='%m/%d/%Y', errors='coerce')

# Apply date parser to 'Start' and 'End' columns
holidays_df['Start'] = pd.to_datetime(holidays_df['Start'], format='%m/%d/%Y', errors='coerce')
holidays_df['End'] = pd.to_datetime(holidays_df['End'], format='%m/%d/%Y', errors='coerce')

# Remove rows with invalid dates
holidays_df = holidays_df[(holidays_df['Start'].notna()) & (holidays_df['End'].notna())]

# Initialize holiday ranges list
holiday_ranges = []

# Iterate over holiday dates and create date ranges
for _, row in holidays_df.iterrows():
    start = row['Start']
    end = row['End']
    holiday_ranges.extend(pd.date_range(start, end).tolist())

# Create holiday dates DataFrame
holiday_dates = pd.DataFrame({"date": holiday_ranges})

# Group by 'date' and count occurrences
holiday_counts = holiday_dates.groupby("date").size().reset_index(name="holiday_count")

# ================== Merge Data ==================
transactions["doj"] = pd.to_datetime(transactions["doj"])
db15 = transactions[transactions["dbd"] == 15]
train_merged = train.merge(db15, on=["doj", "srcid", "destid"], how="left")
test_merged = test.merge(db15, on=["doj", "srcid", "destid"], how="left")

# ================== Feature Engineering ==================
def add_features(df):
    df["doj_dayofweek"] = df["doj"].dt.dayofweek
    df["doj_month"] = df["doj"].dt.month
    df["doj_day"] = df["doj"].dt.day
    df["is_weekend"] = df["doj_dayofweek"].isin([5, 6]).astype(int)
    df["search_per_seat"] = df["cumsum_searchcount"] / (df["cumsum_seatcount"] + 1)
    df["route_id"] = df["srcid"] * 10000 + df["destid"]
    return df

train_merged = add_features(train_merged)
test_merged = add_features(test_merged)

# Add holiday features
for df in [train_merged, test_merged]:
    df["doj_date"] = pd.to_datetime(df["doj"].dt.date)

train_merged = train_merged.merge(holiday_counts, left_on="doj_date", right_on="date", how="left")
test_merged = test_merged.merge(holiday_counts, left_on="doj_date", right_on="date", how="left")

for df in [train_merged, test_merged]:
    df["is_holiday"] = df["holiday_count"].notna().astype(int)
    df["holiday_count"] = df["holiday_count"].fillna(0)
    df.drop(columns=["doj_date", "date"], inplace=True)

route_stats = train_merged.groupby("route_id")["final_seatcount"].agg(["mean", "count", "median"]).reset_index()
route_stats.columns = ["route_id", "route_avg_seatcount", "route_freq", "route_median"]
train_merged = train_merged.merge(route_stats, on="route_id", how="left")
test_merged = test_merged.merge(route_stats, on="route_id", how="left")

train_merged["search_x_weekend"] = train_merged["search_per_seat"] * train_merged["is_weekend"]
test_merged["search_x_weekend"] = test_merged["search_per_seat"] * test_merged["is_weekend"]

train_merged["is_month_end"] = train_merged["doj"].dt.is_month_end.astype(int)
test_merged["is_month_end"] = test_merged["doj"].dt.is_month_end.astype(int)
train_merged["days_to_weekend"] = 6 - train_merged["doj_dayofweek"]
test_merged["days_to_weekend"] = 6 - test_merged["doj_dayofweek"]

train_merged["search_to_seat_ratio"] = train_merged["cumsum_searchcount"] / (train_merged["cumsum_seatcount"] + 1)
test_merged["search_to_seat_ratio"] = test_merged["cumsum_searchcount"] / (test_merged["cumsum_seatcount"] + 1)

train_merged["is_same_region"] = (train_merged["srcid_region"] == train_merged["destid_region"]).astype(int)
test_merged["is_same_region"] = (test_merged["srcid_region"] == test_merged["destid_region"]).astype(int)

cat_cols = ["srcid_region", "destid_region", "srcid_tier", "destid_tier"]
for col in cat_cols:
    le = LabelEncoder()
    train_merged[col] = le.fit_transform(train_merged[col].astype(str))
    test_merged[col] = le.transform(test_merged[col].astype(str))

train_merged["route_tier_combo"] = train_merged["srcid_tier"] * 10 + train_merged["destid_tier"]
test_merged["route_tier_combo"] = test_merged["srcid_tier"] * 10 + test_merged["destid_tier"]

route_freq_rank = train_merged.groupby("route_id")["final_seatcount"].count().rank(method="min", ascending=False).to_dict()
train_merged["route_freq_rank"] = train_merged["route_id"].map(route_freq_rank)
test_merged["route_freq_rank"] = test_merged["route_id"].map(route_freq_rank)

skew_feature = (
    train_merged.groupby(["route_id", "doj_dayofweek"])["final_seatcount"]
 .skew()
 .reset_index()
 .rename(columns={"final_seatcount": "route_dow_final_seatcount_skew"})
)
train_merged = train_merged.merge(skew_feature, on=["route_id", "doj_dayofweek"], how="left")
test_merged = test_merged.merge(skew_feature, on=["route_id", "doj_dayofweek"], how="left")

In [14]:
# ================== Modeling ==================
train_merged = train_merged.sort_values("doj")
split_idx = int(len(train_merged) * 0.85)

features = [
    "doj_dayofweek", "doj_month", "doj_day", "is_weekend", "cumsum_seatcount", "cumsum_searchcount",
    "search_per_seat", "srcid_region", "destid_region", "srcid_tier", "destid_tier", "route_id",
    "route_avg_seatcount", "route_freq", "search_x_weekend", "route_median", "is_month_end",
    "days_to_weekend", "search_to_seat_ratio", "route_tier_combo", "is_same_region",
    "route_freq_rank", "route_dow_final_seatcount_skew", "is_holiday", "holiday_count"
]

X_train = train_merged.iloc[:split_idx][features]
y_train = train_merged.iloc[:split_idx]["final_seatcount"]
X_val = train_merged.iloc[split_idx:][features]
y_val = train_merged.iloc[split_idx:]["final_seatcount"]
X_test = test_merged[features]

params_lgb = {
    "objective": "regression", "metric": "rmse", "verbosity": -1,
    "boosting_type": "gbdt", "learning_rate": 0.03,
    "num_leaves": 50, "feature_fraction": 0.9,
    "bagging_fraction": 0.8, "bagging_freq": 5, "seed": 42
}
params_xgb = {
    "objective": "reg:squarederror", "eval_metric": "rmse",
    "learning_rate": 0.03, "max_depth": 7,
    "subsample": 0.8, "colsample_bytree": 0.9, "seed": 42
}

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)
model_lgb = lgb.train(params_lgb, lgb_train, num_boost_round=2000, valid_sets=[lgb_val], callbacks=[lgb.early_stopping(50)])
preds_lgb_val = model_lgb.predict(X_val)
preds_lgb_test = model_lgb.predict(X_test)

model_xgb = xgb.XGBRegressor(**params_xgb, n_estimators=2000)
model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
preds_xgb_val = model_xgb.predict(X_val)
preds_xgb_test = model_xgb.predict(X_test)

stk_train = pd.DataFrame({"lgb": preds_lgb_val, "xgb": preds_xgb_val})
stk_test = pd.DataFrame({"lgb": preds_lgb_test, "xgb": preds_xgb_test})
meta = Ridge(alpha=1.0)
meta.fit(stk_train, y_val)
meta_val_preds = meta.predict(stk_train)
meta_test_preds = meta.predict(stk_test)

val_rmse = mean_squared_error(y_val, meta_val_preds, squared=False)
print("Validation RMSE (Blended):", val_rmse)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[416]	valid_0's rmse: 632.864
Validation RMSE (Blended): 558.2507153263341


In [15]:
# ================== Save Submission ==================
submission = test[["route_key"]].copy()
submission["final_seatcount"] = np.round(np.clip(meta_test_preds, 0, None)).astype(int)
submission.to_csv("submission_with_holidays.csv", index=False)
