In [None]:
!pip install xgboost
!pip install -U scikit-learn

In [None]:
#import all dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV




#from sklearn.metrics import mean_squared_error

# **Data Preprocessing** #


In [None]:
# Load the bookings training data from a path relative to the notebook.
df = pd.read_csv(filepath_or_buffer='Data/bookings_train.csv')

In [None]:
# Preview the first ten rows of the raw frame.
df.head(10)

Unnamed: 0,booking_id,citizen_id,booking_date,appointment_date,appointment_time,check_in_time,check_out_time,task_id,num_documents,queue_number,satisfaction_rating
0,f7b59ba3793fea61282cb02be2247cebd0a75306,4229044420,2021-01-01,2021-01-01,09:03,2021-01-01 09:11:00,2021-01-01 09:48:15.166353269,TASK-002,0,2,4
1,cfe30d53b4b645e4cb6b114f6c9d74a0f25e6a80,5389442635,2021-01-01,2021-01-01,09:12,2021-01-01 09:24:00,2021-01-01 10:24:12.189261137,TASK-001,1,1,4
2,0fb0a5d9b3944dd743a9e7828dc03fa74aa6ebf8,2123783919,2021-01-01,2021-01-01,09:36,2021-01-01 09:29:00,2021-01-01 10:26:48.802260864,TASK-002,1,6,5
3,d4bf826153212fa325ad98b215668ab942365607,6359918966,2021-01-01,2021-01-01,09:45,2021-01-01 10:07:00,2021-01-01 11:00:13.485642822,TASK-001,1,4,4
4,ebb4c9687acbaf65376d8e379bd1341d1c15127e,178732202,2021-01-01,2021-01-01,10:12,2021-01-01 10:26:00,2021-01-01 11:54:53.260180213,TASK-002,1,8,3


In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203693 entries, 0 to 203692
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   booking_id           203693 non-null  object
 1   citizen_id           203693 non-null  int64 
 2   booking_date         203693 non-null  object
 3   appointment_date     203693 non-null  object
 4   appointment_time     203693 non-null  object
 5   check_in_time        197601 non-null  object
 6   check_out_time       197601 non-null  object
 7   task_id              203693 non-null  object
 8   num_documents        203693 non-null  int64 
 9   queue_number         203693 non-null  int64 
 10  satisfaction_rating  203693 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 17.1+ MB


In [None]:
# Parse the raw string columns into date/time types.

# Calendar dates: parsed with pandas' default format handling.
for date_col in ('booking_date', 'appointment_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# The appointment slot is an "HH:MM" string; keep only its time-of-day part.
appointment_clock = pd.to_datetime(df['appointment_time'], format='%H:%M')
df['appointment_time'] = appointment_clock.dt.time

# Check-in / check-out timestamps: values that cannot be parsed become NaT.
for stamp_col in ('check_in_time', 'check_out_time'):
    df[stamp_col] = pd.to_datetime(df[stamp_col], errors='coerce')

In [None]:
# Calendar features taken from the appointment date
# (these columns are listed in feature_cols when the model inputs are built).
appointment_dt = df['appointment_date'].dt
df['day_of_week'] = appointment_dt.dayofweek   # Monday=0 ... Sunday=6
df['month'] = appointment_dt.month
df['year'] = appointment_dt.year
# Saturday (5) and Sunday (6) are flagged 1, every other day 0.
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# **Feature Engineering** #

In [None]:
# Minutes elapsed between check-in and check-out.
visit_length = df['check_out_time'] - df['check_in_time']
df['service_duration'] = visit_length.dt.total_seconds() / 60.0

In [None]:
# Hour of the appointment slot as a number.
# appointment_time holds time objects, so they are rendered to text and
# re-parsed to get access to the .dt accessor.
appointment_time_text = df['appointment_time'].astype(str)
parsed_slot = pd.to_datetime(appointment_time_text, format='%H:%M:%S', errors='coerce')
df['appointment_hour'] = parsed_slot.dt.hour

In [None]:
# Minutes between the scheduled appointment and the actual check-in
# (positive when the check-in happened after the slot). Kept for future enhancement.
scheduled_text = df['appointment_date'].astype(str) + ' ' + df['appointment_time'].astype(str)
scheduled_at = pd.to_datetime(scheduled_text, errors='coerce')
df['waiting_time'] = (df['check_in_time'] - scheduled_at).dt.total_seconds() / 60.0

In [None]:
# Integer code for each distinct task_id, via the pandas categorical dtype.
task_categories = df['task_id'].astype('category')
df['task_id_encoded'] = task_categories.cat.codes

In [None]:
# Session of the day for an appointment hour
def get_session(hour):
    """Map an hour of the day to a session label.

    Returns "morning" for 6 <= hour < 12, "afternoon" for 12 <= hour < 17,
    "evening" for 17 <= hour < 21, and "off_hours" for any other value.
    """
    session_windows = (
        (6, 12, "morning"),
        (12, 17, "afternoon"),
        (17, 21, "evening"),
    )
    for start, stop, label in session_windows:
        if start <= hour < stop:
            return label
    return "off_hours"

# Label every booking with the session its appointment hour falls into.
df['session'] = df['appointment_hour'].map(get_session)

In [None]:
# Peak hour flag: 1 when the appointment hour is in 9–12 or 14–16, otherwise 0.
# Series.between includes both endpoints by default.
in_morning_peak = df['appointment_hour'].between(9, 12)
in_afternoon_peak = df['appointment_hour'].between(14, 16)
df['is_peak_hour'] = (in_morning_peak | in_afternoon_peak).astype(int)

# **Exploratory Data Analysis / Data Cleaning** #

In [None]:
# Number of missing values in each column.
df.isna().sum()

booking_id                0
citizen_id                0
booking_date              0
appointment_date          0
appointment_time          0
check_in_time          6092
check_out_time         6092
task_id                   0
num_documents             0
queue_number              0
satisfaction_rating       0
day_of_week               0
month                     0
year                      0
is_weekend                0
service_duration       6092
task_id_encoded           0
dtype: int64

In [None]:
# Remove bookings that lack a check-in, a check-out, or the duration derived
# from them, then renumber the rows from 0.
required_cols = ['check_in_time', 'check_out_time', 'service_duration']
df = df.dropna(subset=required_cols)
df = df.reset_index(drop=True)

# **Model Training** #

In [None]:
# Column names, dtypes and non-null counts of the frame as it stands before modelling.
df.info()

In [None]:
# Model inputs (X) and regression target (y)

feature_cols = [
    'task_id_encoded',
    'day_of_week',
    'month',
    'year',
    'is_weekend',
    'appointment_hour',
    'is_peak_hour',
    'num_documents',
    'queue_number',
]

X = df.loc[:, feature_cols]
y = df.loc[:, 'service_duration']   # target: check-in to check-out, in minutes

In [None]:
# Show which columns feed the model (heading and list on separate lines).
print("\nColumns used for features:", feature_cols, sep="\n")

In [None]:
# Summary statistics of the target (heading and table on separate lines).
print("\nTarget column stats:", y.describe(), sep="\n")

In [None]:
# Train/test split: hold out 20% of the rows, with a fixed seed for repeatability.

split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Report the shapes of the feature matrix and target vector for each split.
for split_label, features_part, target_part in (
    ("Training set shape:", X_train, y_train),
    ("Test set shape:", X_test, y_test),
):
    print(split_label, features_part.shape, target_part.shape)

In [None]:
# Baseline XGBoost regressor with hand-picked hyperparameters

baseline_params = {
    'n_estimators': 500,       # number of boosting rounds (trees)
    'learning_rate': 0.05,     # step size shrinkage
    'max_depth': 6,            # maximum tree depth
    'subsample': 0.8,          # row sampling
    'colsample_bytree': 0.8,   # feature sampling
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = XGBRegressor(**baseline_params)

In [None]:
# Hyperparameter search for the XGBoost regressor.
#
# NOTE(review): the cells after this one fit and score `xgb_model`, not the
# `best_model` produced here — confirm which model is meant to be reported.

# Base model. Kept single-threaded on purpose: GridSearchCV below already runs
# the candidate fits in parallel (n_jobs=-1), and letting every individual fit
# also claim all cores makes the two levels compete for the same CPUs.
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=1
)

# Parameter grid: 3 values for each of 6 parameters = 729 candidates,
# i.e. 2187 model fits with cv=3 (plus the final refit).
param_grid = {
    'n_estimators': [300, 500, 800],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Grid Search: exhaustive search scored by R² with 3-fold cross-validation,
# parallelised across candidates/folds.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best CV R²:", grid_search.best_score_)

# Estimator refitted on the whole training split with the best parameters.
best_model = grid_search.best_estimator_

In [None]:
# Fit the baseline regressor on the training split

xgb_model.fit(X=X_train, y=y_train)

In [None]:
# Predict service duration for the held-out test rows

y_pred = xgb_model.predict(X=X_test)

In [None]:
# Side-by-side look at the first ten actual and predicted durations.
print("First 10 predictions vs actual values:")
preview_rows = slice(0, 10)
results = pd.DataFrame({
    "Actual": y_test.iloc[preview_rows].to_numpy(),
    "Predicted": y_pred[preview_rows],
})
print(results)


In [None]:
# Predicted-vs-actual scatter; the red diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.3)
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, color="red", linewidth=2)
ax.set_xlabel("Actual Service Duration (minutes)")
ax.set_ylabel("Predicted Service Duration (minutes)")
ax.set_title("Predicted vs Actual Service Duration")
plt.show()


# **Model Evaluation** #

In [None]:
# Test-set evaluation: coefficient of determination (R²) of the predictions.
# The MSE / RMSE / MAE computations are not enabled in this cell.
r2 = r2_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Report the test-set metrics. RMSE and MAE are computed here directly from the
# predictions, so this cell only needs y_test, y_pred and r2 to exist.
# Both are expressed in minutes, the unit of service_duration.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))   # Root Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)            # Mean Absolute Error

print("Model Performance on Test Data:")
print(f"RMSE: {rmse:.2f} minutes")
print(f"MAE: {mae:.2f} minutes")
print(f"R² Score: {r2:.4f}")