In [None]:
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [4]:
# Read the pre-split feature and target tables from the train_test_split folder.
X_train, Y_train = (
    pd.read_csv("train_test_split/X_train.csv"),
    pd.read_csv("train_test_split/Y_train.csv"),
)
X_test, Y_test = (
    pd.read_csv("train_test_split/X_test.csv"),
    pd.read_csv("train_test_split/Y_test.csv"),
)

print(" Data loaded successfully!")

 Data loaded successfully!


In [21]:
# Flatten the target frames into 1-D NumPy arrays, the form handed to the
# regressor when fitting.
Y_train_array = np.ravel(Y_train.values)
Y_test_array = np.ravel(Y_test.values)

In [24]:
# Column treated as categorical; the preprocessor one-hot encodes it.
categorical_features = ['city_name']

# Every numeric column of the training frame, minus anything already declared
# categorical. Without the exclusion, a numerically coded 'city_name' would be
# picked up by both lists and end up encoded *and* passed through.
numeric_features = [
    column
    for column in X_train.select_dtypes(include=np.number).columns
    if column not in categorical_features
]

In [25]:
# Per-column preprocessing steps, as (name, transformer, columns) triples.
feature_transformers = [
    # Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
    # transform from raising on categories that were not seen during fit.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    # Numeric columns are forwarded unchanged.
    ('num', 'passthrough', numeric_features),
]

# Columns named in neither list are discarded (remainder='drop'),
# e.g. a 'date' column if the frame has one.
preprocessor = ColumnTransformer(feature_transformers, remainder='drop')

In [None]:
# Hyperparameters for the gradient-boosted regressor, gathered in one place
# so they are easy to inspect or tweak before the estimator is built.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 7,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1,
    "random_state": 42,  # fixed seed for repeatable runs
    "n_jobs": -1,
}

xgb_regressor = XGBRegressor(**xgb_params)

In [27]:
# Chain preprocessing and the regressor so that raw feature frames can be
# passed straight to fit/predict.
xgb_model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
])

In [None]:
print("\nTraining XGBRegressor model...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Fit the pipeline on the raw X_train data. The pipeline applies the
# preprocessing step before fitting the regressor.
xgb_model_pipeline.fit(X_train, Y_train_array)

end_time = time.perf_counter()
print("Model is trained Successfully! ")
print(f"Training completed in {end_time - start_time:.2f} seconds.")


Training XGBRegressor model...
Model is trained Successfully! 
Training completed in 4.40 seconds.


In [38]:
# Evaluate the fitted pipeline on the held-out test set.
print("\nModel Evaluation on Test Set")
y_pred = xgb_model_pipeline.predict(X_test)

# Score against the flattened 1-D targets (Y_test_array) rather than the
# Y_test DataFrame, so y_true has the same form as the training target and
# the predictions.
mse = mean_squared_error(Y_test_array, y_pred)
rmse = mse ** 0.5  # root of the MSE, reported in the target's units
r2 = r2_score(Y_test_array, y_pred)

print("\n Model Performance")
print(f"RMSE : {rmse:.4f}")
print(f"R² Score : {r2:.4f}")


Model Evaluation on Test Set

 Model Performance
RMSE : 7.2457
R² Score : 0.5566


In [None]:
# Persist the whole fitted pipeline (preprocessing + regressor) as a single
# artifact. joblib is imported with the other dependencies in the import cell.
# NOTE(review): joblib files are pickle-based — only load them from trusted
# sources, ideally with the same library versions used for training.
model_path = "xgb_regressor_model.pkl"
joblib.dump(xgb_model_pipeline, model_path)
print(f"\nModel saved as: {model_path}")


Model saved as: xgb_regressor_model.pkl
