In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


In [3]:
# Load data
X_train = pd.read_csv("train_test_split/X_train.csv")
Y_train = pd.read_csv("train_test_split/Y_train.csv")

X_test = pd.read_csv("train_test_split/X_test.csv")
Y_test = pd.read_csv("train_test_split/Y_test.csv")

# Convert Y to 1D
Y_train = Y_train.values.ravel()
Y_test = Y_test.values.ravel()

In [13]:
# Feature Engineering for date
for df in [X_train, X_test]:
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday
    df.drop(columns=['date'], inplace=True)

In [14]:
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [18]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # 1. Categorical: Apply OneHotEncoder
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        # 2. Numerical: Explicitly pass numerical columns through (or apply scaler)
        ("num", "passthrough", numeric_cols)
    ],
    remainder="drop"
)

In [23]:
# Initialize Model
gbr_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,  # Example parameter
        random_state=42
    ))
])

In [25]:
# Train Model
print("Training Gradient Boosting Regressor...")
gbr_model.fit(X_train, Y_train)

Training Gradient Boosting Regressor...


In [27]:
# Predictions
y_pred_train = gbr_model.predict(X_train)
y_pred_test = gbr_model.predict(X_test)

In [28]:
# Evaluation
rmse_train = np.sqrt(mean_squared_error(Y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(Y_test, y_pred_test))
r2_train = r2_score(Y_train, y_pred_train)
r2_test = r2_score(Y_test, y_pred_test)

print("\nTrain Performance:")
print(f"RMSE: {rmse_train:.4f}")
print(f"R² Score: {r2_train:.4f}")

print("\nTest Performance:")
print(f"RMSE: {rmse_test:.4f}")
print(f"R² Score: {r2_test:.4f}")


Train Performance:
RMSE: 7.8407
R² Score: 0.4809

Test Performance:
RMSE: 7.7169
R² Score: 0.4971


In [None]:
import joblib
joblib.dump(gbr_model, "gradient_boosting_regressor_model.pkl")
print("\nModel saved as: gradient_boosting_regressor_model.pkl")


Model saved as: gradient_boosting_regressor_model.pkl
