In [2]:
# Import Libraries

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import time
import joblib

In [4]:
# Load train/test sets

X_train = pd.read_csv("train_test_split/X_train.csv")
Y_train = pd.read_csv("train_test_split/Y_train.csv")

X_test = pd.read_csv("train_test_split/X_test.csv")
Y_test = pd.read_csv("train_test_split/Y_test.csv")

print("Data loaded successfully!")

Data loaded successfully!


In [9]:
# Drop original datetime column
X_train = X_train.drop(columns=["date"])
X_test = X_test.drop(columns=["date"])


In [10]:
# Convert Y to 1D array
Y_train_array = Y_train.values.ravel()
Y_test_array = Y_test.values.ravel()


In [11]:
categorical_features = ['city_name']  # list of categorical columns
cat_feature_indices = [X_train.columns.get_loc(col) for col in categorical_features]

In [13]:
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=7,
    loss_function='RMSE',
    verbose=100,
    random_state=42
)

model.fit(X_train, Y_train_array, cat_features=cat_feature_indices)


0:	learn: 10.5674086	total: 266ms	remaining: 2m 12s
100:	learn: 5.0654100	total: 8.57s	remaining: 33.8s
200:	learn: 4.7658939	total: 17.2s	remaining: 25.6s
300:	learn: 4.5690631	total: 25.9s	remaining: 17.1s
400:	learn: 4.4282321	total: 34.6s	remaining: 8.53s
499:	learn: 4.3120501	total: 43.2s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x24ef7c43a50>

In [14]:
# Train the Model

print("\nTraining CatBoostRegressor model...")
start_time = time.time()

model.fit(X_train, Y_train_array, cat_features=cat_feature_indices)

end_time = time.time()
print("\nModel trained successfully!")
print(f"Training completed in {end_time - start_time:.2f} seconds.")


Training CatBoostRegressor model...
0:	learn: 10.5674086	total: 93.1ms	remaining: 46.5s
100:	learn: 5.0654100	total: 8.32s	remaining: 32.9s
200:	learn: 4.7658939	total: 16.6s	remaining: 24.6s
300:	learn: 4.5690631	total: 25.3s	remaining: 16.7s
400:	learn: 4.4282321	total: 35.1s	remaining: 8.67s
499:	learn: 4.3120501	total: 44s	remaining: 0us

Model trained successfully!
Training completed in 44.11 seconds.


In [20]:

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Compute Metrics
def print_metrics(y_true, y_pred, dataset_name="Dataset"):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_true, y_pred)
    print(f"\nPerformance on {dataset_name}:")
    print(f"RMSE : {rmse:.4f}")
    print(f"R² Score : {r2:.4f}")

# Train performance
print_metrics(Y_train_array, y_train_pred, "Train Set")

# Test performance
print_metrics(Y_test_array, y_test_pred, "Test Set")




Performance on Train Set:
RMSE : 4.3122
R² Score : 0.8441

Performance on Test Set:
RMSE : 4.5892
R² Score : 0.7679


In [23]:
model_path = "catboost_regressor_model.pkl"

# Save model to pickle file
joblib.dump(model, model_path)

print(f"\nModel saved successfully at: {model_path}")



Model saved successfully at: catboost_regressor_model.pkl
