In [1]:
from sklearn.preprocessing import LabelEncoder
import io
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

Load Data

In [2]:
# Prompt for the training CSV through Colab's upload helper. The returned
# mapping is indexed by filename in the next cell, and its values are the raw
# file bytes (they are wrapped in io.BytesIO before parsing).
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [3]:
# Parse the uploaded training CSV from memory.
# Prefer the expected filename, but fall back to the only uploaded file when
# that key is missing (for example when the file was uploaded under a different
# name, or when Colab stored a re-upload under an auto-suffixed name such as
# "name (1).csv"). Anything more ambiguous fails with an explicit message
# instead of a bare KeyError.
train_filename = 'hotel_data_preprocessed.csv'
if train_filename in uploaded:
    train_bytes = uploaded[train_filename]
elif len(uploaded) == 1:
    train_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        f"Expected '{train_filename}' among the uploaded files, got: {list(uploaded)}"
    )
train_df = pd.read_csv(io.BytesIO(train_bytes))

In [4]:
# Prompt for the test CSV through Colab's upload helper. The result is kept in
# a separate variable from the training upload and is indexed by filename in
# the next cell.
uploadd = files.upload()

Saving test.csv to test.csv


In [5]:
# Parse the uploaded test CSV from memory.
# Prefer the expected filename, but fall back to the only uploaded file when
# that key is missing (for example when the file was uploaded under a different
# name, or when Colab stored a re-upload under an auto-suffixed name such as
# "test (1).csv"). Anything more ambiguous fails with an explicit message
# instead of a bare KeyError.
test_filename = 'test.csv'
if test_filename in uploadd:
    test_bytes = uploadd[test_filename]
elif len(uploadd) == 1:
    test_bytes = next(iter(uploadd.values()))
else:
    raise KeyError(
        f"Expected '{test_filename}' among the uploaded files, got: {list(uploadd)}"
    )
test_df = pd.read_csv(io.BytesIO(test_bytes))

Encode Data

In [6]:
# The target is the HotelValue column; every other column is a feature.
y = train_df['HotelValue']
X = train_df.drop(columns=['HotelValue'])

# Give the test frame exactly the training feature columns: a left join on the
# column axis keeps X's columns, and any column missing from test is created
# and filled with 0.
aligned_train, aligned_test = X.align(test_df, join='left', axis=1, fill_value=0)
X, test_df = aligned_train, aligned_test

In [7]:
# Stack the training features on top of the test frame so that every encoder is
# fitted on the categories present in either set.
n_train_rows = len(X)
combined = pd.concat([X, test_df], axis=0)

# Every column that is not int/float is treated as categorical.
cat_cols = combined.select_dtypes(exclude=['int', 'float']).columns

# Replace each categorical column with integer codes; values are cast to str
# first, so missing values become their own category.
for col in cat_cols:
    encoder = LabelEncoder()
    as_text = combined[col].astype(str)
    combined[col] = encoder.fit_transform(as_text)

# Split the stacked frame back by position: first the train rows, then the test rows.
X, X_test = combined.iloc[:n_train_rows, :], combined.iloc[n_train_rows:, :]

Train the Model

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [9]:
# Learn per-column medians from the training split only, then use them to fill
# missing values in the training, validation and test matrices alike.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train, X_val, X_test = (
    imputer.transform(part) for part in (X_train, X_val, X_test)
)

In [10]:
# Candidate values sampled by the randomized hyperparameter search.
param_grid = dict(
    n_estimators=[200, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.7, 0.8, 1.0],
    max_features=['sqrt', 'log2', None],
)

gbr = GradientBoostingRegressor(random_state=42)

# 30 sampled configurations, each scored with 3-fold CV on negated RMSE, using
# all available cores; the seed fixes which configurations are drawn.
search_settings = dict(
    n_iter=30,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
search_gbr = RandomizedSearchCV(gbr, param_distributions=param_grid, **search_settings)

# Run the search on the training split and keep the best estimator it found.
search_gbr.fit(X_train, y_train)
best_gbr = search_gbr.best_estimator_
print("Best parameters:", search_gbr.best_params_)

Best parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 3, 'learning_rate': 0.1}


In [11]:
# Score the tuned model on the held-out validation split.
y_val_pred = best_gbr.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)  # RMSE is in the same units as HotelValue
print("Validation RMSE:", rmse)

Validation RMSE: 20716.458595750348


In [12]:
# Re-run the hyperparameter search on all labelled rows (training + validation).
# X is first passed through the already-fitted median imputer so the model is
# trained on the same kind of input (an imputed ndarray) as X_train, X_val and
# X_test. Fitting on the raw DataFrame would raise on any missing value and
# would leave the model expecting labelled columns that X_test does not carry.
X_full = imputer.transform(X)
search_gbr.fit(X_full, y)

# NOTE: despite the "rf" in its name, this is the best GradientBoostingRegressor
# from the search; the variable name is kept for backward compatibility.
new_best_rf = search_gbr.best_estimator_
print("Best parameters for full data:", search_gbr.best_params_)

Best parameters for full data: {'subsample': 0.7, 'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5, 'learning_rate': 0.05}


Predict on test.csv

In [13]:
# Predict with the model that was refit on the full labelled data (new_best_rf).
# best_gbr only saw the 80% training split, so using it here would discard the
# full-data refit entirely.
final_model = new_best_rf

# X_test is a plain ndarray after imputation. If the final model was fitted on a
# labelled DataFrame, restore the column labels so scikit-learn does not warn
# about mismatched feature names; otherwise pass the array through unchanged.
if hasattr(final_model, "feature_names_in_"):
    X_test_final = pd.DataFrame(X_test, columns=final_model.feature_names_in_)
else:
    X_test_final = X_test

test_preds_gbr = final_model.predict(X_test_final)

# Pair each test Id with its predicted HotelValue and write the submission file.
submission_GBR = pd.DataFrame({
    "Id": test_df["Id"],
    "HotelValue": test_preds_gbr
})

submission_GBR.to_csv("submission_GBR.csv", index=False)
print("submission_GBR.csv created successfully!")

submission_GBR.csv created successfully!
