In [18]:
from sklearn.preprocessing import LabelEncoder
import io
from google.colab import files
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

Load Data

In [3]:
# Open the Colab upload widget for the training CSV. The result is indexed by
# filename in the next cell and wrapped in io.BytesIO, i.e. it is used as a
# mapping from uploaded filename to raw file bytes.
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [19]:
# Wrap the uploaded training bytes in an in-memory buffer so pandas can parse
# them as if they were a CSV file on disk.
train_buffer = io.BytesIO(uploaded['hotel_data_preprocessed.csv'])
train_df = pd.read_csv(train_buffer)

In [5]:
# Second upload prompt, this time for the test CSV. The result is kept under
# the separate name `uploadd` (distinct from `uploaded`) and is read back as
# `uploadd['test.csv']` when the test frame is parsed.
uploadd = files.upload()

Saving test.csv to test.csv


In [20]:
# Wrap the uploaded test bytes in an in-memory buffer and parse them as CSV.
test_buffer = io.BytesIO(uploadd['test.csv'])
test_df = pd.read_csv(test_buffer)

Encode Data

In [21]:
# Pull the regression target out of the labelled frame; everything else is a
# predictor.
y = train_df['HotelValue']
X = train_df.drop('HotelValue', axis=1)

# Reindex the test frame onto X's column set (join='left' keeps X's columns).
# A column present in X but absent from the test file is created and filled
# with 0; columns only present in the test file are dropped from test_df.
X, test_df = X.align(test_df, join='left', axis=1, fill_value=0)

In [22]:
# Stack train and test rows so each categorical column is encoded with a
# single label mapping shared by both sets.
# Remember where the training rows end before X is rebound below.
n_train = len(X)
combined = pd.concat([X, test_df], axis=0)

# Every non-numeric column is treated as categorical. The generic 'number'
# selector is used instead of the 'int'/'float' aliases so that numeric
# columns of any width (e.g. int8, uint8, float16) are never mistaken for
# categories and string-encoded in lexical order.
cat_cols = combined.select_dtypes(exclude='number').columns

# Label-encode each categorical column. Values are cast to str first so the
# encoder always receives one uniform type, even for mixed-type columns.
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

# Split back by position (the concatenated index contains duplicates, so
# label-based slicing would be ambiguous). .copy() makes X and X_test
# independent frames rather than slices tied to `combined`.
X = combined.iloc[:n_train, :].copy()
X_test = combined.iloc[n_train:, :].copy()

Train Model

In [23]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed makes
# the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Learn per-column medians from the training split only, so no statistics
# from the validation or test rows leak into the fill values.
imputer = SimpleImputer(strategy='median').fit(X_train)

# Replace missing values in all three feature sets using those medians.
X_train, X_val, X_test = (
    imputer.transform(features) for features in (X_train, X_val, X_test)
)

In [25]:
# Candidate values for each XGBoost hyper-parameter; the randomized search
# samples combinations from these lists.
param_grid = dict(
    n_estimators=[300, 500, 700],
    learning_rate=[0.03, 0.05, 0.07],
    max_depth=[4, 5, 6],
    subsample=[0.7, 0.85, 1.0],
    colsample_bytree=[0.7, 0.85, 1.0],
    min_child_weight=[1, 3, 5],
    reg_lambda=[1, 1.5],
    reg_alpha=[0, 0.1],
)

# Base regressor with a squared-error objective and a fixed seed.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized search: 30 sampled configurations, each scored by 3-fold
# cross-validated RMSE (negated, because scikit-learn maximizes the score),
# evaluated in parallel on all available cores.
search_xgb = RandomizedSearchCV(
    xgb,
    param_distributions=param_grid,
    n_iter=30,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split and keep the winning model.
search_xgb.fit(X_train, y_train)
best_xgb = search_xgb.best_estimator_
print("Best parameters:", search_xgb.best_params_)

Best parameters: {'subsample': 0.7, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 700, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.03, 'colsample_bytree': 0.85}


In [26]:
# Score the tuned model on the held-out validation rows.
y_val_pred = best_xgb.predict(X_val)

# RMSE is the square root of the mean squared error between the true and
# predicted validation targets.
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)
print("Validation RMSE:", rmse)

Validation RMSE: 21668.28047068798


In [27]:
# Re-run the hyper-parameter search on every labelled row so the final model
# is trained on all available data rather than only the 80% training split.
#
# X is still the un-imputed feature frame at this point, whereas X_test has
# already been passed through `imputer`. Apply the same fitted imputer here so
# the final model is trained on features preprocessed exactly like the rows
# it will be asked to score.
X_full = imputer.transform(X)
search_xgb.fit(X_full, y)
new_best_xgb = search_xgb.best_estimator_
print("Best parameters for full data:", search_xgb.best_params_)

Best parameters for full data: {'subsample': 0.7, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 700, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.03, 'colsample_bytree': 0.85}


Predict on test.csv

In [28]:
# Predict with the model refit on the full labelled data (`new_best_xgb`).
# `best_xgb` was fit only on the 80% training split and exists to measure
# validation RMSE; using it here would discard the held-out 20% of rows.
test_preds_xgb = new_best_xgb.predict(X_test)

# Pair each test row's Id with its predicted value in the submission layout.
submission_XGB = pd.DataFrame({
    "Id": test_df["Id"],
    "HotelValue": test_preds_xgb
})

# Write the submission without the DataFrame index column.
submission_XGB.to_csv("submission_XGB.csv", index=False)
print("submission_XGB.csv created successfully!")

submission_XGB.csv created successfully!
