In [2]:
from sklearn.preprocessing import LabelEncoder
import io
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

Load Dataset

In [3]:
# Ask the user to upload the training CSV through Colab's file-upload helper.
# The returned object is indexed by filename in the next cell to get the contents.
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [4]:
# Pull the uploaded training file out by name and parse it from an in-memory
# buffer, so nothing has to be read back from disk.
train_csv_bytes = uploaded['hotel_data_preprocessed.csv']
train_df = pd.read_csv(io.BytesIO(train_csv_bytes))

In [5]:
# Second upload prompt, this time for the test CSV. Kept in its own variable
# (distinct from `uploaded`) and indexed with 'test.csv' in the next cell.
uploadd = files.upload()

Saving test.csv to test.csv


In [6]:
# Pull the uploaded test file out by name and parse it from an in-memory buffer.
test_csv_bytes = uploadd['test.csv']
test_df = pd.read_csv(io.BytesIO(test_csv_bytes))

Encode Data

In [7]:
# Separate the regression target from the predictor columns.
target_col = 'HotelValue'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

# Reindex the test frame onto the training feature columns (left join on the
# column axis): columns the test file lacks are created and filled with 0,
# and columns that exist only in the test file are dropped.
X, test_df = X.align(test_df, join='left', axis=1, fill_value=0)

In [8]:
# Encode train and test together so that a given category receives the same
# integer code in both frames.
n_train = len(X)  # split point, captured once before X is rebound below
combined = pd.concat([X, test_df], axis=0)

# Every non-numeric column is treated as categorical. Selecting on 'number'
# covers all integer/float widths (int8, uint8, float16, ...); listing only
# 'int' and 'float' would let narrower numeric columns fall through, get
# stringified, and be encoded in lexicographic order ("10" < "2").
cat_cols = combined.select_dtypes(exclude=['number']).columns

# Label encoding categorical columns
for col in cat_cols:
    le = LabelEncoder()
    # Values are cast to str first, so missing entries are stringified too and
    # end up with a code of their own instead of raising on mixed types.
    combined[col] = le.fit_transform(combined[col].astype(str))

# Separate back into train and test by position: the first n_train rows came
# from X, the remainder from test_df.
X = combined.iloc[:n_train, :]
X_test = combined.iloc[n_train:, :]

Train Model

In [9]:
# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Learn per-column medians from the training split only, then apply those same
# medians to all three feature sets so validation/test values never influence
# the fill values.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train, X_val, X_test = [
    imputer.transform(features) for features in (X_train, X_val, X_test)
]

In [12]:
# Search space. Plain keys configure the AdaBoost ensemble itself; keys with
# the `estimator__` prefix are routed to the decision tree used as weak learner.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    estimator__max_depth=[2, 3, 4],
    estimator__min_samples_split=[2, 5, 10],
    estimator__min_samples_leaf=[1, 2, 4],
)

# Weak learner and the boosted ensemble built on top of it (both seeded).
base_estimator = DecisionTreeRegressor(random_state=42)
ada = AdaBoostRegressor(estimator=base_estimator, random_state=42)

# Randomized search: 20 sampled candidates, 3-fold CV, scored by (negated)
# RMSE, using every available core.
search_ada = RandomizedSearchCV(
    estimator=ada,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)

# Tune on the training split only; keep the winning fitted model for validation.
best_ada = search_ada.fit(X_train, y_train).best_estimator_
print("Best parameters:", search_ada.best_params_)

Best parameters: {'n_estimators': 300, 'learning_rate': 0.2, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 2, 'estimator__max_depth': 4}


In [13]:
# Score the tuned model on the held-out validation split.
y_val_pred = best_ada.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)  # report in the target's own units
print("Validation RMSE:", rmse)

Validation RMSE: 25289.239540544582


In [14]:
# Re-run the hyperparameter search on the entire training set.
# X is first passed through the imputer that was already fitted for
# X_train / X_val / X_test, so the full-data model is trained on the same
# imputed array representation that X_test has at prediction time.
X_full = imputer.transform(X)

# Note: this refits `search_ada` in place, replacing its stored results;
# `best_ada` still refers to the model obtained from the earlier fit.
search_ada.fit(X_full, y)
best_full_ada = search_ada.best_estimator_
print("Best parameters for full data:", search_ada.best_params_)

Best parameters for full data: {'n_estimators': 300, 'learning_rate': 0.2, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 1, 'estimator__max_depth': 4}


Predict on test.csv

In [15]:
# Predict with the model fitted on the entire training set (`best_full_ada`).
# `best_ada` was trained on the 80% training split only and exists to produce
# the validation score, so it should not generate the submission.
test_preds_ada = best_full_ada.predict(X_test)

# Pair each prediction with the Id of the corresponding test row.
submission_ADA = pd.DataFrame({
    "Id": test_df["Id"],
    "HotelValue": test_preds_ada
})

# Write without the index so the file contains only the Id and HotelValue columns.
submission_ADA.to_csv("submission_ADA.csv", index=False)
print("submission_ADA.csv created successfully!")

submission_ADA.csv created successfully!
