In [None]:
from sklearn.preprocessing import LabelEncoder
import io
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

Load Data

In [None]:
# Colab file picker: select hotel_data_preprocessed.csv here; the next cell
# looks the upload up under exactly that file name.
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [None]:
# Parse the uploaded training file straight from its in-memory bytes.
train_df = pd.read_csv(
    io.BytesIO(uploaded['hotel_data_preprocessed.csv'])
)

In [None]:
# Second Colab upload: select test.csv. This rebinds `uploaded`, so the
# training file's bytes are no longer reachable through this variable.
uploaded = files.upload()

Saving test.csv to test.csv


In [None]:
# Parse the uploaded test file straight from its in-memory bytes.
test_df = pd.read_csv(
    io.BytesIO(uploaded['test.csv'])
)

Align and Encode Data

In [None]:
# Report (rows, columns) of both raw frames before any alignment.
for _label, _frame in (("Train shape:", train_df), ("Test shape: ", test_df)):
    print(_label, _frame.shape)

Train shape: (1104, 75)
Test shape:  (260, 80)


In [None]:
# Target vector: the value being predicted.
y = train_df['HotelValue']

# Feature matrix: every training column except the target.
X = train_df.drop('HotelValue', axis=1)

# Left-align on columns: test_df is reshaped to X's columns, columns that
# exist only in test_df are dropped, and columns missing from it are filled with 0.
X, test_df = X.align(test_df, axis=1, join='left', fill_value=0)

In [None]:
# Report the aligned feature matrix X rather than train_df: align() only
# rebinds X and test_df, so train_df.shape would still show the pre-alignment
# column count (including the target) and says nothing about the alignment.
print("X shape:    ", X.shape)
print("Test shape: ", test_df.shape)

# After a left join on columns both frames must expose the same columns in
# the same order; fail loudly here instead of inside the model later.
assert list(X.columns) == list(test_df.columns), "train/test feature columns differ after align"

Train shape: (1104, 75)
Test shape:  (260, 74)


In [None]:
# Encode train and test together so that every category value receives the
# same integer code in both frames.
# NOTE: this cell overwrites X. Re-running it without first re-running the
# alignment cell would feed already-encoded training columns back in.
n_train = len(X)  # remember the split point before X is reassigned
combined = pd.concat([X, test_df], axis=0)

# Treat every non-numeric column as categorical. 'number' matches all numeric
# dtypes, whereas ['int', 'float'] only matches the default integer/float
# widths and would let other numeric dtypes be string-cast and label-encoded.
cat_cols = combined.select_dtypes(exclude=['number']).columns

# Label encoding categorical columns (values are compared as strings)
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

# separating back into train and test; .copy() yields independent frames
# instead of slices that still reference `combined`
X = combined.iloc[:n_train, :].copy()
X_test = combined.iloc[n_train:, :].copy()

Train the Model

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate values the randomized search samples from.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Base estimator: seeded random forest using all available cores.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Randomized hyperparameter search: 30 sampled configurations, 3-fold
# cross-validation, ranked by negated RMSE (higher is better).
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    scoring='neg_root_mean_squared_error',
    n_iter=30,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

# Tune on the training split only, so the validation split stays unseen.
# fit() returns the search object itself, so the winner can be read directly.
best_rf = search.fit(X_train, y_train).best_estimator_

print("Best parameters:", search.best_params_)

Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}


In [None]:
# Score the tuned forest on the held-out validation split.
y_val_pred = best_rf.predict(X_val)

# RMSE = square root of the mean squared error between truth and prediction.
rmse_val = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_val_pred)
)
print("Validation RMSE:", rmse_val)

Validation RMSE: 24787.877527621004


In [None]:
# Re-run the randomized search on all labelled rows (training + validation).
# fit() returns the search object, whose best_estimator_ is the winning
# configuration after this new fit; the hyperparameters it picks may differ
# from the ones chosen on the training split alone.
new_best_rf = search.fit(X, y).best_estimator_
print("Best parameters for full data:", search.best_params_)

Best parameters for full data: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}


Predict on test.csv

In [None]:
# Predict with the forest that was refit on the full labelled data
# (new_best_rf). best_rf is the estimator from the earlier search and was
# fitted on the 80% training split only, so using it here would discard the
# full-data refit done in the previous cell.
test_preds_RF = new_best_rf.predict(X_test)

# Submission file: one row per test Id with its predicted HotelValue.
submission_RF = pd.DataFrame({
    "Id": test_df["Id"],
    "HotelValue": test_preds_RF
})

submission_RF.to_csv("submission_RF.csv", index=False)
print("submission_RF.csv created successfully!")

submission_RF.csv created successfully!
