In [9]:
from sklearn.preprocessing import LabelEncoder
import io
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

Load Data

In [2]:
# Open Colab's interactive upload dialog for the training CSV. The returned
# mapping is indexed by filename in the next cell to get the file's raw bytes.
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [3]:
# Wrap the uploaded training file's raw bytes in an in-memory buffer, then
# parse that buffer as CSV.
train_csv_buffer = io.BytesIO(uploaded['hotel_data_preprocessed.csv'])
train_df = pd.read_csv(train_csv_buffer)

In [4]:
# Second interactive upload, this time for the test CSV. This rebinds
# `uploaded`, so the mapping from the earlier training upload is no longer
# reachable through this name once this cell has run.
uploaded = files.upload()

Saving test.csv to test.csv


In [5]:
# Wrap the uploaded test file's raw bytes in an in-memory buffer, then parse
# that buffer as CSV.
test_csv_buffer = io.BytesIO(uploaded['test.csv'])
test_df = pd.read_csv(test_csv_buffer)

Encode Data

In [6]:
# Separate the regression target from the predictor columns.
target_col = 'HotelValue'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

# Put the test frame on exactly X's columns: join='left' keeps X's column set,
# and any column the test file lacks is created and filled with 0.
X, test_df = X.align(
    test_df,
    join='left',
    axis=1,
    fill_value=0,
)

In [7]:
# Encode train and test together so every category receives the same integer
# code in both frames. Record the train row count first, because X is rebound
# at the end of this cell.
n_train = len(X)
combined = pd.concat([X, test_df], axis=0)

# Every non-numeric column is treated as categorical. 'number' matches all
# numeric dtypes (int32, float32, unsigned, ...); excluding only 'int' and
# 'float' would let narrower numeric columns fall through, be cast to str and
# label-encoded, which destroys their numeric ordering.
cat_cols = combined.select_dtypes(exclude=['number']).columns

# Label-encode each categorical column. The astype(str) cast turns missing
# values into ordinary strings, so they receive their own code.
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

# Split back by position: the first n_train rows are the training rows, the
# remainder are the test rows. Copies keep the two frames independent of
# `combined`.
X = combined.iloc[:n_train, :].copy()
X_test = combined.iloc[n_train:, :].copy()

Train Model

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Learn per-column medians from the training split only, then apply that same
# fitted imputer to the training, validation and test features.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)
X_train, X_val, X_test = (
    imputer.transform(features) for features in (X_train, X_val, X_test)
)

In [11]:
# Candidate hyperparameter values sampled by the randomized search.
param_grid = dict(
    max_depth=[5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 0.7],
)

# Base estimator; the seed fixes the tree's internal randomness.
tree = DecisionTreeRegressor(random_state=42)

# Randomized hyperparameter search: 25 sampled configurations, each scored by
# 5-fold cross-validated (negated) RMSE, using all available cores.
search = RandomizedSearchCV(
    estimator=tree,
    param_distributions=param_grid,
    n_iter=25,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split and keep the refitted best estimator.
search.fit(X_train, y_train)
best_tree = search.best_estimator_
print("Best params:", search.best_params_)

Best params: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 15}


In [12]:
# Score the tuned tree on the held-out validation split.
y_pred = best_tree.predict(X_val)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Validation RMSE:", rmse)

Validation RMSE: 30404.06985307349


In [13]:
# Re-run the search on every labelled row to obtain the final model.
# X is still the raw label-encoded frame, whereas X_train / X_val / X_test were
# passed through the median imputer, so apply that same fitted imputer here.
# This keeps the final model's training features preprocessed the same way as
# the test features it will score.
X_full = imputer.transform(X)
search.fit(X_full, y)
new_best_tree = search.best_estimator_
print("Best parameters for full data:", search.best_params_)

Best parameters for full data: {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 0.7, 'max_depth': 20}


Predict on test.csv

In [14]:
# Predict with the model refit on the full training set (new_best_tree).
# best_tree only saw the 80% training split used for validation, so using it
# here would discard the full-data refit.
test_preds = new_best_tree.predict(X_test)

# Pair each test row's Id with its predicted HotelValue.
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "HotelValue": test_preds
})

# Write the submission file without the DataFrame index column.
submission.to_csv("submission_DT.csv", index=False)
print("submission_DT.csv created successfully!")

submission_DT.csv created successfully!
