In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from google.colab import files


In [2]:
# Upload Files
# files.upload() returns a dict keyed by uploaded file name. If a dialog is
# dismissed without choosing a file there is no first key to read, so stop
# right away with a clear message instead of an opaque IndexError later.
print("Please upload your train.csv file:")
uploaded_train = files.upload()
if not uploaded_train:
    raise ValueError("No training file was uploaded.")

print("Please upload your test.csv file:")
uploaded_test = files.upload()
if not uploaded_test:
    raise ValueError("No test file was uploaded.")

# Only the first file of each upload dialog is used.
train_filename = next(iter(uploaded_train))
test_filename = next(iter(uploaded_test))

Please upload your train.csv file:


Saving train_preprocessed.csv to train_preprocessed.csv
Please upload your test.csv file:


Saving test.csv to test.csv


In [3]:
# Load Data
# Parse both uploaded CSV files into DataFrames, then report their dimensions.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in (train_filename, test_filename)
)

print("\n Files uploaded successfully!")
print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)


 Files uploaded successfully!
Training data shape: (1104, 75)
Testing data shape: (260, 80)


In [4]:
# Handle ID and Target
# The loaded frames are left untouched and the feature matrices are derived
# from them, so this cell can be re-run safely. Dropping "Id" from
# train_df / test_df in place made a second run fail with a KeyError.
id_col = "Id"
target_col = "HotelValue"

# Keep the test identifiers for the submission file.
test_ids = test_df[id_col]

# Training features: everything except the identifier and the target.
X = train_df.drop(columns=[id_col, target_col])
y = train_df[target_col]

# Test features: a fresh frame without the identifier column.
# NOTE(review): the printed shapes show the test file has more columns than
# the training file; confirm the extra columns are meant to be ignored.
X_test = test_df.drop(columns=[id_col])

In [5]:
# Target transformation & outlier clipping
# Cap the target at its 1st and 99th percentiles, then model log(1 + y).
lower_cap = np.percentile(y, 1)
upper_cap = np.percentile(y, 99)

y = np.clip(y, lower_cap, upper_cap)
y_log = np.log1p(y)

In [6]:
# Identify Numeric and Categorical Columns
# Select by dtype family rather than by exact dtype: listing only
# int64/float64 would skip other numeric widths (e.g. int32, float32), and
# such columns would never reach the model.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

# The preprocessing step only receives num_cols and cat_cols, so surface any
# column that landed in neither group instead of losing it silently.
unassigned_cols = X.columns.difference(num_cols.union(cat_cols))
if len(unassigned_cols) > 0:
    print(f"Warning: columns not assigned to any pipeline: {list(unassigned_cols)}")

print(f"\nNumeric columns: {len(num_cols)}")
print(f"Categorical columns: {len(cat_cols)}")


Numeric columns: 36
Categorical columns: 37


In [7]:
# Preprocessing Pipelines
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: median imputation, pairwise interaction terms (no squares,
# no bias column), standardisation, then a Yeo-Johnson power transform.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('power', PowerTransformer(method='yeo-johnson', standardize=True)),
])

# Route each column group to its branch; the numeric block comes first.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])


In [8]:
# Split Data for Validation
# Hold out 20% of the rows (fixed seed) to check the tuned model afterwards.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Define Bayesian Ridge + Feature Selection
# A first BayesianRidge ranks the preprocessed features; only those whose
# importance reaches the median are passed on to the final regressor.
feature_selector = SelectFromModel(estimator=BayesianRidge(), threshold="median")

bayesian = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('select', feature_selector),
    ('regressor', BayesianRidge(compute_score=True, fit_intercept=True)),
])

In [10]:
# Wider hyperparameter tuning
# All four prior hyperparameters of the final regressor share the same
# candidate values, so the grid is generated from a single list.
prior_candidates = [1e-8, 1e-7, 1e-6, 1e-5]
param_grid = {
    f'regressor__{prior_name}': list(prior_candidates)
    for prior_name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
}

# Exhaustive search scored by R² with 10-fold CV, using every available core.
grid_search = GridSearchCV(
    estimator=bayesian,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [11]:
# Train Model with Grid Search
# Run the search on the training split only, then keep the best pipeline.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("\n Best Parameters Found:", grid_search.best_params_, sep="\n")

Fitting 10 folds for each of 256 candidates, totalling 2560 fits

 Best Parameters Found:
{'regressor__alpha_1': 1e-08, 'regressor__alpha_2': 1e-05, 'regressor__lambda_1': 1e-08, 'regressor__lambda_2': 1e-05}


In [12]:
# Cross-Validation on Full Data
# Shuffled 10-fold CV of the tuned pipeline over all training rows.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_model,
    X,
    y_log,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
)

mean_r2, std_r2 = cv_scores.mean(), cv_scores.std()
print(f"\n Cross-Validation Mean R²: {mean_r2:.5f} ± {std_r2:.5f}")



 Cross-Validation Mean R²: 0.87816 ± 0.04555


In [13]:
# Validation Performance
# R² is measured on the log1p scale the model was trained on; RMSE is
# reported after mapping targets and predictions back with expm1.
y_val_pred = best_model.predict(X_val)
r2 = r2_score(y_val, y_val_pred)

actual_values = np.expm1(y_val)
predicted_values = np.expm1(y_val_pred)
rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))

print("\n Validation Performance:")
print(f"R² Score: {r2:.5f}")
print(f"RMSE: {rmse:.3f}")


 Validation Performance:
R² Score: 0.85519
RMSE: 25667.677


In [14]:
# Retrain on Full Dataset
# The tuned pipeline is refit in place on every training row (log target).
# After this cell, best_model has seen the validation rows, so re-running
# the validation cell would report optimistic scores.
best_model.fit(X, y=y_log)

In [15]:
# Predict on Test Data
# Predictions come out on the log1p scale; map them back with expm1 and
# floor the result at zero.
test_preds_log = best_model.predict(X_test)
test_preds = np.clip(np.expm1(test_preds_log), 0, None)

In [16]:
# Save Final Predictions
# Build the two-column submission table, write it to disk without the index,
# and trigger the Colab browser download.
submission_path = "hotel_predictions_bayesian.csv"

rounded_preds = np.round(test_preds, 2)
output = pd.DataFrame({"Id": test_ids, "HotelValue": rounded_preds})

output.to_csv(submission_path, index=False)
print(f"\n✅ Predictions saved to '{submission_path}'")

files.download(submission_path)


✅ Predictions saved to 'hotel_predictions_bayesian.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>