### Example:
- Which models use for training
- Comparison of selected models
- How is model performance

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Load the merged dataset
processed_df = pd.read_csv('merged_tourism_data.csv')

# Log transform 'price' to reduce skewness
processed_df['log_price'] = np.log1p(processed_df['price'])

# Drop original price column and keep log_price as target
processed_df.drop(columns=['price'], inplace=True)

# One-hot encode categorical variables
processed_df_encoded = pd.get_dummies(processed_df, drop_first=True)

# Feature and target variables
X = processed_df_encoded.drop('log_price', axis=1)  # Features
y = processed_df_encoded['log_price']  # Target (log-transformed price)

# Apply feature scaling to numerical features (e.g., 'Tourists')
numerical_features = ['Tourists']  # You can add more features if necessary
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid for tuning
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 1.0]
}

# Initialize the XGBoost model
xgboost_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Perform RandomizedSearchCV to find best parameters
random_search = RandomizedSearchCV(
    estimator=xgboost_model, param_distributions=param_dist,
    n_iter=100,  # Testing 100 random combinations
    cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, random_state=42
)

# Fit the randomized search to a sample of the dataset to speed up tuning
sample_df = processed_df_encoded.sample(n=20000, random_state=42)
X_sample = sample_df.drop('log_price', axis=1)
y_sample = sample_df['log_price']
random_search.fit(X_sample, y_sample)

# Print best parameters
print("Best Parameters:", random_search.best_params_)

# Reinitialize the XGBoost model with the best found parameters
best_xgboost = XGBRegressor(**random_search.best_params_, objective='reg:squarederror', random_state=42)

# Use cross-validation to tune the model (this replaces RandomizedSearchCV)
cv_scores = cross_val_score(best_xgboost, X_train, y_train, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Print the average cross-validation score
print(f"Cross-Validation MSE: {-cv_scores.mean()}")

# Train the best model without early stopping (for now)
best_xgboost.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_xgboost.predict(X_test)

# Calculate RMSE and R² for the best model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Optimized XGBoost RMSE: {rmse}")
print(f"Optimized XGBoost R²: {r2}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.5, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}
Cross-Validation MSE: 0.5879619095465315
Optimized XGBoost RMSE: 0.762685486973499
Optimized XGBoost R²: 0.09268630077570139


In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the dataset
df = pd.read_csv('merged_tourism_data.csv')

# Apply one-hot encoding to categorical columns (if any)
df_encoded = pd.get_dummies(df, drop_first=True)

# Define features (X) and target (y)
X = df_encoded.drop(columns=['price'])  # Replace 'price' with your target column name
y = df_encoded['price']  # Replace 'price' with your target column name

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sample data (ensure you have this step in your actual code if not already)
# For demonstration purposes, let's sample 10% of the training data
X_sample, _, y_sample, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# Expand the hyperparameter grid for tuning
rf_param_dist = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 5, 7, 10, 12, 15],  # Test deeper trees
    'max_features': ['sqrt', 'log2', None, 0.7, 0.8],  # Test more max_features options
    'min_samples_split': [2, 5, 10, 15],  # Increase min_samples_split for more robustness
    'min_samples_leaf': [1, 2, 4, 6],  # Increase min_samples_leaf for more generalization
    'bootstrap': [True, False]  # Continue testing with both bootstrapped and non-bootstrapped
}

# Perform RandomizedSearchCV for Random Forest tuning with a larger hyperparameter space
rf_random_search = RandomizedSearchCV(
    estimator=rf_model, param_distributions=rf_param_dist,
    n_iter=100, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, random_state=42
)

# Fit the randomized search to a sample of the dataset to speed up tuning
rf_random_search.fit(X_sample, y_sample)

# Best parameters found for Random Forest
print("Random Forest Best Parameters:", rf_random_search.best_params_)

# Reinitialize the Random Forest model with the best found parameters
best_rf_model = RandomForestRegressor(**rf_random_search.best_params_, random_state=42)

# Cross-validation for Random Forest with the best parameters
rf_cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Print the average cross-validation score
print(f"Random Forest Cross-Validation MSE: {-rf_cv_scores.mean()}")

# Train the best Random Forest model
best_rf_model.fit(X_train, y_train)

# Make predictions on the test set
rf_y_pred = best_rf_model.predict(X_test)

# Calculate RMSE and R² for Random Forest
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))
rf_r2 = r2_score(y_test, rf_y_pred)

# Print results for Random Forest
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R²: {rf_r2}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Random Forest Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 3, 'bootstrap': False}
Random Forest Cross-Validation MSE: 142939366.91477525
Random Forest RMSE: 11970.009402249134
Random Forest R²: 0.00045257794647701743


In [11]:
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the dataset
df = pd.read_csv('merged_tourism_data.csv')

# Apply one-hot encoding to categorical columns (if any)
df_encoded = pd.get_dummies(df, drop_first=True)

# Define features (X) and target (y)
X = df_encoded.drop(columns=['price'])  # Replace 'price' with your target column name
y = df_encoded['price']  # Replace 'price' with your target column name

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Try Ridge, Lasso, and ElasticNet regression
ridge_model = Ridge(alpha=5.0)  # Try increasing alpha for stronger regularization
lasso_model = Lasso(alpha=0.1, max_iter=10000)  # Increase max_iter for convergence
elasticnet_model = ElasticNet(alpha=0.1, l1_ratio=0.7, max_iter=10000)  # ElasticNet combines Lasso and Ridge

# Cross-validation for Ridge
ridge_cv_scores = cross_val_score(ridge_model, X_train_scaled, y_train, cv=3, scoring='neg_mean_squared_error')
# Cross-validation for Lasso
lasso_cv_scores = cross_val_score(lasso_model, X_train_scaled, y_train, cv=3, scoring='neg_mean_squared_error')
# Cross-validation for ElasticNet
elasticnet_cv_scores = cross_val_score(elasticnet_model, X_train_scaled, y_train, cv=3, scoring='neg_mean_squared_error')

# Print the average cross-validation scores
print(f"Ridge Cross-Validation MSE: {-ridge_cv_scores.mean()}")
print(f"Lasso Cross-Validation MSE: {-lasso_cv_scores.mean()}")
print(f"ElasticNet Cross-Validation MSE: {-elasticnet_cv_scores.mean()}")

# Train Ridge, Lasso, and ElasticNet models
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)
elasticnet_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
ridge_y_pred = ridge_model.predict(X_test_scaled)
lasso_y_pred = lasso_model.predict(X_test_scaled)
elasticnet_y_pred = elasticnet_model.predict(X_test_scaled)

# Calculate RMSE and R² for Ridge, Lasso, and ElasticNet
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
ridge_r2 = r2_score(y_test, ridge_y_pred)

lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_y_pred))
lasso_r2 = r2_score(y_test, lasso_y_pred)

elasticnet_rmse = np.sqrt(mean_squared_error(y_test, elasticnet_y_pred))
elasticnet_r2 = r2_score(y_test, elasticnet_y_pred)

# Print results for Ridge, Lasso, and ElasticNet
print(f"Ridge RMSE: {ridge_rmse}")
print(f"Ridge R²: {ridge_r2}")

print(f"Lasso RMSE: {lasso_rmse}")
print(f"Lasso R²: {lasso_r2}")

print(f"ElasticNet RMSE: {elasticnet_rmse}")
print(f"ElasticNet R²: {elasticnet_r2}")


Ridge Cross-Validation MSE: 143134077.13793644
Lasso Cross-Validation MSE: 143133853.81420398
ElasticNet Cross-Validation MSE: 143112327.52779612
Ridge RMSE: 11982.387145935858
Ridge R²: -0.0016156808432832293
Lasso RMSE: 11982.365070835733
Lasso R²: -0.0016119903021907245
ElasticNet RMSE: 11981.544480604554
ElasticNet R²: -0.001474807890168739
