In [43]:
# import libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [44]:
# Load the cleaned tourism data from its Excel workbook into a DataFrame.
cleaned_data_path = '../data/cleaned_data/cleaned_tourism_data.xlsx'
df = pd.read_excel(cleaned_data_path)

In [45]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent,AttractionTypeId,Attraction,AttractionAddress,AttractionType,VisitMode
0,3,70456,2022,10,2,640,5,5,21,163,4341,Guildford,United Kingdom,Western Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Couples
1,8,7567,2022,10,4,640,5,2,8,48,464,Ontario,Canada,Northern America,America,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Friends
2,9,79069,2022,10,3,640,5,2,9,54,774,Brazil,Brazil,South America,America,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Family
3,10,31019,2022,10,3,640,3,5,17,135,583,Zurich,Switzerland,Central Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Family
4,15,43611,2022,10,2,640,3,5,21,163,1396,Manchester,United Kingdom,Western Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Couples


In [46]:
# Quick sanity check: per-column missing-value counts, then the number of
# fully duplicated rows.
null_counts = df.isna().sum()
print(null_counts)
print(f"\n{df.duplicated().sum()} duplicate rows found.")

TransactionId        0
UserId               0
VisitYear            0
VisitMonth           0
VisitModeId          0
AttractionId         0
Rating               0
ContinentId          0
RegionId             0
CountryId            0
CityId               0
CityName             0
Country              0
Region               0
Continent            0
AttractionTypeId     0
Attraction           0
AttractionAddress    0
AttractionType       0
VisitMode            0
dtype: int64

0 duplicate rows found.


In [47]:
# Column the models are trained to predict.
target = 'Rating'

# Identifier columns removed from the feature matrix.
# NOTE(review): VisitModeId and AttractionTypeId are not in this list, so they
# stay in X alongside the VisitMode / AttractionType text columns — confirm
# that is intended.
id_columns = [
    "TransactionId",
    "UserId",
    "CityId",
    "CountryId",
    "RegionId",
    "ContinentId",
    "AttractionId",
]

# Features are everything except the target and the identifiers above.
X = df.drop(columns=[target, *id_columns])
y = df[target]


In [48]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Split the feature columns into two groups that are preprocessed differently.
# Numeric columns are selected by dtype; every remaining column is treated as
# categorical. Selecting categoricals by exclusion (instead of
# include="object") keeps text columns in the model even when pandas stores
# them with a dedicated string dtype rather than object — otherwise they would
# land in neither list and be discarded by ColumnTransformer's default
# remainder="drop".
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categoricals; categories not seen during fit are
        # encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Numeric columns are forwarded unchanged.
        ("num", "passthrough", num_cols),
    ]
)

In [50]:
# Baseline model: shared preprocessing followed by ordinary least squares.
lr_steps = [
    ("prep", preprocessor),
    ("model", LinearRegression()),
]
lr_pipeline = Pipeline(lr_steps)
lr_pipeline.fit(X_train, y_train)

# Predict on the held-out rows and compute the error summary.
y_pred_lr = lr_pipeline.predict(X_test)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression Results")
print("MAE:", lr_mae)
print("RMSE:", lr_rmse)
print("R²:", lr_r2)

Linear Regression Results
MAE: 0.7407880437756948
RMSE: 0.958919222964803
R²: 0.03220520118351111


In [51]:
# Gradient-boosted model on the same preprocessing recipe.
# The preprocessor is cloned so this pipeline owns an independent transformer:
# Pipeline fits the step objects it is given in place, so reusing the very
# same `preprocessor` instance that lr_pipeline holds would make the two
# pipelines share fitted encoder state (refitting one would silently change
# the other). clone() copies the configuration only, so the encoder is fit
# afresh on X_train here.
xgb_pipeline = Pipeline([
    ("prep", clone(preprocessor)),
    ("model", XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    ))
])

xgb_pipeline.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_xgb = xgb_pipeline.predict(X_test)

print("\nXGBoost Results")
print("MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_xgb)))
print("R²:", r2_score(y_test, y_pred_xgb))


XGBoost Results
MAE: 0.7166757583618164
RMSE: 0.9125107856008556
R²: 0.12361443042755127


In [52]:
# Persist both fitted pipelines (preprocessing + model) to disk.
# The output directory is created first because joblib.dump does not create
# missing parent directories; without this the cell fails on a checkout where
# ../models does not exist yet.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(lr_pipeline, '../models/linear_regression_model.joblib')
joblib.dump(xgb_pipeline, '../models/xgboost_model.joblib')

['../models/xgboost_model.joblib']