In [19]:
# import libraries
import pandas as pd
import numpy as np 

In [20]:
# Read the cleaned tourism visit records from the Excel export into a DataFrame.
CLEANED_DATA_PATH = "../data/cleaned_data/cleaned_tourism_data.xlsx"
df = pd.read_excel(CLEANED_DATA_PATH)

In [21]:
# Preview the first five rows to check the load and see which columns are available.
df.head(n=5)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent,AttractionTypeId,Attraction,AttractionAddress,AttractionType,VisitMode
0,3,70456,2022,10,2,640,5,5,21,163,4341,Guildford,United Kingdom,Western Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Couples
1,8,7567,2022,10,4,640,5,2,8,48,464,Ontario,Canada,Northern America,America,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Friends
2,9,79069,2022,10,3,640,5,2,9,54,774,Brazil,Brazil,South America,America,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Family
3,10,31019,2022,10,3,640,3,5,17,135,583,Zurich,Switzerland,Central Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Family
4,15,43611,2022,10,2,640,3,5,21,163,1396,Manchester,United Kingdom,Western Europe,Europe,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",Nature & Wildlife Areas,Couples


In [28]:
# User x attraction rating grid. Repeated (user, attraction) pairs are averaged,
# and pairs with no rating are left as NaN.
user_item_matrix = pd.pivot_table(
    df,
    values='Rating',
    index='UserId',
    columns='AttractionId',
    aggfunc='mean',
)

# Centre every user's ratings on that user's own mean, then fill the unrated
# cells with 0 -- i.e. treat them as "exactly average for this user".
user_means = user_item_matrix.mean(axis=1)
matrix_demeaned = user_item_matrix.sub(user_means, axis=0).fillna(0)

# Last expression: show (n_users, n_attractions).
user_item_matrix.shape



(33526, 30)

In [37]:
from sklearn.decomposition import TruncatedSVD

# Factorise the mean-centred matrix into 7 latent components
# (random_state fixed so the decomposition is reproducible).
svd = TruncatedSVD(n_components=7, random_state=42)
latent_matrix = svd.fit_transform(matrix_demeaned)

# Map the latent user factors back onto the attraction axis.
reconstructed_matrix = latent_matrix @ svd.components_

# Add each user's mean back (as a column vector so it broadcasts across
# attractions) and label rows/columns like the original rating grid.
predicted_ratings = pd.DataFrame(
    reconstructed_matrix + user_means.to_numpy()[:, None],
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)



In [44]:
def recommend_collaborative_based(user_id, predicted_ratings, original_df, top_n=10):
    """Recommend the unvisited attractions with the highest predicted rating for a user.

    Parameters
    ----------
    user_id
        Row label of the user in ``predicted_ratings``.
    predicted_ratings : pandas.DataFrame
        Predicted ratings with users as the index and attraction ids as columns.
    original_df : pandas.DataFrame
        Visit records; must contain the columns 'UserId', 'AttractionId'
        and 'Attraction'.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'AttractionId', 'PredictedRating', 'Attraction', ordered by
        'PredictedRating' descending, with at most ``top_n`` rows and one row
        per attraction.

    Raises
    ------
    KeyError
        If ``user_id`` has no row in ``predicted_ratings``.
    """
    try:
        user_predictions = predicted_ratings.loc[user_id]
    except KeyError:
        raise KeyError(
            f"user_id {user_id!r} has no row in predicted_ratings"
        ) from None

    # Attractions this user already has a visit record for.
    visited = original_df.loc[original_df['UserId'] == user_id, 'AttractionId'].unique()

    # errors='ignore': a visited attraction may not be a column of predicted_ratings.
    recommendations = user_predictions.drop(visited, errors='ignore')

    top_recommendations = recommendations.sort_values(ascending=False).head(top_n)

    result = pd.DataFrame({
        'AttractionId': top_recommendations.index,
        'PredictedRating': top_recommendations.values
    })

    # De-duplicate on the id alone: if the same AttractionId appears with more
    # than one name spelling, de-duplicating on (id, name) pairs would keep
    # several rows per id and the left merge would repeat that attraction,
    # returning more than top_n rows.
    attraction_names = original_df[['AttractionId', 'Attraction']].drop_duplicates(
        subset='AttractionId'
    )

    return result.merge(attraction_names, on='AttractionId', how='left')


In [45]:
# Collaborative-filtering recommendations for user 14 (top_n left at its default).
recommend_collaborative_based(
    user_id=14,
    predicted_ratings=predicted_ratings,
    original_df=df,
)


Unnamed: 0,AttractionId,PredictedRating,Attraction
0,650,4.512847,Sanur Beach
1,481,4.508844,Nusa Dua Beach
2,1297,4.504169,Yogyakarta Palace
3,1133,4.500641,Jomblang Cave
4,877,4.500206,Balekambang Beach
5,975,4.500167,Sempu Island
6,1238,4.500157,Sewu Temple
7,1137,4.500119,Kalibiru National Park
8,947,4.500094,Mount Semeru Volcano
9,920,4.500068,Jodipan Colorful Village


In [47]:
# One row per attraction, taking the first value seen for each descriptive column.
# (A previous df[[...]].drop_duplicates() assignment was removed: it was
# overwritten by this groupby before ever being used.)
#
# NOTE(review): in the df.head() preview, CityName/Country/Region/Continent hold
# values such as "Guildford, United Kingdom" for an attraction located in Ubud,
# Indonesia, and VisitMode differs from row to row for the same attraction.
# These look like visitor/visit attributes rather than attraction attributes, so
# 'first' picks whichever visit happens to come first -- confirm this is intended.
content_df = df.groupby('AttractionId').agg({
    'Attraction': 'first',
    'AttractionType': 'first',
    'CityName': 'first',
    'Country': 'first',
    'Region': 'first',
    'Continent': 'first',
    'VisitMode': 'first'
}).reset_index()


# Text profile per attraction: the descriptive fields joined with single spaces.
# Missing fields become empty strings so that one NaN cannot turn the whole
# profile into NaN (which the TF-IDF vectorizer would then reject).
feature_columns = ['AttractionType', 'CityName', 'Country',
                   'Region', 'Continent', 'VisitMode']
content_df['combined_features'] = (
    content_df[feature_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorise each attraction's text profile with TF-IDF, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(content_df['combined_features'])

# Pairwise attraction-to-attraction similarity; with a single argument the
# matrix is compared against itself, giving a square (n_attractions x n_attractions) result.
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
# Lookup table: AttractionId -> row position in content_df (and in cosine_sim).
indices = pd.Series(data=content_df.index, index=content_df['AttractionId'])

def recommend_content_based(attraction_id, top_n=5):
    """Return the attractions whose text profile is most similar to a given attraction.

    Reads the module-level ``indices`` (AttractionId -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``content_df``.

    Parameters
    ----------
    attraction_id
        AttractionId of the attraction to find neighbours for; must be a
        label of ``indices``.
    top_n : int, default 5
        Maximum number of similar attractions to return.

    Returns
    -------
    pandas.DataFrame
        The 'AttractionId' and 'Attraction' columns of ``content_df`` for the
        most similar attractions, most similar first. The query attraction
        itself is never included.

    Raises
    ------
    KeyError
        If ``attraction_id`` is not a label of ``indices``.
    """
    idx = indices[attraction_id]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # when another attraction has an identical profile (similarity tie at the
    # top) and a smaller row position, dropping "the first entry" would drop
    # that neighbour and return the query attraction itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so tied scores keep their row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    attraction_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return content_df.iloc[attraction_indices][['AttractionId', 'Attraction']]


In [50]:
# Attractions most similar to attraction 650 (top_n left at its default of 5).
recommend_content_based(attraction_id=650)

Unnamed: 0,AttractionId,Attraction
4,673,Seminyak Beach
13,913,Goa Cina Beach
0,369,Kuta Beach - Bali
9,841,Waterbom Bali
10,877,Balekambang Beach


In [51]:
import joblib

from pathlib import Path

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path("../models").mkdir(parents=True, exist_ok=True)

# Persist the fitted models and the precomputed lookup tables for later reuse.
# NOTE: these are pickle-based files -- only load them from a trusted source.
joblib.dump(svd, "../models/svd_model.pkl")
joblib.dump(predicted_ratings, "../models/predicted_ratings.pkl")
joblib.dump(tfidf, "../models/tfidf.pkl")
joblib.dump(cosine_sim, "../models/cosine_sim.pkl")
joblib.dump(content_df, "../models/content_df.pkl")
joblib.dump(indices, "../models/indices.pkl")


['../models/indices.pkl']