In [2]:
# Importing necessary libraries
import json
import glob
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# from sklearn.metrics import r2_score
# from sklearn.cluster import KMeans


from surprise import Dataset, Reader, KNNBasic, SVD, NMF, KNNWithMeans, SVDpp
from surprise.model_selection import train_test_split
from surprise import accuracy as sup_accuracy
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.model_selection import cross_validate
from surprise import accuracy

import warnings

from DataProcessor import DataProcessor, PerformanceMetrics, recommend_amenities, recommend_attraction, recommend_country,  get_similar_attractions, RecommendationEngine

### Collaborative Filtering: 

#### 1. Read Data

In [3]:
#read travel info data
info = pd.read_csv('../data/test_travel_data.csv')
info.columns

Index(['id', 'type', 'subcategories', 'name', 'description', 'rating',
       'latitude', 'longitude', 'numberOfReviews', 'amenities', 'LowerPrice',
       'UpperPrice', 'Rank', 'Total', 'Location', 'RankingType', 'country',
       'city', 'regional_rating'],
      dtype='object')

In [4]:
#read review data
reviews = pd.read_csv('../data/test_reviews_data.csv', index_col=0)
reviews.reset_index(inplace=True)
reviews.columns

Index(['userId', 'placeid', 'rating', 'lang', 'text'], dtype='object')

In [29]:
reviews.head(5)

Unnamed: 0,userId,placeid,rating,lang,text
0,A87669AAD9DA05FFBD46F1334B329FFD,2189822,5,en,Gee is a passionate tour guide. The animals a...
1,9FFED7DDC68883BBB8F4024333970E9A,2189822,5,en,Lots to see. Easy to get to from the Safari Lo...
2,B5E56A483B579518DDD82A3DA0E94487,2189822,4,en,a kind of a zoo for injured and saved animals ...
3,882D0A6C7152105BB0D83C84F3CB160D,2189822,5,en,They do great rehabilitating injured animals. ...
4,203EBC7F3F51AAAA39A87D2E58842C76,2189822,5,en,"We took the guided tour from Isaac, who was gr..."


In [10]:
matched_reviews_info = info[info['id'].isin(reviews['placeid'])]
# print(matched_reviews_info)
merged_reviews_info = pd.merge(matched_reviews_info, reviews, left_on='id', right_on='placeid', how='inner')
# merged_reviews_info.columns
print(merged_reviews_info)
merged_reviews_info.to_csv('merged_reviews_info.csv', index=False)


             id        type       subcategories           name  \
0       7945044  ATTRACTION  Sights & Landmarks   Kuminda Farm   
1       7945044  ATTRACTION  Sights & Landmarks   Kuminda Farm   
2       7945044  ATTRACTION  Sights & Landmarks   Kuminda Farm   
3       7945044  ATTRACTION  Sights & Landmarks   Kuminda Farm   
4       7945044  ATTRACTION  Sights & Landmarks   Kuminda Farm   
...         ...         ...                 ...            ...   
31074  14903233  ATTRACTION      Nature & Parks  Mutanda Falls   
31075  14903233  ATTRACTION      Nature & Parks  Mutanda Falls   
31076  14903233  ATTRACTION      Nature & Parks  Mutanda Falls   
31077  14903233  ATTRACTION      Nature & Parks  Mutanda Falls   
31078  14903233  ATTRACTION      Nature & Parks  Mutanda Falls   

                                             description  rating_x  latitude  \
0      It's a small farm and we are into Argo tourism...       5.0    -20.98   
1      It's a small farm and we are into Argo t

In [20]:
# print(df.head(10))
info_attraction = info.loc[info['type'] == 'ATTRACTION']
info_attraction

Unnamed: 0,id,type,subcategories,name,description,rating,latitude,longitude,numberOfReviews,amenities,LowerPrice,UpperPrice,Rank,Total,Location,RankingType,country,city,regional_rating
0,7945044,ATTRACTION,Sights & Landmarks,Kuminda Farm,It's a small farm and we are into Argo tourism...,5.0,-20.98,27.25,22,bathroom only,141.0,281.0,1.0,3.0,Francistown,things to do,Botswana,Francistown,3.000000
1,1743605,ATTRACTION,"Casinos & Gambling, Fun & Games",Gaborone Sun,Botswana is where the white-hot Kalahari Deser...,4.0,-24.65,25.93,124,bathroom only,141.0,281.0,4.0,25.0,Gaborone,things to do,Botswana,Gaborone,6.250000
17,311243,ATTRACTION,"Sights & Landmarks, Nature & Parks",Tsodilo Hills,An important rock art site in the western Kala...,4.0,-18.77,21.75,85,bathroom only,141.0,281.0,1.0,1.0,Shakawe,things to do,Botswana,Shakawe,1.000000
22,2189822,ATTRACTION,Nature & Parks,CARACAL Biodiversity Center,The CARACAL World of Wildlife offers an unique...,4.5,-17.81,25.15,98,bathroom only,141.0,281.0,1.0,4.0,Kasane,things to do,Botswana,Kasane,4.000000
28,12948270,ATTRACTION,Nature & Parks,Chobe Crocodile Farm,"Crocodile Farm, we raise Nile Crocodiles, we o...",5.0,-17.80,25.23,5,bathroom only,141.0,281.0,3.0,4.0,Kasane,things to do,Botswana,Kasane,1.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13988,479685,ATTRACTION,Nature & Parks,Mosi-oa-Tunya National Park,This protected wildlife reserve is small at 66...,4.5,-17.93,25.86,356,bathroom only,141.0,281.0,3.0,19.0,Livingstone,things to do,Zambia,Livingstone,6.333333
13989,4310058,ATTRACTION,Nature & Parks,Parays Game Farm Wildlife Sanctuary,Interested in seeing/interacting with white li...,4.5,-11.75,28.72,6,bathroom only,141.0,281.0,14.0,33.0,Lusaka,things to do,Zambia,Lusaka,2.357143
14008,1837467,ATTRACTION,Nature & Parks,The Night Encounter,With the awe inspiring spectacle of a star-lad...,5.0,-17.85,25.85,3,bathroom only,141.0,281.0,16.0,19.0,Livingstone,things to do,Zambia,Livingstone,1.187500
14015,1985388,ATTRACTION,Nature & Parks,Chimfunshi Wildlife Orphanage,Chimfunshi Wildlife Orphanage Trust is one of ...,4.5,-12.53,27.86,48,bathroom only,141.0,281.0,1.0,2.0,Chingola,things to do,Zambia,Chingola,2.000000


#### 2. Preprocessing

In [5]:
# Convert object columns to categorical

info['type'] = info['type'].astype('category')
info['amenities'] = info['amenities'].astype('category')
info['subcategories'] = info['subcategories'].astype('category')

processor = DataProcessor(info)
clean_info = processor.clean_df

In [6]:
clean_info

Unnamed: 0,id,type,subcategories,name,description,rating,latitude,longitude,numberOfReviews,amenities,LowerPrice,UpperPrice,Rank,Total,Location,RankingType,country,city,regional_rating
0,7945044,ATTRACTION,Sights & Landmarks,Kuminda Farm,It's a small farm and we are into Argo tourism...,5.0,-20.98,27.25,22,bathroom only,141.000000,281.000000,1.0,3.0,Francistown,things to do,Botswana,Francistown,3.000000
1,1743605,ATTRACTION,"Casinos & Gambling, Fun & Games",Gaborone Sun,Botswana is where the white-hot Kalahari Deser...,4.0,-24.65,25.93,124,bathroom only,141.000000,281.000000,4.0,25.0,Gaborone,things to do,Botswana,Gaborone,6.250000
2,4162082,HOTEL,Specialty Lodging,Pelican Lodge & Camping,"The lodge is situated in Nata, Botswana. Clien...",3.0,-20.22,26.23,189,"['Pool', 'Internet', 'Free Internet', 'Free pa...",11533.000000,13361.000000,4.0,9.0,Nata,Specialty lodging,Botswana,Nata,2.250000
3,1068835,HOTEL,Bed and Breakfast,Serowe Hotel,"Set in beautiful gardens, the Serowe Hotel is ...",3.5,-22.41,26.72,79,"['Pool', 'Internet', 'Free parking', 'Restaura...",6751.000000,6892.000000,1.0,7.0,Serowe,Specialty lodging,Botswana,Serowe,7.000000
4,13477853,HOTEL,Specialty Lodging,Leopard Plains,Experience in the middle of the bushveld a lux...,4.5,-20.18,24.17,56,"['Pool', 'Restaurant', 'Wifi', 'Breakfast incl...",32911.000000,41632.000000,10.0,81.0,Maun,Specialty lodging,Botswana,Maun,8.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14245,6542212,HOTEL,Bed and Breakfast,Lima Garden Guest House,See why so many travelers make Lima Garden Gue...,2.5,-15.38,28.30,4,"['Pool', 'Room service', 'Restaurant', 'Air co...",13777.392428,23045.636246,46.0,130.0,Lusaka,Specialty lodging,Zambia,Lusaka,2.826087
14246,2666970,HOTEL,Bed and Breakfast,Manda Hill Lodge,"Born in South Africa and taken to the world, N...",1.5,-15.39,28.31,6,"['Internet', 'Kids Activities', 'Suites', 'Roo...",6329.000000,12658.000000,48.0,130.0,Lusaka,Specialty lodging,Zambia,Lusaka,2.708333
14247,8118425,HOTEL,Bed and Breakfast,BestBed Executive Lodge,See why so many travelers make BestBed Executi...,4.0,-12.97,28.62,1,"['Kids Activities', 'Room service', 'Free park...",13777.392428,23045.636246,9.0,21.0,Ndola,Specialty lodging,Zambia,Ndola,2.333333
14248,1999874,HOTEL,Specialty Lodging,Four Pillars Lodge,Four Pillars Lodge is one of the leading provi...,3.0,-15.41,28.33,14,"['Pool', 'Internet', 'Suites', 'Free Internet'...",10689.000000,10830.000000,63.0,156.0,Lusaka,Specialty lodging,Zambia,Lusaka,2.476190


#### 3. Modelling
Train SVD & KNN model based on known user's(userId) rating(rating_y) for some item(placeid), then select the top few rated items as recommendations by predicting ratings for all unrated items.


In [11]:

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_reviews_info[['userId', 'placeid', 'rating_y']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

print('---Training SVD---')
svd = SVD()
svd.fit(trainset)
svd_predictions = svd.test(testset)
svd_rmse = accuracy.rmse(svd_predictions)

print('---Training KNNWithMeans---')
knn = KNNWithMeans()
knn.fit(trainset)
knn_predictions = knn.test(testset)
knn_rmse = accuracy.rmse(knn_predictions)

print('---Validate SVD---')
svd_cv = cross_validate(SVD(), data, measures=['RMSE'], cv=5, verbose=True)
print('---Validate KNNWithMeans---')
knn_cv = cross_validate(KNNWithMeans(), data, measures=['RMSE'], cv=5, verbose=True)

# Suppose we need to generate recommendations for a certain user (user_id)user_id = '2A5836BDB7EDEEB43BF1F6AE6A9C28F7'
user_ratings = merged_reviews_info[merged_reviews_info['userId'] == user_id]

all_items = merged_reviews_info['placeid'].unique()
rated_items = user_ratings['placeid'].unique()
unrated_items = [item for item in all_items if item not in rated_items]

predictions = [(item, svd.predict(user_id, item).est) for item in unrated_items]
predictions.sort(key=lambda x: x[1], reverse=True)

top_n = 10
recommendations = predictions[:top_n]

print(f"SVD: Top {top_n} recommendations for user {user_id}: {recommendations}")


knn_predictions = [(item, knn.predict(user_id, item).est) for item in unrated_items]
knn_predictions.sort(key=lambda x: x[1], reverse=True)

knn_recommendations = knn_predictions[:top_n]

print(f"KNNWithMeans: Top {top_n} recommendations for user {user_id}: {knn_recommendations}")

svd_predictions = svd.test(testset)
knn_predictions = knn.test(testset)

svd_rmse = accuracy.rmse(svd_predictions)
knn_rmse = accuracy.rmse(knn_predictions)

print(f'SVD RMSE: {svd_rmse}')
print(f'KNNWithMeans RMSE: {knn_rmse}')

# Get similar users by finding nearest neighbors for group recommendation
user_inner_id = knn.trainset.to_inner_uid(user_id)  
neighbors = knn.get_neighbors(user_inner_id, k=3)  # Get the internal IDs of the K most similar neighbors

neighbors = [knn.trainset.to_raw_uid(inner_id) for inner_id in neighbors]

print(f"The 3 nearest neighbors of user {user_id} are: {neighbors}")

for neighbor in neighbors:
    neighbor_ratings = merged_reviews_info[merged_reviews_info['userId'] == neighbor]
    print(f"Ratings by neighbor {neighbor}:")
    print(neighbor_ratings)



---Training SVD---
RMSE: 0.6543
---Training KNNWithMeans---
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7257
---Validate SVD---
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6483  0.6412  0.6561  0.6448  0.6453  0.6471  0.0050  
Fit time          0.22    0.21    0.21    0.21    0.22    0.21    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    
---Validate KNNWithMeans---
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  F

NameError: name 'user_id' is not defined

In [49]:
# import pandas as pd
# from surprise import Dataset, Reader, KNNWithMeans, accuracy
# from surprise.model_selection import train_test_split, cross_validate
# # 将DataFrame转换为Surprise数据集格式

# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(merged_reviews_info[['userId', 'placeid', 'rating_y']], reader)

# trainset, testset = train_test_split(data, test_size=0.2)
#
# print('---Training KNNWithMeans---')
# sim_options = {
#     'name': 'cosine',
#     'user_based': True
# }
# knn = KNNWithMeans(k=3, sim_options=sim_options)  # 设定K值为3
# knn.fit(trainset)
# knn_predictions = knn.test(testset)
# knn_rmse = accuracy.rmse(knn_predictions)

# user_ratings = merged_reviews_info[merged_reviews_info['userId'] == user_id]
# all_items = merged_reviews_info['placeid'].unique()
# rated_items = user_ratings['placeid'].unique()
# unrated_items = [item for item in all_items if item not in rated_items]

# knn_predictions = [(item, knn.predict(user_id, item).est) for item in unrated_items]
# knn_predictions.sort(key=lambda x: x[1], reverse=True)

# print("All predictions:")
# for item, score in knn_predictions:
#     print(f"Item: {item}, Predicted Score: {score}")

# top_n = 10
# knn_recommendations = knn_predictions[:top_n]

# print(f"Top {top_n} recommendations for user {user_id} using KNNWithMeans: {knn_recommendations}")

# knn_predictions = knn.test(testset)

# knn_rmse = accuracy.rmse(knn_predictions)

# print(f'KNNWithMeans RMSE: {knn_rmse}')

# user_id = 'A38CEED23744265FCA6202C3869DFE93'
# user_inner_id = knn.trainset.to_inner_uid(user_id) 
# neighbors = knn.get_neighbors(user_inner_id, k=3) 

# neighbors = [knn.trainset.to_raw_uid(inner_id) for inner_id in neighbors]
# print(f"The 3 nearest neighbors of user {user_id} are: {neighbors}")

# for neighbor in neighbors:
#     neighbor_ratings = merged_reviews_info[merged_reviews_info['userId'] == neighbor]
#     print(f"Ratings by neighbor {neighbor}:")
#     print(neighbor_ratings)


---Training KNNWithMeans---
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.7376
All predictions:
Item: 24091522, Predicted Score: 4.691951896392229
Item: 23854639, Predicted Score: 4.691951896392229
Item: 20174710, Predicted Score: 4.691951896392229
Item: 15561037, Predicted Score: 4.691951896392229
Item: 19204589, Predicted Score: 4.691951896392229
Item: 23591002, Predicted Score: 4.691951896392229
Item: 14251907, Predicted Score: 4.691951896392229
Item: 20195008, Predicted Score: 4.691951896392229
Item: 7945044, Predicted Score: 4.0
Item: 1743605, Predicted Score: 4.0
Item: 311243, Predicted Score: 4.0
Item: 2189822, Predicted Score: 4.0
Item: 12948270, Predicted Score: 4.0
Item: 479156, Predicted Score: 4.0
Item: 5975965, Predicted Score: 4.0
Item: 7110471, Predicted Score: 4.0
Item: 8476619, Predicted Score: 4.0
Item: 8801391, Predicted Score: 4.0
Item: 9564618, Predicted Score: 4.0
Item: 20171007, Predicted Score: 4.0
Item: 8643447, Predicted S

### Content-based Filtering: 

It bases its recommendations on content description information (attractions/hotels/restaurant/locations) to visit, etc.).

The advantage of content-based recommendation is that it can provide accurate recommendations based on the user's personalized needs and does not require additional information such as user ratings. However, this algorithm requires high accuracy and completeness of (attractions/hotels/restaurant/locations) descriptions, and may not be effective for new users or cold (attractions/hotels/restaurant/locations) recommendations.

In [65]:
info = pd.read_csv('../data/test_travel_data.csv')

vectorization_columns = info[['name', 'subcategories', 'amenities']]

df = info[info["type"]== "ATTRACTION"]

df['combined_text'] = df['subcategories'] + " " + df['description'] + " " + df['amenities']

# Apply TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# tfidfv_matrix2=tfidfv2.fit_transform(info['description'])
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Find the attraction which is similar with "Kuminda Farm"
similar_attractions = get_similar_attractions('Kuminda Farm', df, cosine_sim_matrix)
print("Recommended Attractions:" )
print(similar_attractions)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_text'] = df['subcategories'] + " " + df['description'] + " " + df['amenities']


Recommended Attractions:
           id        type       subcategories                    name  \
958   2706237  ATTRACTION  Sights & Landmarks  The Heavenly Cathedral   
427  13293550  ATTRACTION  Sights & Landmarks               Positivos   

                                           description  rating  latitude  \
958                                                        5.0     27.89   
427  Cultural place in Mindelo with Live Music, Cin...     4.0     16.89   

     longitude  numberOfReviews      amenities  LowerPrice  UpperPrice  Rank  \
958      34.30             1087  bathroom only       141.0       281.0   1.0   
427     -24.99                4  bathroom only       141.0       281.0  12.0   

     Total         Location   RankingType     country             city  \
958   50.0  Sharm El Sheikh  things to do       Egypt  Sharm El Sheikh   
427   16.0          Mindelo  things to do  Cape Verde          Mindelo   

     regional_rating                                      comb