# TRAVEL  DESTINATION RECOMMENDATION SYSTEM
## Modelling

In [1]:
# Importing necessary libraries
import pandas as pd
import json
import glob
import re


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

import warnings

# Ignore future deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sns.set_style('darkgrid')

Step 1: Prepare the data

Load the sample data into a suitable data structure, such as a pandas DataFrame.
Preprocess the data if necessary, including handling missing values, converting categorical variables to numerical representations, and normalizing numerical features.
Step 2: Split the data

Split the data into training and testing sets. Typically, an 80-20 split is used, but you can adjust the ratio based on the size of your dataset.
Step 3: Choose recommendation models

There are several recommendation models you can choose from, depending on the nature of your data and the problem you want to solve. Here are a few popular models:
Collaborative Filtering: This approach recommends items based on users' past behavior and preferences.
Content-Based Filtering: This approach recommends items based on the similarity between items' characteristics and users' preferences.
Matrix Factorization: This approach decomposes the user-item rating matrix to find latent factors and make recommendations.
Neural Networks: You can also use deep learning models like neural networks for recommendation tasks.
Step 4: Train and evaluate the models

For each model you choose, train it using the training set.
Evaluate the trained model's performance using appropriate evaluation metrics such as precision, recall, or Mean Average Precision (MAP).
Repeat the training and evaluation process for each model.

Step 5: Choose the best model

Compare the performance of the different models based on the evaluation metrics.
Select the model that performs best according to your evaluation criteria.

Step 6: Fine-tune and optimize the chosen model

Once you have selected the best model, you can further fine-tune and optimize its hyperparameters using techniques like cross-validation or grid search.

Step 7: Deploy the recommendation system

Once you are satisfied with the performance of your chosen and optimized model, you can deploy it to make real-time recommendations.

In [2]:
#loading 'clean_data' into df
clean_df = pd.read_csv('Data/clean_data.csv')

In [3]:
clean_df.columns

Index(['id', 'type', 'subcategories', 'name', 'locationString', 'description',
       'rating', 'latitude', 'longitude', 'numberOfReviews', 'amenities',
       'LowerPrice', 'UpperPrice', 'RankingType', 'Rank', 'Total',
       'regional_rating', 'country', 'city'],
      dtype='object')

In [4]:
clean_df.shape

(14484, 19)

### 1. Prepare the data

##### * Dealing with outliers in the numerical columns

In [5]:
# Select the numerical features for clustering
numerical_columns = clean_df.select_dtypes(include=[np.number]).columns
numerical_data = clean_df[numerical_columns]

# Apply K-means clustering
kmeans = KMeans(n_clusters=3)  # Specify the number of clusters
kmeans.fit(numerical_data)

# Assign each data point to a cluster
labels = kmeans.labels_

# Identify the cluster with the outliers
outlier_cluster = np.argmax(np.bincount(labels))

# Remove the rows belonging to the outlier cluster
clean_df = clean_df[labels != outlier_cluster]

#### * Cleaning and transforming textual data

In [6]:
textual_data = clean_df[['subcategories', 'RankingType', 'locationString', 'country', 'city', 'amenities']]
textual_data

Unnamed: 0,subcategories,RankingType,locationString,country,city,amenities
0,Specialty Lodging,Specialty lodging,"Rumangabo, North Kivu Province",Democratic Republic of the Congo,Rumangabo,"Restaurant, Mountain View"
3,Bed and Breakfast,Specialty lodging,Kinshasa,Democratic Republic of the Congo,Kinshasa,"Internet, Room service, Free Internet, Free pa..."
12,Bed and Breakfast,Specialty lodging,"Goma, North Kivu Province",Democratic Republic of the Congo,Goma,"Restaurant, Kids Activities, Suites, Room serv..."
14,Hotel,hotels,Kinshasa,Democratic Republic of the Congo,Kinshasa,"Internet, Room service, Free Internet, Free pa..."
16,Hotel,hotels,Kinshasa,Democratic Republic of the Congo,Kinshasa,"Kids Activities, Free parking, Restaurant, Bar..."
...,...,...,...,...,...,...
14477,Specialty Lodging,Specialty lodging,"Rabil, Boa Vista",Cape Verde,Rabil,"Internet, Suites, Free Internet, Free parking,..."
14478,Bed and Breakfast,Specialty lodging,"Praia, Santiago",Cape Verde,Praia,"Internet, Kids Activities, Room service, Free ..."
14479,Specialty Lodging,Specialty lodging,"Sal Rei, Boa Vista",Cape Verde,Sal Rei,"Internet, Free Internet, Free parking, Kitchen..."
14482,Bed and Breakfast,Specialty lodging,"Sao Filipe, Fogo",Cape Verde,Sao Filipe,"Beachfront, Room service, Free parking, Restau..."


In [7]:
# Convert object columns to categorical
clean_df['type'] = clean_df['type'].astype('category')
clean_df['amenities'] = clean_df['amenities'].astype('category')
clean_df['subcategories'] = clean_df['subcategories'].astype('category')
#clean_df['locationString'] = clean_df['locationString'].astype('category')

In [8]:
# Create a list of unique values in the column
unique_subcategory_values = list(clean_df["subcategories"].unique())

# Create a dictionary that maps each unique value to a unique number
subcategory_map = {}
for index, value in enumerate(unique_subcategory_values):
    subcategory_map[value] = index + 1
    
# Create a new column with the encoded values
clean_df['subcategories_mapped'] = clean_df['subcategories'].map(subcategory_map)

In [9]:
# Create a list of unique values in the column
unique_ammenities_values = list(clean_df["amenities"].unique())

# Create a dictionary that maps each unique value to a unique number
amenities_mapping = {}
for index, value in enumerate(unique_ammenities_values):
    amenities_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["amenities_mapped"] = clean_df["amenities"].map(amenities_mapping)

In [10]:
# Create a list of unique values in the column
unique_RankingType_values = list(clean_df["RankingType"].unique())

# Create a dictionary that maps each unique value to a unique number
RankingType_mapping = {}
for index, value in enumerate(unique_RankingType_values):
    RankingType_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["RankingType_mapped"] = clean_df["RankingType"].map(RankingType_mapping)

In [11]:
# Create a list of unique values in the column
unique_RankingType_values = list(clean_df["RankingType"].unique())

# Create a dictionary that maps each unique value to a unique number
RankingType_mapping = {}
for index, value in enumerate(unique_RankingType_values):
    RankingType_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["RankingType_mapped"] = clean_df["RankingType"].map(RankingType_mapping)

In [12]:
# Create a list of unique values in the column
unique_RankingType_values = list(clean_df["RankingType"].unique())

# Create a dictionary that maps each unique value to a unique number
RankingType_mapping = {}
for index, value in enumerate(unique_RankingType_values):
    RankingType_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["RankingType_mapped"] = clean_df["RankingType"].map(RankingType_mapping)

In [13]:
# Create a list of unique values in the column
unique_country_values = list(clean_df["country"].unique())

# Create a dictionary that maps each unique value to a unique number
country_mapping = {}
for index, value in enumerate(unique_country_values):
    country_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["country_mapped"] = clean_df["country"].map(country_mapping)

In [14]:
# Create a list of unique values in the column
unique_type_values = list(clean_df["type"].unique())

# Create a dictionary that maps each unique value to a unique number
type_mapping = {}
for index, value in enumerate(unique_type_values):
    type_mapping[value] = index + 1

# Use the map() function to map the values in the column to their respective numbers
clean_df["type_mapped"] = clean_df["type"].map(type_mapping)

In [15]:
clean_df.head()

Unnamed: 0,id,type,subcategories,name,locationString,description,rating,latitude,longitude,numberOfReviews,...,Rank,Total,regional_rating,country,city,subcategories_mapped,amenities_mapped,RankingType_mapped,country_mapped,type_mapped
0,8661504,HOTEL,Specialty Lodging,Bukima Tented Camp,"Rumangabo, North Kivu Province",Just outside the Virunga National Park boundar...,4.5,-1.38,29.43,34,...,2.0,3.0,1.5,Democratic Republic of the Congo,Rumangabo,1,1,1,1,1
3,12274281,HOTEL,Bed and Breakfast,Ixoras Hotel,Kinshasa,"Located in Kinshasa, 10 km from Mbatu Museum, ...",5.0,-4.35,15.33,1,...,9.0,67.0,7.444444,Democratic Republic of the Congo,Kinshasa,2,2,1,1,1
12,6865307,HOTEL,Bed and Breakfast,Cap Kivu Hotel,"Goma, North Kivu Province",Cap Kivu Hotel is an excellent choice for trav...,3.5,-1.68,29.21,27,...,3.0,17.0,5.666667,Democratic Republic of the Congo,Goma,2,3,1,1,1
14,10868306,HOTEL,Hotel,Hotel Selton,Kinshasa,Hotel selton is the new concept of hotel in th...,4.0,-4.36,15.21,18,...,10.0,43.0,4.3,Democratic Republic of the Congo,Kinshasa,3,4,2,1,1
16,12237149,HOTEL,Hotel,Hotel Bella Riva,Kinshasa,Finding an ideal budget friendly hotel in Kins...,3.5,-4.3,15.3,3,...,19.0,43.0,2.263158,Democratic Republic of the Congo,Kinshasa,3,5,2,1,1


### * Normalization and Standardization

In [16]:
# Select the numerical columns for normalization
numerical_columns = ['rating', 'Rank', 'Total', 'regional_rating', 'LowerPrice', 'UpperPrice']

# Normalize the numerical columns
scaler = minmaxscaler()
normalized_data = clean_df.copy()
normalized_data[numerical_columns] = scaler(clean_df[numerical_columns])

NameError: name 'normalize' is not defined

In [None]:
clean_df.columns

### Baseline Model

In [17]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'Rank']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the model
model = KNNBasic(random_state=42)
model.fit(trainset)

# Evaluate the model
predictions = model.test(testset)
accuracy = accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 43.7478


Root Mean Square Error (RMSE) is a measure of the model's prediction accuracy. In the context of recommendation systems, it quantifies the average difference between the predicted ratings and the actual ratings given by the users. A lower RMSE value indicates better model performance. In this case, the RMSE is 41.8541, which suggests that the model's predictions have a relatively high level of error.

In [18]:
for prediction in predictions1:
    print(f"Predicted rating: {prediction.est:.2f}")
    print(f"Actual rating: {prediction.r_ui:.2f}")
    print("---")

NameError: name 'predictions1' is not defined

In [None]:
threshold = 3  # Define the threshold for positive predictions

true_positives = 0
false_positives = 0
false_negatives = 0

for prediction in predictions1:
    if prediction.est >= threshold:
        if prediction.r_ui >= threshold:
            true_positives += 1
        else:
            false_positives += 1
    elif prediction.r_ui >= threshold:
        false_negatives += 1

precision1 = true_positives / (true_positives + false_positives)
recall1 = true_positives / (true_positives + false_negatives)

print(f"Precision: {precision1:.2f}")
print(f"Recall: {recall1:.2f}")

Precision: Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. It indicates how accurate the model is when it predicts positive instances. A precision score of 0.76 means that 76% of the instances predicted as positive were actually positive.

Recall: Recall, also known as sensitivity or true positive rate, measures the proportion of correctly predicted positive instances out of all actual positive instances. It indicates how well the model captures the positive instances. A recall score of 1.00 means that the model successfully identified all positive instances.

### Model 2

>>>> SVD

In [19]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'Rank']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model
model2 = SVD(random_state=42)
model2.fit(trainset)

# Evaluate the model
predictions2 = model2.test(testset)
accuracy2 = sup_accuracy.rmse(predictions2)

NameError: name 'sup_accuracy' is not defined

RMSE of 44.0078 means that, on average, the predictions made by the model have an error of approximately 44.0078 units. The RMSE gives you an idea of how well your model's predictions align with the true values. The lower the RMSE, the better the model's performance.

To further evaluate the significance of the RMSE value, it's important to consider the scale and context of your specific problem. Additionally, comparing the RMSE to the range of the target variable can provide insights into the relative performance of the model.

In [None]:
for prediction in predictions2:
    print(f"Predicted rating: {prediction.est:.2f}")
    print(f"Actual rating: {prediction.r_ui:.2f}")
    print("---")

In the code below, we will iterate over the predictions and increment the corresponding counters based on the predicted ratings and actual ratings. Then, we calculate precision by dividing the number of true positives by the sum of true positives and false positives. Recall is calculated by dividing the number of true positives by the sum of true positives and false negatives.

Note that this calculation assumes a binary classification problem where ratings above the threshold are considered positive and ratings below the threshold are considered negative. 

In [None]:
threshold = 4  # Define the threshold for positive predictions

true_positives = 0
false_positives = 0
false_negatives = 0

for prediction in predictions:
    if prediction.est >= threshold:
        if prediction.r_ui >= threshold:
            true_positives += 1
        else:
            false_positives += 1
    elif prediction.r_ui >= threshold:
        false_negatives += 1

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. It indicates how accurate the model is when it predicts positive instances. A precision score of 0.77 means that 77% of the instances predicted as positive were actually positive.

Recall, also known as sensitivity or true positive rate, measures the proportion of correctly predicted positive instances out of all actual positive instances. It indicates how well the model captures the positive instances. A recall score of 1.00 means that the model successfully identified all positive instances.

### Model 3

>>>> KNNBasic

In [None]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'regional_rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model
model3 = KNNBasic(random_state=42)
model3.fit(trainset)

# Evaluate the model
predictions3 = model3.test(testset)
accuracy3 = sup_accuracy.rmse(predictions3)

threshold = 3  # Define the threshold for positive predictions

true_positives = 0
false_positives = 0
false_negatives = 0

for prediction in predictions3:
    if prediction.est >= threshold:
        if prediction.r_ui >= threshold:
            true_positives += 1
        else:
            false_positives += 1
    elif prediction.r_ui >= threshold:
        false_negatives += 1

precision3 = true_positives / (true_positives + false_positives)
recall3 = true_positives / (true_positives + false_negatives)

print(f"Precision: {precision3:.2f}")
print(f"Recall: {recall3:.2f}")

the RMSE value suggests that the model's predictions have an average deviation of 67.3796 from the actual ratings.
A precision value of 0.70 means that out of all the recommendations predicted as positive by the model, 70% of them are actually relevant or accurate.
A recall value of 1.00 means that out of all the actual positive recommendations, the model is able to identify and predict 100% of them accurately.

#### Model 4

>>>> SVD

In [None]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'regional_rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model
model4 = SVD(random_state=42)
model4.fit(trainset)

# Evaluate the model
predictions4 = model4.test(testset)
accuracy4 = sup_accuracy.rmse(predictions4)

threshold = 3  # Define the threshold for positive predictions

true_positives = 0
false_positives = 0
false_negatives = 0

for prediction in predictions4:
    if prediction.est >= threshold:
        if prediction.r_ui >= threshold:
            true_positives += 1
        else:
            false_positives += 1
    elif prediction.r_ui >= threshold:
        false_negatives += 1

precision4 = true_positives / (true_positives + false_positives)
recall4 = true_positives / (true_positives + false_negatives)

print(f"Precision: {precision4:.2f}")
print(f"Recall: {recall4:.2f}")

The RMSE value suggests that the model's predictions have an average deviation of ---- from the actual ratings. A precision value of ---- means that out of all the recommendations predicted as positive by the model, ----% of them are actually relevant or accurate. A recall value of ---- means that the model is able to identify and predict all of the actual positive recommendations accurately.

#### Model 5

>>>> NMF

In [None]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'regional_rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model
model5 = NMF(random_state=42)
model5.fit(trainset)

# Evaluate the model
predictions5 = model5.test(testset)
accuracy5 = sup_accuracy.rmse(predictions5)

threshold = 3  # Define the threshold for positive predictions

true_positives = 0
false_positives = 0
false_negatives = 0

for prediction in predictions5:
    if prediction.est >= threshold:
        if prediction.r_ui >= threshold:
            true_positives += 1
        else:
            false_positives += 1
    elif prediction.r_ui >= threshold:
        false_negatives += 1

precision5 = true_positives / (true_positives + false_positives)
recall5 = true_positives / (true_positives + false_negatives)

print(f"Precision: {precision5:.2f}")
print(f"Recall: {recall5:.2f}")

The RMSE value suggests that the model's predictions have an average deviation of ---- from the actual ratings. A precision value of ---- means that out of all the recommendations predicted as positive by the model, ----% of them are actually relevant or accurate. A recall value of ---- means that the model is able to identify and predict all of the actual positive recommendations accurately.

#### Model 6

>>>> KNNWithMeans

In [None]:
# model with KNNwithMeans
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'subcategories', 'rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Define the item-based collaborative filtering model
model6 = KNNWithMeans(sim_options={'user_based': False})

# Train the model
model6.fit(trainset)

# Make predictions on the test set
predictions6 = model6.test(testset)

# Evaluate the model using RMSE
rmse_score6 = sup_accuracy.rmse(predictions6)
#print("RMSE:", rmse_score6)

The root mean squared error (RMSE) for the predictions on the test set is 0.7981. RMSE is a measure of the difference between the predicted ratings and the actual ratings, with lower values indicating better performance.

#### Model 7

In [None]:
# Load the data into Surprise Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(clean_df[['id', 'rating', 'regional_rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the model
model7 = KNNBasic(random_state=42)
model7.fit(trainset)

# Evaluate the model
predictions7 = model7.test(testset)

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings = [pred.r_ui for pred in predictions7]
predicted_ratings = [pred.est for pred in predictions7]

# Calculate the R-squared value
r_squared = r2_score(actual_ratings, predicted_ratings)

# Calculate the R-squared value using Surprise's accuracy module
# r_squared = accuracy.rsquared(predictions)
# Print the R-squared value
print("R-squared:", r_squared) 

The R-squared value of -0.0697 suggests that the model's predictions do not explain much of the variance in the ratings. A negative R-squared value indicates that the model performs worse than a horizontal line (a model that predicts the average rating for all items). You may need to investigate further and consider other evaluation metrics to assess the performance of your recommendation model.

We will check the other models performance based on the r squared metric to see how they performed. 

### Evaluation

In [None]:
# Extract the actual ratings and predicted ratings from the predictions
actual_ratings1 = [pred.r_ui for pred in predictions1]
predicted_ratings1 = [pred.est for pred in predictions1]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings2 = [pred.r_ui for pred in predictions2]
predicted_ratings2 = [pred.est for pred in predictions2]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings3 = [pred.r_ui for pred in predictions3]
predicted_ratings3 = [pred.est for pred in predictions3]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings4 = [pred.r_ui for pred in predictions4]
predicted_ratings4 = [pred.est for pred in predictions4]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings5 = [pred.r_ui for pred in predictions5]
predicted_ratings5 = [pred.est for pred in predictions5]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings6 = [pred.r_ui for pred in predictions6]
predicted_ratings6 = [pred.est for pred in predictions6]

# Extract the actual ratings and predicted ratings from the predictions
actual_ratings7 = [pred.r_ui for pred in predictions7]
predicted_ratings7 = [pred.est for pred in predictions7]

# List of predictions and corresponding names
prediction_sets = [
    (predictions1, "Predictions 1"),
    (predictions2, "Predictions 2"),
    (predictions3, "Predictions 3"),
    (predictions4, "Predictions 4"),
    (predictions5, "Predictions 5"),
    (predictions6, "Predictions 6"),
    (predictions7, "Predictions 7")
]

# Iterate over the prediction sets
for predictions, name in prediction_sets:
    # Extract the actual ratings and predicted ratings from the predictions
    actual_ratings = [pred.r_ui for pred in predictions]
    predicted_ratings = [pred.est for pred in predictions]

    # Print the results
    print("Results for", name)
    print("Actual Ratings:", actual_ratings)
    print("Predicted Ratings:", predicted_ratings)
    print()

In [None]:
# List of actual ratings and predicted ratings
actual_ratings_list = [actual_ratings1, actual_ratings2, actual_ratings3, actual_ratings4, actual_ratings5, actual_ratings6, actual_ratings7 ]
predicted_ratings_list = [predicted_ratings1, predicted_ratings2, predicted_ratings3, predicted_ratings4, predicted_ratings5, predicted_ratings6, predicted_ratings7]

# Loop through the ratings lists
for i in range(len(actual_ratings_list)):
    actual_ratings = actual_ratings_list[i]
    predicted_ratings = predicted_ratings_list[i]
    
    # Calculate the R-squared value
    r_squared = r2_score(actual_ratings, predicted_ratings)
    
    # Print the R-squared value
    print(f"R-squared for Set {i+1}: {r_squared}")

R-squared measures the proportion of the variance in the dependent variable that is predictable from the independent variables. A higher value indicates a better fit of the model to the data. In this case, the R-squared values are negative, which suggests that the model does not fit the data well and may not be providing meaningful predictions.

#### Unclear Modelling section
Seek clarification (IAN)

In [None]:
# creating a relevant columns from the above dataset 
vectorization_columns = clean_df[['name', 'subcategories', 'amenities']]
vectorization_columns

In [None]:
# Convert relevant data into a list of strings
documents = []
for _, row in vectorization_columns.iterrows():
    name = row['name']
    subcategories = row['subcategories']
    amenities = row['amenities']
    doc = f"{name} {subcategories} {amenities}"
    documents.append(doc)

# Apply TF-IDF vectorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

In [None]:
# Compute cosine similarity matrix
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def get_item_recommendations(item_index, cosine_similarities, top_n=5):
    # Get similarity scores for the item
    item_scores = list(enumerate(cosine_similarities[item_index]))

    # Sort items based on similarity scores
    item_scores = sorted(item_scores, key=lambda x: x[1], reverse=True)

    # Get top-N similar items
    top_items = item_scores[1 : top_n + 1]  # Exclude the item itself

    return top_items

# Get recommendations for a specific item (e.g., item with index 0)
item_index = 0
recommendations = get_item_recommendations(item_index, cosine_similarities)

# Print the top 5 recommendations
for item_id, similarity in recommendations:
    print(f"Item ID: {item_id}, Similarity: {similarity}")

#### Model 8 (Part of Unclear section)

In [None]:
# Construct the TF-IDF Matrix
tfidfv2=TfidfVectorizer(analyzer='word', stop_words='english')
tfidfv_matrix2=tfidfv2.fit_transform(clean_df['amenities'])
print(tfidfv_matrix2.todense())
tfidfv_matrix2.todense().shape

In [None]:
# Calculate similarity matrix
cosine_sim2 = cosine_similarity(tfidfv_matrix2, tfidfv_matrix2)

In [None]:
# Create a Pandas Series to map movie titles to their indices
indices = pd.Series(data = list(clean_df.index), index = clean_df['name'])
indices

In [None]:
def recommend_place(name, cosine_sim2, data):
    # Create a dictionary to map movie titles to their indices
    indices = {title: index for index, title in enumerate(clean_df['name'])}

    # Get the index of the movie that matches the title
    idx = indices[name]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim2[idx]))

    # Sort the movies based on the similarity scores
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    indices = [x for x, _ in sim_scores]

    # Return the top 10 most similar movies
    recommended_place = clean_df.iloc[indices]['name']
    return recommended_place

In [None]:
recommend_place("St. Catherine's Monastery Guesthouse", cosine_sim2, clean_df)

In [None]:
def recommend_amenities(amenity, cosine_sim2, data):
    # Create a dictionary to map movie titles to their indices
    indices = {title: index for index, title in enumerate(clean_df['amenities'])}

    # Get the index of the movie that matches the title
    idx = indices[amenities]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim2[idx]))

    # Sort the movies based on the similarity scores
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    indices = [x for x, _ in sim_scores]

    # Return the top 10 most similar movies
    recommended_amenities = clean_df.iloc[indices]['amenities']
    return recommended_amenities

In [None]:
def recommend_amenities(amenity, cosine_sim2, data):
    # Create a dictionary to map amenity titles to their indices
    indices = {title: index for index, title in enumerate(clean_df['amenities'])}

    # Get the index of the amenity that matches the title
    idx = indices[amenity]

    # Get the pairwise similarity scores of all amenities with that amenity
    sim_scores = list(enumerate(cosine_sim2[idx]))

    # Sort the amenities based on the similarity scores
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar amenities
    sim_scores = sim_scores[1:11]

    # Get the amenity indices
    indices = [x for x, _ in sim_scores]

    # Return the top 10 most similar amenities
    recommended_amenities = clean_df.iloc[indices]['amenities']
    return recommended_amenities

# Evaluation
def evaluate_recommendation_system(test_set, cosine_sim, data):
    # Initialize evaluation metrics
    accuracy = 0
    rmse = 0
    precision = 0
    recall = 0
    total_test_cases = len(test_set)

    # Iterate over each test case
    for test_case in test_set:
        ground_truth_amenity = test_case['ground_truth_amenity']
        predicted_amenities = recommend_amenities(ground_truth_amenity, cosine_sim, data)
        
        # Evaluate metrics for the current test case
        # Compare predicted_amenities with ground truth amenities
        # Calculate accuracy, RMSE, precision, and recall

    # Calculate average metrics
    accuracy /= total_test_cases
    rmse = np.sqrt(rmse / total_test_cases)
    precision /= total_test_cases
    recall /= total_test_cases

    # Return the evaluation metrics
    return accuracy, rmse, precision, recall

# Test set with ground truth amenities
test_set = [
    {'ground_truth_amenity': 'Restaurant'},
    {'ground_truth_amenity': 'Pool'},
    # Add more test cases here...
]

# Calculate evaluation metrics
accuracy, rmse, precision, recall = evaluate_recommendation_system(test_set, cosine_sim2, data)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("RMSE:", rmse)
print("Precision:", precision)
print("Recall:", recall)

#### End of unclear section

## Model Selection

In [None]:
class RecommenderSystem:
    def __init__(self, clean_df, tfidfv_matrix2, cosine_sim2, cosine_similarities, indices):
        self.clean_df = clean_df
        self.tfidfv_matrix2 = tfidfv_matrix2
        self.cosine_sim2 = cosine_sim2
        self.cosine_similarities = cosine_similarities
        self.indices = indices

    def recommend_attraction(self, rating_threshold):
        # Filter the DataFrame based on the rating threshold
        recommendations = self.clean_df[self.clean_df['rating'] > rating_threshold][['name', 'LowerPrice', 'UpperPrice','amenities', 'type', 'country']]

        # Reset the index of the recommendations DataFrame
        recommendations.reset_index(drop=True, inplace=True)

        return recommendations

    def recommend_amenities(self, query):
        # Check if the specified amenity exists in the dataset
        if query not in self.clean_df['amenities'].str.join(', '):
            st.error(f"Error: '{query}' does not exist in the dataset.")
            return None

        # Convert the string representation of amenities back into a list
        self.clean_df['amenities'] = self.clean_df['amenities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        # Get the index of the specified amenity
        indices = self.clean_df['amenities'].apply(lambda x: query in x if isinstance(x, list) else False)

        # Get the pairwise similarity scores of all items with the specified amenity
        sim_scores = self.cosine_sim2[indices]

        # Flatten the similarity scores
        sim_scores = sim_scores.flatten()

        # Get the indices of the sorted similarity scores
        indices = np.argsort(sim_scores)[::-1]

        # Get the sorted similarity scores
        sim_scores = sim_scores[indices]

        # Get the recommended items
        recommended_items = self.clean_df.iloc[indices]

        return recommended_items

    def recommend_place(self, name):
        # Create a dictionary to map place names to their indices
        indices = {title: index for index, title in enumerate(self.clean_df['name'])}

        # Check if the specified place exists in the dataset
        if name not in indices:
            st.error(f"Error: '{name}' does not exist in the dataset.")
            return None

        # Get the index of the specified place
        idx = indices[name]

        # Get the pairwise similarity scores of all places with the specified place
        sim_scores = list(enumerate(self.cosine_similarities[idx]))

        # Sort the places based on the similarity scores
        sim_scores.sort(key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar places
        sim_scores = sim_scores[1:11]

        # Get the indices of the top-N similar places
        indices = [x for x, _ in sim_scores]

        # Get the recommended places
        recommended_places = self.clean_df.iloc[indices]['name']

        return recommended_places

    def get_item_recommendations(self, item_index, top_n=5):
        # Get similarity scores for the item
        item_scores = list(enumerate(self.cosine_similarities[item_index]))

        # Sort items based on similarity scores
        item_scores = sorted(item_scores, key=lambda x: x[1], reverse=True)

        # Get top-N similar items
        top_items = item_scores[1:top_n + 1]  # Exclude the item itself

        return top_items

In [None]:
# Needs clarification (IAN)
hybrid = RecommenderSystem('clean_df', 'tfidfv_matrix2', 'cosine_sim2', 'cosine_similarities', 'indices')

In [None]:
# Needs Clarification (IAN)
recommend_place('Excalibur Boutique Hotel', cosine_sim2, clean_df)

## Tuning

## Deployment

In [None]:
#pickling model

## Conclusion and Recommendations