In [1]:
import pandas as pd
from scipy import stats
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [2]:
#Loading the dataset
hotels_df = pd.read_csv('Raw/Hotels_in_Canada copy.csv')

reviews_df = pd.read_csv('Raw/Hotels_in_Canada_Reviews copy.csv')


In [3]:
# Assuming 'ID' is the common attribute in both datasets
merged_df = hotels_df.merge(reviews_df, on='id', how='inner')
#print(merged_df.head())

In [4]:
# Identify and handle outliers in 'User rating' column using z-score
z_scores = stats.zscore(merged_df['user_rating'])
# Keep only rows with z-scores within 3 standard deviations
merged_df = merged_df[(z_scores < 3)] 

In [5]:
# Standardize the 'amenities' column by converting all text to lowercase
merged_df['amenities'] = merged_df['amenities'].str.lower()



In [6]:
# Remove duplicates and keep the first occurrence
merged_df.drop_duplicates(subset=['user_name', 'id'], keep='first', inplace=True)

# Create the User-Item Matrix
user_item_matrix = merged_df.pivot(index='user_name', columns='id', values='user_rating')

# Handle missing values by filling NaN with 0 (or any other appropriate imputation method)
user_item_matrix.fillna(0, inplace=True)


In [7]:

# Calculate similarity scores using cosine similarity
user_similarity_cosine = pd.DataFrame(cosine_similarity(user_item_matrix), index=user_item_matrix.index, columns=user_item_matrix.index)


  ret = a @ b


In [32]:
# print("user_item_matrix shape:", user_item_matrix.shape)
# print("user_item_matrix sample:")
# print(user_item_matrix.head())

# print("\nuser_item_sparse shape:", user_item_sparse.shape)
# print("user_item_sparse sample:")
# print(user_item_sparse[:5])  # Print the first 5 rows of the sparse matrix


In [8]:
# Calculate the neighborhood for each user
k_neighbors = 5
user_neighborhood = {}
for user in user_similarity_cosine.index:
    # Sort users based on similarity score in descending order
    similar_users = user_similarity_cosine.loc[user].sort_values(ascending=False)
    
    # Select the top k_neighbors users as the neighborhood or users above similarity threshold
    if k_neighbors is not None:
        neighborhood = similar_users.iloc[1:k_neighbors + 1].index 
    else:
        # Set a similarity threshold (e.g., 0.8) and include users above this threshold
        similarity_threshold = 0.5
        neighborhood = similar_users[similar_users > similarity_threshold].index
    
    user_neighborhood[user] = neighborhood

In [9]:
# Create a dictionary to store aggregated ratings for each user
user_aggregated_ratings = {}

# Iterate through each user and their selected neighborhood
for user, neighbors in user_neighborhood.items():
    # Get the similarity scores for the selected neighbors
    similarity_scores = user_similarity_cosine.loc[user, neighbors]
    
    # Get the ratings of hotels for the selected neighbors
    neighbor_ratings = user_item_matrix.loc[neighbors]
    
    # Calculate the weighted average ratings
    if similarity_scores.sum() !=0:
        weighted_avg_ratings = (similarity_scores.values.reshape(-1, 1) * neighbor_ratings.values).sum(axis=0) / similarity_scores.sum()
    else:
         weighted_avg_ratings = neighbor_ratings.sum(axis=0) / len(neighbors)  
         
        
    
    # Store the aggregated ratings in the dictionary
    user_aggregated_ratings[user] = weighted_avg_ratings

# Convert the dictionary to a DataFrame
aggregated_ratings_df = pd.DataFrame(user_aggregated_ratings, index=user_item_matrix.columns).T



In [10]:
# Print the aggregated ratings for a sample user
sample_user = '-foodie_watch-'  
print(aggregated_ratings_df.loc[sample_user])

id
1       0.0
2       0.0
3       0.0
4       0.0
5       0.0
       ... 
996     0.0
997     0.0
998     0.0
999     0.0
1000    0.0
Name: -foodie_watch-, Length: 625, dtype: float64


In [11]:
target_user = '-foodie_watch-'
target_neighborhood = user_neighborhood[target_user]
target_neighborhood_ratings = user_item_matrix.loc[target_neighborhood]



# Calculate the weighted average predicted ratings for unrated hotels
predicted_ratings = (user_similarity_cosine.loc[target_user, target_neighborhood]@ target_neighborhood_ratings.values) / user_similarity_cosine.loc[target_user, target_neighborhood].sum()

# Convert predicted_ratings to a Pandas Series 
predicted_ratings_series = pd.Series(predicted_ratings, index=target_neighborhood_ratings.columns)

# Get the hotels that the target user has already rated
rated_hotels = user_item_matrix.loc[target_user].dropna().index

# Filter out hotels that the target user has already rated
unrated_hotels = [hotel for hotel in predicted_ratings_series.index if hotel not in rated_hotels]

# Get the predicted ratings for unrated hotels
predicted_ratings_unrated = predicted_ratings_series.loc[unrated_hotels]

# Sort the hotels based on predicted ratings to get recommendations
top_recommendations = predicted_ratings_unrated.sort_values(ascending=False)


In [12]:

# Print the top-rated hotel recommendations
print("Top-rated hotel recommendations:")
print(top_recommendations)

print("Predicted ratings for unrated hotels:")
print(predicted_ratings_series)

Top-rated hotel recommendations:
Series([], dtype: float64)
Predicted ratings for unrated hotels:
id
1       0.0
2       0.0
3       0.0
4       0.0
5       0.0
       ... 
996     0.0
997     0.0
998     0.0
999     0.0
1000    0.0
Length: 625, dtype: float64
