In [3]:
from surprise import Dataset
import os
from surprise import Reader
from surprise import SVD  # SVD stands for Singular Value Decomposition, also an MF-based method like ALS
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import numpy as np

In [4]:
folder_path = r'D:\Datasets'

review_file = 'yelp_academic_dataset_review.csv'
business_file = 'yelp_academic_dataset_business.csv'

review_path = os.path.join(folder_path, review_file)
business_path = os.path.join(folder_path, business_file)

#Loading the Review CSV file into a DataFrame
df_review = pd.read_csv(review_path, usecols=['user_id', 'business_id', 'stars'], nrows = 10000)

#Loading the Business CSV file into another DataFrame
df_business = pd.read_csv(business_path, usecols=['business_id', 'name'], nrows = 10000)

#Merging the two DataFrames based on the 'business_id' column
merged_df = pd.merge(df_review, df_business, on='business_id', how='inner')

new_file = 'business_reviews.csv'

file_path = os.path.join(folder_path, new_file)

#Saving the merged DataFrame to a new business_reviews CSV file
merged_df.to_csv(file_path, index=False)

In [5]:
#Loading the new CSV file into a Pandas DataFrame
data = pd.read_csv(file_path)

In [6]:
data.head

<bound method NDFrame.head of                      user_id             business_id  stars  \
0     mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw    3.0   
1     Iaee7y6zdSB3B-kRCo4z1w  XQfwVwDr-v0ZS3_CbbE5Xw    2.0   
2     ejFxLGqQcWNLdNByJlIhnQ  XQfwVwDr-v0ZS3_CbbE5Xw    4.0   
3     f7xa0p_1V9lx53iIGN5Sug  XQfwVwDr-v0ZS3_CbbE5Xw    3.0   
4     _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA    5.0   
...                      ...                     ...    ...   
6767  k9Mvnw4aglONCaS5cBKQnw  yqBU6jO9e988dwxBkstJFQ    5.0   
6768  RkmOFFHv1vharP_8ARd-Qg  iFzxAYaZYVNr9OLaek1cYg    5.0   
6769  rUs9xSR1LASkh6_grbQlHg  F_KRsGlJJSS7_N2xOErJDw    1.0   
6770  a0REzCBCztw5pS0_BOTa6A  hxsdgbsEQwWUZq91Mx9MCw    3.0   
6771  wDNDsaLICPKLFATB41qZ6g  K_L_w0Hv47TLlpNJ2R14zA    4.0   

                                     name  
0            Turning Point of North Wales  
1            Turning Point of North Wales  
2            Turning Point of North Wales  
3            Turning Point of North W

In [7]:
print(data.columns)

Index(['user_id', 'business_id', 'stars', 'name'], dtype='object')


# Defining a Reader object specifying the rating scale

In [8]:
reader = Reader(rating_scale=(1, 5))  #the rating scale we have is from 1 to 5

In [9]:
#Loading the dataset from the DataFrame
data_loading = Dataset.load_from_df(data[['user_id', 'business_id','stars']], reader)

# Splitting the dataset and Training the SVD model

In [10]:
#Splitting the data into training and testing sets
trainset, testset = train_test_split(data_loading, test_size=0.2)  # You can adjust the test_size

In [11]:
#Defining the SVD model
model = SVD()

#Training the model on the training set
model.fit(trainset)

#Making the predictions on the test set
predictions = model.test(testset)

# Performing Cosine Similiarity

In [12]:
#doing predictive rating and actual ratings

predicted_ratings = [pred.est for pred in predictions]
actual_ratings = [pred.r_ui for pred in predictions]

In [13]:
#Calculating Cosine Similarity between precitive rating and actual rating

from sklearn.metrics.pairwise import cosine_similarity

#Converting the lists to NumPy arrays
predicted_ratings_array = np.array([predicted_ratings])
actual_ratings_array = np.array([actual_ratings])

#Reshaping arrays
predicted_ratings_array = predicted_ratings_array.reshape(1, -1)
actual_ratings_array = actual_ratings_array.reshape(1, -1)

#Calculating cosine similarity
similarity_score = cosine_similarity(predicted_ratings_array, actual_ratings_array)

In [14]:
print(f"Cosine Similarity Score: {similarity_score[0, 0]}")

Cosine Similarity Score: 0.946365684726501


# Making Recommendations for a specific user

In [15]:
user_id = 'vI4vyi1dfG93oAiSRFDymA'
user_recommendations = []

In [16]:
data_loading = Dataset.load_from_df(data[['user_id', 'business_id', 'stars']], reader)

#Fetching businesses that the user has not rated yet
rated_items = data_loading.df[data_loading.df['user_id'] == user_id]['business_id']
user_recommendations = []

for business_id in data_loading.df['business_id'].unique():
    if business_id not in rated_items.values:
        predicted_rating = model.predict(user_id, business_id).est
        user_recommendations.append((business_id, predicted_rating))

#Sorting the recommendations by predicted rating in descending order
user_recommendations.sort(key=lambda x: x[1], reverse=True)

#Displaying top 20 recommendations for the selected user
top_n = 20
print(f"Top {top_n} Recommendations for User {user_id}:")
for idx, (business_id, predicted_rating) in enumerate(user_recommendations[:top_n], 1):
    #Looking up 'name' from the 'data' DataFrame
    business_name = data[data['business_id'] == business_id]['name'].values[0]
    print(f"{idx}. Business ID: {business_id}, Business Name: {business_name}, Predicted Rating: {predicted_rating}")

Top 20 Recommendations for User vI4vyi1dfG93oAiSRFDymA:
1. Business ID: AzseSGgDC6bVtMPEYo1CNQ, Business Name: Creole Creamery, Predicted Rating: 4.554330850060632
2. Business ID: 2KIDQyTh-HzLxOUEDqtDBg, Business Name: Mazzaro's Italian Market, Predicted Rating: 4.442104240941289
3. Business ID: TV81bpCQ6p6o4Hau5hk-zw, Business Name: Hellas Restaurant, Predicted Rating: 4.4377739087399215
4. Business ID: 8uF-bhJFgT4Tn6DTb27viA, Business Name: District Donuts Sliders Brew, Predicted Rating: 4.424515531047731
5. Business ID: qjGS_7iaQDpbVhS6W8qkHQ, Business Name: The Sweet Life Bakeshop, Predicted Rating: 4.413169869405828
6. Business ID: vN6v8m4DO45Z4pp8yxxF_w, Business Name: Surrey's Café & Juice Bar, Predicted Rating: 4.3933199437755315
7. Business ID: jQBPO3rYkNwIaOdQS5ktgQ, Business Name: The Fountain On Locust, Predicted Rating: 4.390324696999404
8. Business ID: bjsBMTS4RD7Bs35ugv_fPA, Business Name: The Eagle Inn, Predicted Rating: 4.38798588479284
9. Business ID: gmjsEdUsKpj9Xxu6