In [0]:
pip install nltk


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import cross_validate as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

In [0]:
def _safe_divide(numerator, weights):
    """Divide ``numerator`` by ``weights`` (broadcast), yielding 0 where the weight is 0."""
    numerator = np.asarray(numerator, dtype=float)
    result = np.zeros(numerator.shape, dtype=float)
    # `where` skips the zero-weight positions, so they keep the 0.0 from `result`
    # instead of becoming NaN/inf (which would later break the error metrics).
    np.divide(numerator, weights, out=result, where=weights != 0)
    return result


def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix with memory-based collaborative filtering.

    Parameters
    ----------
    ratings : array-like, shape (n_users, n_items)
        Rating matrix; anything ``np.asarray`` accepts.
    similarity : array-like
        Square weight matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'``.
    type : {'user', 'item'}
        Which neighbourhood to use. (The name shadows the builtin but is kept
        because callers pass it by keyword.)

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        'user': each user's mean rating plus the similarity-weighted average of
        the other users' mean-centred ratings.
        'item': similarity-weighted average of the user's ratings.
        Rows/columns whose total absolute similarity is 0 get a zero
        weighted term instead of NaN.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    ratings = np.asarray(ratings)
    similarity = np.asarray(similarity)

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        ratings_diff = ratings - mean_user_rating
        # One normaliser per user (column vector so it broadcasts across items).
        weights = np.abs(similarity).sum(axis=1, keepdims=True)
        return mean_user_rating + _safe_divide(similarity.dot(ratings_diff), weights)

    if type == 'item':
        # One normaliser per item (row vector so it broadcasts across users).
        weights = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return _safe_divide(ratings.dot(similarity), weights)

    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

def rmse(prediction, ground_truth):
    """Root-mean-square error, evaluated only where ``ground_truth`` is non-zero.

    Zero entries of ``ground_truth`` are treated as "no rating" and are left
    out of the comparison on both sides.
    """
    rated = ground_truth.nonzero()
    predicted_values = prediction[rated].flatten()
    actual_values = ground_truth[rated].flatten()
    squared_error = mean_squared_error(predicted_values, actual_values)
    return sqrt(squared_error)

def mae(prediction, ground_truth):
    """Mean absolute error, evaluated only where ``ground_truth`` is non-zero.

    Zero entries of ``ground_truth`` are treated as "no rating" and are left
    out of the comparison on both sides.
    """
    rated = ground_truth.nonzero()
    predicted_values = prediction[rated].flatten()
    actual_values = ground_truth[rated].flatten()
    return mean_absolute_error(predicted_values, actual_values)


def _build_rating_matrix(frame, shape):
    """Return a dense (n_users, n_restaurants) matrix of stars; 0 means 'no rating'."""
    matrix = np.zeros(shape)
    # Columns are addressed by name, so the frame's column order does not matter.
    # For a repeated (user, business) pair the last review wins, as in a row loop.
    matrix[frame['user_id'].to_numpy(), frame['business_id'].to_numpy()] = frame['stars'].to_numpy()
    return matrix


def collaborativeFiltering(reviews, reviews_source, random_state=None):
    """Train and evaluate user- and item-based collaborative filtering on reviews.

    Steps: keep the first 20000 reviews, strip two characters from each end of
    ``text``, undersample to at most 500 reviews per star rating, factorize
    user/business ids to integer codes, split 80/20 into train/test, build
    user-item matrices, compute cosine similarities on the training matrix,
    predict ratings, and print RMSE/MAE on both splits.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain the columns ``text``, ``stars``, ``user_id`` and
        ``business_id``. The caller's frame is not modified.
    reviews_source : str
        Only used to pick the label printed with the metrics.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default (None) keeps the
        previous unseeded behaviour; pass an int for reproducible metrics.

    Returns
    -------
    user_prediction : numpy.ndarray, shape (n_users, n_restaurants)
        User-based predicted ratings.
    y : pandas.DataFrame
        One row per sampled review with the original ``business_id`` and its
        integer code in ``value`` (the column index into ``user_prediction``).
    """
    from sklearn.model_selection import train_test_split

    # Explicit copy: assigning columns on the result of head() is a chained
    # assignment (SettingWithCopyWarning) and must never leak to the caller.
    reviews = reviews.head(20000).copy()
    reviews['text'] = reviews['text'].str[2:-2]

    print("Undersampling of the dataset started--------")

    # Undersample to a balanced dataset: at most 500 reviews per star value.
    frames = [reviews[reviews['stars'] == star][0:500] for star in (1, 2, 3, 4, 5)]
    reviews = pd.concat(frames)

    print("Undersampling of the dataset completed--------")

    # Keep the original business ids (copied before they are overwritten below)
    # next to their integer codes so callers can translate columns back to ids.
    y = reviews[['business_id']].copy()
    reviews['user_id'] = pd.factorize(reviews.user_id)[0]
    reviews['business_id'] = pd.factorize(reviews.business_id)[0]
    y["value"] = reviews['business_id']

    unique_users = reviews.user_id.unique().shape[0]
    unique_restaurants = reviews.business_id.unique().shape[0]
    matrix_shape = (unique_users, unique_restaurants)

    train_data, test_data = train_test_split(reviews, test_size=0.20, random_state=random_state)

    print("Creation of user-item matrix started--------")

    train_data_matrix = _build_rating_matrix(train_data, matrix_shape)
    test_data_matrix = _build_rating_matrix(test_data, matrix_shape)

    print("Creation of user-item matrix completed--------")

    print("Creation of similarity matrix started--------")

    # pairwise_distances(metric='cosine') returns cosine *distance*
    # (1 - cosine similarity). predict() uses the matrix as weights, so feeding
    # it distances would give the most dissimilar neighbours the largest weight.
    user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
    item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')

    print("Creation of similarity matrix completed--------")

    print("Creation of prediction matrix started--------")

    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')

    print("Creation of prediction matrix completed--------")

    print('Printing the RMSE and MAE------------' + '\n')

    if reviews_source == 'reviews_restaurants_text.csv':
        rating_type = 'biased rating'
    elif reviews_source == 'reviews_restaurants_text_LinearSVM.csv':
        rating_type = 'unbiased rating from Linear SVM'
    else:
        rating_type = 'unbiased rating from Naive Bayes'

    # Same report as before: RMSE (test, train), then MAE (test, train).
    metrics = (('Root Mean Square Error', 'RMSE', rmse),
               ('Mean Absolute Error', 'MAE', mae))
    splits = (('testing', test_data_matrix), ('training', train_data_matrix))
    for metric_name, metric_tag, metric_fn in metrics:
        for phase, truth in splits:
            print(metric_name + ' while ' + phase + ' the model using ' + rating_type)
            print('User-based CF ' + metric_tag + ': ' + str(metric_fn(user_prediction, truth)))
            print('Item-based CF ' + metric_tag + ': ' + str(metric_fn(item_prediction, truth)) + '\n')

    return user_prediction, y

In [0]:
# Load the shortened review dataset, keep only the columns the recommender
# needs, drop reviews without text, then train/evaluate collaborative filtering.
review_columns = ['review_id', 'business_id', 'business_name', 'user_id', 'text', 'stars']
df4 = pd.read_csv('https://raw.githubusercontent.com/SivaSaiRam143/BDA_PROJECT/main/shortened.csv')
df4 = df4[review_columns].dropna(subset=['text'])

# x: user-based prediction matrix; y: business_id <-> column-code lookup.
x, y = collaborativeFiltering(df4, 'reviews_restaurants_text.csv')

In [0]:
# Preview the first rows of the filtered review frame.
# NOTE(review): the saved output below shows an 'Unnamed: 0' column and no
# 'business_name', which does not match the columns selected when df4 was
# built — it looks stale; re-run the notebook to refresh it.
df4.head()

Unnamed: 0,review_id,business_id,user_id,text,stars
0,fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,rLtl8ZkDX5vH5nAx9C3q5Q,wife took me here on my birthday for breakfas...,5
1,IjZ33sJrzXqU-0X6U8NwyA,ZRJwVLyzEJq1VAihDhYiow,0a2KyEL0d3Yb1V6aivbIuQ,have no idea why some people give bad reviews ...,5
2,IESLBzqUCLdSzSqm0eCSxQ,6oRAC4uyJCsJl1X0WZpVSA,0hT2KtfLiobPvh6cDC8JQg,ve the gyro plate. Rice is so good and I also ...,4
3,G-WvGaISbqqaMHlNnByodA,_1QQZuf4zZOyFCvXc0o6Vg,uZetl9T0NcROGOyFfughhg,"sie, Dakota, and I LOVE Chaparral Dog Park!!! ...",5
4,1uJFq2r5QfJG_6ExMRCaGw,6ozycU1RpktNG2-1BroVtw,vYmM4KTsC8ZfQBg-j5MWkw,neral Manager Scott Petello is a good egg!!! N...,5


In [0]:
# Recommend restaurants for one user: row 2 of the user-based prediction matrix.
#
# The ranking is computed on copies. The previous hand-written bubble sort
# swapped entries of x[2] in place, which scrambled that row of the prediction
# matrix and made a second run of this cell return columns 0..9 regardless of
# score. A stable descending argsort gives the same order (ties keep their
# original column order) without touching x.
scores = x[2]
order = np.argsort(-scores, kind='stable')
row = scores[order]          # scores sorted high -> low (a copy, x is untouched)
col = order[:10].tolist()    # column codes of the 10 highest predictions

# Translate column codes back to the original business ids, best first.
# y has one row per review, so an id can appear more than once.
ans = [business_id
       for code in col
       for business_id in y.loc[y['value'] == code, 'business_id']]
res = np.array(ans)

# Look up the display names of the recommended businesses.
matches = (df4.loc[df4['business_id'].isin(ans), ['business_id', 'business_name']]
           .drop_duplicates())
ansset = set(matches['business_name'])

# Print in rank order so the output is deterministic (a set has no order).
rank_of = {business_id: position
           for position, business_id in enumerate(dict.fromkeys(ans))}
ranked_names = (matches
                .assign(position=matches['business_id'].map(rank_of))
                .sort_values('position', kind='stable')['business_name'])

print("Here are your top 10 recommended restaurants using collaborative filtering")
for name in dict.fromkeys(ranked_names):
    print(name)

In [0]:
# Load the review set whose file name marks it as the Linear-SVM "unbiased"
# variant and preview it.
df3=pd.read_csv('https://raw.githubusercontent.com/SivaSaiRam143/BDA_PROJECT/main/reviews_restaurants_text_unbiased_svm.csv')
df3.head()
# Disabled run on the SVM-rated data. NOTE(review): enabling it would overwrite
# the x and y produced by the earlier collaborativeFiltering call.
#x,y=collaborativeFiltering(df3,'reviews_restaurants_text_LinearSVM.csv')

Unnamed: 0,review_id,business_id,user_id,text,stars
0,fWKvX83p0-ka4JS3dc6E5A,9yKzy9PApeiPPOUJEtnvkg,rLtl8ZkDX5vH5nAx9C3q5Q,My wife took me here on my birthday for breakf...,5
1,IjZ33sJrzXqU-0X6U8NwyA,ZRJwVLyzEJq1VAihDhYiow,0a2KyEL0d3Yb1V6aivbIuQ,I have no idea why some people give bad review...,5
2,IESLBzqUCLdSzSqm0eCSxQ,6oRAC4uyJCsJl1X0WZpVSA,0hT2KtfLiobPvh6cDC8JQg,love the gyro plate. Rice is so good and I als...,4
3,G-WvGaISbqqaMHlNnByodA,_1QQZuf4zZOyFCvXc0o6Vg,uZetl9T0NcROGOyFfughhg,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,1uJFq2r5QfJG_6ExMRCaGw,6ozycU1RpktNG2-1BroVtw,vYmM4KTsC8ZfQBg-j5MWkw,General Manager Scott Petello is a good egg!!!...,5
