### Import required libraries

In [1]:
import gzip
import math
import random
from collections import defaultdict

### Input File

In [2]:
##### Select the directory and file name of your input file #####
#directory = "/home/sourav/"

directory = "C:/Users/soura/OneDrive/Documents/UCSD/CSE 258/"

path = directory + "goodreads_reviews_comics_graphic.json.gz"

##### Read the data into a list, one review record per line #####
# The `with` block guarantees the gzip handle is closed, even if a line
# fails to parse part-way through the file.
# SECURITY NOTE(review): eval() executes whatever Python expression each line
# contains, so only run this cell on a trusted copy of the dataset. If every
# line is strict JSON, json.loads(line) would be the safe replacement -- TODO
# confirm before switching, since the two parsers differ on some escapes.
dataset = []
with gzip.open(path, 'rt', encoding="utf8") as review_file:
    for line in review_file:
        dataset.append(eval(line))

In [3]:
#### Inspect one raw review record (index 1) to see which fields are available ####
dataset[1]

{'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
 'book_id': '6315584',
 'review_id': '72f1229aba5a88f9e72f0dcdc007dd22',
 'rating': 4,
 'review_text': "I've never really liked Spider-Man. I am, however, a huge fan of the Dresden Files. Jim Butcher is clever and sarcastic and probably the perfect choice to pen a superhero novel. I really enjoyed this book!",
 'date_added': 'Wed Aug 10 06:06:48 -0700 2016',
 'date_updated': 'Fri Aug 12 08:49:54 -0700 2016',
 'read_at': 'Fri Aug 12 08:49:54 -0700 2016',
 'started_at': 'Wed Aug 10 00:00:00 -0700 2016',
 'n_votes': 0,
 'n_comments': 0}

### Extract useful data structures

In [4]:
# Lookup tables, all filled in a single pass over the reviews.
users_per_book = defaultdict(set)  # book_id -> set of user_ids who rated that book
books_per_user = defaultdict(set)  # user_id -> set of book_ids that user rated
rating_dict = {}  # (user_id, book_id) -> rating; a repeated pair keeps the last rating seen

for review in dataset:
    user = review['user_id']
    item = review['book_id']
    rating_dict[(user, item)] = review['rating']
    books_per_user[user].add(item)
    users_per_book[item].add(user)

### Average rating calculation 

In [5]:
#### mean rating each user gave, taken over the books that user rated ####
user_averages = {
    user: sum([rating_dict[(user, book)] for book in books]) / len(books)
    for user, books in books_per_user.items()
}

#### mean rating each book received, taken over the users who rated it ####
book_averages = {
    book: sum([rating_dict[(user, book)] for user in raters]) / len(raters)
    for book, raters in users_per_book.items()
}

### Similarity Metrics

In [6]:
#### define jaccard similarity ####
def jaccard_sim(s1, s2):
    """Return the Jaccard similarity of two sets.

    The result is the size of the intersection divided by the size of the
    union. When the union is empty (both inputs empty) the function
    returns 0 instead of dividing by zero.
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    return len(shared) / len(combined) if combined else 0

#### define cosine similarity ####
def cosine_sim(i1, i2):
    """Return the cosine similarity between the rating vectors of two books.

    The dot product runs over users who rated both books; each norm runs
    over every user who rated the respective book. Ratings are read from
    the module-level `rating_dict` and rater sets from `users_per_book`.
    Returns 0 when either norm is zero.
    """
    raters1 = users_per_book[i1]
    raters2 = users_per_book[i2]

    # Dot product: only co-raters contribute a non-zero term.
    dot = 0
    for u in raters1.intersection(raters2):
        dot += rating_dict[(u, i1)] * rating_dict[(u, i2)]

    # Squared norms over each book's full set of raters.
    sq_norm1 = 0
    for u in raters1:
        sq_norm1 += rating_dict[(u, i1)] ** 2
    sq_norm2 = 0
    for u in raters2:
        sq_norm2 += rating_dict[(u, i2)] ** 2

    norm_product = math.sqrt(sq_norm1) * math.sqrt(sq_norm2)
    if norm_product == 0:
        return 0
    return dot / norm_product

#### define pearson similarity ####
def pearson_sim(i1, i2):
    """Return a Pearson-style similarity between two books.

    Each rating is centred on its book's mean (from `book_averages`). The
    numerator sums the product of centred ratings over users who rated
    both books; each denominator term sums squared centred ratings over
    every user who rated the respective book. Returns 0 when the
    denominator is zero.

    NOTE(review): the denominators cover all raters of each book, not only
    the co-raters used in the numerator -- confirm this variant is intended.
    """
    mean1 = book_averages[i1]
    mean2 = book_averages[i2]
    raters1 = users_per_book[i1]
    raters2 = users_per_book[i2]

    # Covariance-like term over co-raters only.
    covariance = 0
    for u in raters1.intersection(raters2):
        covariance += (rating_dict[(u, i1)] - mean1) * (rating_dict[(u, i2)] - mean2)

    # Spread of each book's ratings around its own mean, over all its raters.
    spread1 = 0
    for u in raters1:
        spread1 += (rating_dict[(u, i1)] - mean1) ** 2
    spread2 = 0
    for u in raters2:
        spread2 += (rating_dict[(u, i2)] - mean2) ** 2

    scale = math.sqrt(spread1) * math.sqrt(spread2)
    if scale == 0:
        return 0
    return covariance / scale


### Function to find the books most similar to a given book

In [7]:
#### using jaccard similarity ####
def most_similar(i, N):
    """Return the N books most similar to book `i` by Jaccard similarity.

    Candidates are the books rated by at least one user who also rated
    `i`; `i` itself is excluded. Similarity is the Jaccard similarity of
    the two books' rater sets.

    Parameters:
        i: id of the query book (a key of `users_per_book`).
        N: number of results to keep.

    Returns:
        A list of `(similarity, book_id)` tuples sorted in descending
        order (ties fall back to descending book id), cut to `similar[:N]`.
    """
    users = users_per_book[i]

    # Grow the candidate set in place. Rebinding it to `union(...)` on every
    # iteration copies the whole accumulated set once per user.
    candidates = set()
    for u in users:
        candidates.update(books_per_user[u])
    candidates.discard(i)  # a book is never reported as similar to itself

    similar = [(jaccard_sim(users, users_per_book[i2]), i2) for i2 in candidates]
    similar.sort(reverse=True)
    return similar[:N]

### Rating prediction based on similarity

In [8]:
# Global mean rating over every review in the dataset
average_rating = sum([review['rating'] for review in dataset]) / len(dataset)

In [9]:
# Observed ratings, in dataset order, used as the targets when scoring predictions
labels = [review['rating'] for review in dataset]

In [10]:
# Full review records grouped two ways, preserving dataset order within each group
detailsPerUser = defaultdict(list)  # user_id -> list of that user's review dicts
detailsPerItem = defaultdict(list)  # book_id -> list of review dicts for that book

for review in dataset:
    detailsPerUser[review['user_id']].append(review)
    detailsPerItem[review['book_id']].append(review)

In [11]:
def predict_rating(user,book):
    """Predict the rating `user` would give `book` from the user's other reviews.

    For every other book the user reviewed, the deviation of the user's
    rating from that book's mean is weighted by the Jaccard similarity
    between that book's raters and `book`'s raters. The weighted mean
    deviation is added to `book`'s own mean rating. If the similarities
    sum to nothing positive (including when the user has no other
    reviews), the book's mean rating is returned unchanged.
    """
    deviations = []
    weights = []
    for review in detailsPerUser[user]:
        other = review['book_id']
        if other != book:
            # How far above/below the other book's mean this user rated it.
            deviations.append(review['rating'] - book_averages[other])
            weights.append(jaccard_sim(users_per_book[book], users_per_book[other]))

    total_weight = sum(weights)
    if total_weight > 0:
        weighted_sum = sum([dev * w for dev, w in zip(deviations, weights)])
        return book_averages[book] + weighted_sum / total_weight
    # No usable neighbours: fall back to the book's own average.
    return book_averages[book]

### Function to calculate mean squared error

In [12]:
def mse(prediction, actual):
    """Return the mean squared error between predictions and actual values.

    Parameters:
        prediction: iterable of predicted numeric values.
        actual: iterable of observed numeric values, aligned with `prediction`.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: if the two inputs differ in length (zip would silently
            drop the unmatched tail and report a misleading error), or if
            they are empty (the mean is undefined).
    """
    # Materialise once so lengths can be checked, even for generators.
    prediction = list(prediction)
    actual = list(actual)
    if len(prediction) != len(actual):
        raise ValueError(
            f"mse() length mismatch: {len(prediction)} predictions "
            f"vs {len(actual)} actual values"
        )
    if not prediction:
        raise ValueError("mse() requires at least one prediction/actual pair")
    se = [(x - y) ** 2 for x, y in zip(prediction, actual)]
    return sum(se) / len(se)

In [13]:
# Baseline: predict the global mean rating for every review
trivial_prediction = [average_rating] * len(dataset)

In [14]:
# Similarity-weighted prediction for each review's (user, book) pair, in dataset order.
# NOTE(review): these are in-sample predictions -- the same reviews were used to
# build the book averages and rater sets that predict_rating reads.
simPredictions = [
    predict_rating(review['user_id'], review['book_id'])
    for review in dataset
]

In [15]:
# Score the global-mean baseline first, then the similarity-based predictor.
trivial_mse = mse(trivial_prediction, labels)
print(f'The MSE of trivial prediction: {trivial_mse}\n')

similarity_mse = mse(simPredictions, labels)
print(f'The MSE of prediction based on item : {similarity_mse}')

The MSE of trivial prediction: 1.3309277970868938

The MSE of prediction based on item : 0.7908367015187353
