In [1]:
# Filter the big 9GB json and extract book names and other metadata + filter and deduplicate values

In [2]:
from IPython.display import display
import gzip
import json
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from math import sqrt
import sys, os
from contextlib import contextmanager

In [3]:
with gzip.open("inputs/goodreads_books.json.gz") as f:
    line = f.readline()

In [4]:
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [5]:
def get_needed_fields(json_line):
    return {
        "book_id": json_line["book_id"],
        "title": json_line["title"],
        "ratings_count": json_line["ratings_count"],
        "url": json_line["url"],
        "image_url": json_line["image_url"],
        "average_rating": json_line["average_rating"],
        "authors": json_line["authors"],
        "publisher": json_line["publisher"]
    }

In [6]:
get_needed_fields(json.loads(line))

{'book_id': '5333265',
 'title': 'W.C. Fields: A Life on Film',
 'ratings_count': '3',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'average_rating': '4.00',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press"}

In [7]:
parsed_books = []
with gzip.open("inputs/goodreads_books.json.gz", 'r') as f:
    while line := f.readline():
        needed_fields = get_needed_fields(json.loads(line))
        try:
            ratings_count = int(needed_fields["ratings_count"])
        except ValueError:
            continue
        if ratings_count > 1000:
            parsed_books.append(needed_fields)

In [8]:
# parsed_books = pd.read_json("inputs/goodreads_books.json.gz", lines=True)

In [9]:
books = pd.DataFrame.from_dict(parsed_books)

In [10]:
books["ratings_count"] = pd.to_numeric(books["ratings_count"])

In [11]:
books["title"] = books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [12]:
books["title"] = books["title"].str.lower()

In [13]:
books["title"] = books["title"].str.replace("\s+", " ", regex=True)

In [14]:
books = books[books["title"].str.len() > 0]

In [15]:
# parsed_books = parsed_books[parsed_books["ratings_count"] > 1000]

In [16]:
books.to_json("inputs/parsed_booksv2.json")

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(books["title"])

In [18]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = books.iloc[indices]
    results = results.sort_values("ratings_count", ascending=False)
    
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [19]:
search("the empty chair", vectorizer)

Unnamed: 0,book_id,title,ratings_count,url,image_url,average_rating,authors,publisher
47177,6976,the mermaid chair,63365,Goodreads,https://images.gr-assets.com/books/1388259308m/6976.jpg,3.1,"[{'author_id': '4711', 'role': ''}]",Penguin Books
45089,835202,a chair for my mother,20810,Goodreads,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,4.2,"[{'author_id': '48969', 'role': ''}]",Greenwillow Books
24042,142540,the empty chair lincoln rhyme 3,17048,Goodreads,https://images.gr-assets.com/books/1336331745m/142540.jpg,4.06,"[{'author_id': '1612', 'role': ''}]",Simon & Schuster
55597,7957361,half empty,4910,Goodreads,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,3.79,"[{'author_id': '5855', 'role': ''}]",
36892,7883678,empty,3659,Goodreads,https://images.gr-assets.com/books/1328837468m/7883678.jpg,3.28,"[{'author_id': '99836', 'role': ''}]",Scholastic Press


In [20]:
my_liked_ids = ["16096824", "17927395", "23766634", "10429045", "13104080", "13188676", "17331518", "7896527", "17167166", "20613470", "18006496", "28260587", "31450852", "227729", "13415554", "13419891","18333581", "13565676", "29008738", "395922", "395851", "1169808", "395875", "2373", "142540"]

In [21]:
liked_books = books[books['book_id'].isin(my_liked_ids)]

In [22]:
my_liked_books = liked_books[['book_id','title']].copy()

In [23]:
my_ratings=['5','3','3','4','5','5','5','4','4','5','4','5','5','3','5','5','4','4','5','5','4','5','5','5','5']

In [24]:
my_liked_books = my_liked_books.assign(rating=my_ratings)

In [25]:
my_liked_books = my_liked_books.assign(user_id='-1')

In [26]:
my_liked_books = my_liked_books[['user_id', 'book_id','rating','title']]

In [27]:
my_liked_books

Unnamed: 0,user_id,book_id,rating,title
6918,-1,13104080,5,unravel me shatter me 2
9675,-1,395875,3,dark reunion the vampire diaries 4
10817,-1,1169808,3,the fury the vampire diaries 3
15165,-1,10429045,4,shatter me shatter me 1
15749,-1,13565676,5,the assassin and the empire throne of glass 05
16027,-1,23766634,5,a court of wings and ruin a court of thorns an...
19845,-1,13419891,5,the assassin and the desert throne of glass 03
24042,-1,142540,4,the empty chair lincoln rhyme 3
30383,-1,395922,4,the awakening the vampire diaries 1
31585,-1,20613470,5,heir of fire throne of glass 3


In [28]:
#map book_id_map to goodreads_interactions

In [29]:
csv_book_mapping = {}
with open("inputs/book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [30]:
len(csv_book_mapping)

2360651

In [31]:
book_set = set(my_liked_books["book_id"])

In [32]:
users = {}

with open("inputs/goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")

        book_id = csv_book_mapping.get(csv_id)
        if book_id in book_set:
            if user_id not in users:
                users[user_id] = 1
            else:
                users[user_id] +=1 

In [33]:
len(users)

138882

In [34]:
#my_liked_books.shape[0]/5 = 5. verifica ca users[k]>5 adica ca fiecare user a citit macar 5 carti din cele 25 citite de mine
filtered_users = set([k for k in users if users[k] > my_liked_books.shape[0]/1.5])

In [35]:
len(filtered_users)

624

In [36]:
my_liked_books.shape[1]

4

In [37]:
interactions_lists = []

with open("inputs/goodreads_interactions.csv") as f:
    while True:
        line = f.readline()
        if not line:
            break

        user_id, csv_id, _, rating, _ = line.strip().split(",")
        if user_id in filtered_users:
            book_id = csv_book_mapping.get(csv_id)
            interactions_lists.append([user_id, book_id, rating])

In [38]:
#create user-book matrix

In [39]:
len(interactions_lists)

2878507

In [40]:
interactions_lists[0]

['520', '13609836', '2']

In [41]:
interactions = pd.DataFrame(interactions_lists, columns=["user_id", "book_id", "rating"])

In [42]:
interactions = pd.concat([my_liked_books[["user_id","book_id", "rating"]], interactions])

In [43]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [44]:
#to help match the rows from the json file to the matrix

In [45]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes+1

In [46]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes+1

In [None]:
df.loc[df['column_name'] == some_value]

In [67]:
interactions.loc[interactions['book_id'] == '16096824']

Unnamed: 0,user_id,book_id,rating,user_index,book_index
21,-1,16096824,5,1,113136
853,520,16096824,5,561,113136
4081,1033,16096824,4,4,113136
8905,2614,16096824,0,265,113136
10408,3556,16096824,4,431,113136
...,...,...,...,...,...
2839593,439717,16096824,5,542,113136
2848061,440351,16096824,0,543,113136
2850690,440975,16096824,0,544,113136
2876629,441524,16096824,4,545,113136


In [48]:
interactions = interactions.reset_index(drop=True)

In [49]:
interactions_matrix = interactions.pivot_table(index='user_index', columns='book_index', values='rating')

In [50]:
interactions_matrix

book_index,1,2,3,4,5,6,7,8,9,10,...,572373,572374,572375,572376,572377,572378,572379,572380,572381,572382
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,0.0,,,,,,,,,...,,0.0,0.0,,,,,,,
3,5.0,,,,,,,,,,...,,,,0.0,,,,,,
4,,,,,,,,,,0.0,...,,,0.0,,,,,,,
5,5.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,5.0,,,,,,,,,,...,,,,,,,,,,
622,4.0,,,,,,,,,,...,,,,,,,,,,
623,5.0,,,,,,,,,,...,,,,,,,,,,
624,,,,,,,,,,,...,,,,,,,,,,


In [51]:
#get cosine similarities for ratings matrix interactions_matrix; pairwise_distances returns the distances between ratings and hence
#similarities are obtained by subtracting distances from 1
cosine_sim = 1-pairwise_distances(interactions_matrix.fillna(0), metric="cosine")

In [52]:
#Cosine similarity matrix
cosine_sim_df=pd.DataFrame(cosine_sim)

In [53]:
cosine_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,615,616,617,618,619,620,621,622,623,624
0,1.000000,0.028031,0.140613,0.148369,0.169798,0.161801,0.174414,0.071020,0.011844,0.231132,...,0.216282,0.165253,0.061010,0.060421,0.158937,0.113471,0.086187,0.081753,0.309183,0.174019
1,0.028031,1.000000,0.069610,0.041810,0.051727,0.052986,0.047060,0.036836,0.016071,0.063194,...,0.057706,0.031135,0.021826,0.050139,0.002908,0.088205,0.078494,0.036614,0.015152,0.036336
2,0.140613,0.069610,1.000000,0.171422,0.186417,0.186236,0.191410,0.109218,0.023511,0.158535,...,0.239449,0.137334,0.106488,0.087933,0.117226,0.160828,0.117171,0.067216,0.128648,0.128187
3,0.148369,0.041810,0.171422,1.000000,0.237509,0.200608,0.234102,0.146536,0.017158,0.157850,...,0.222253,0.138340,0.124092,0.104781,0.135947,0.206992,0.116710,0.079284,0.136243,0.126502
4,0.169798,0.051727,0.186417,0.237509,1.000000,0.242433,0.239076,0.149623,0.015667,0.186489,...,0.226084,0.185871,0.124256,0.102443,0.122994,0.192400,0.088177,0.103865,0.179000,0.142771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,0.113471,0.088205,0.160828,0.206992,0.192400,0.183386,0.196328,0.113440,0.016332,0.118308,...,0.175381,0.105526,0.092207,0.087634,0.084459,1.000000,0.131420,0.072406,0.116316,0.113017
621,0.086187,0.078494,0.117171,0.116710,0.088177,0.113276,0.184603,0.074851,0.019751,0.075114,...,0.215881,0.076925,0.078757,0.059315,0.064632,0.131420,1.000000,0.050630,0.153854,0.099312
622,0.081753,0.036614,0.067216,0.079284,0.103865,0.048108,0.069383,0.070519,0.011777,0.116238,...,0.069168,0.071469,0.045922,0.049742,0.044662,0.072406,0.050630,1.000000,0.066399,0.087579
623,0.309183,0.015152,0.128648,0.136243,0.179000,0.181092,0.218471,0.103581,0.009355,0.189389,...,0.229763,0.128992,0.055464,0.057007,0.153566,0.116316,0.153854,0.066399,1.000000,0.197796


In [54]:
pearson_sim = 1-pairwise_distances(interactions_matrix.fillna(0), metric="correlation")

In [55]:
#Pearson correlation similarity matrix
pearson_sim_df = pd.DataFrame(pearson_sim)

In [56]:
print(pearson_sim_df.iloc[:][409])

0      0.443939
1      0.022675
2      0.175536
3      0.209290
4      0.263148
         ...   
620    0.164154
621    0.098882
622    0.068447
623    0.336929
624    0.207614
Name: 409, Length: 625, dtype: float64


In [57]:
pearson_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,615,616,617,618,619,620,621,622,623,624
0,1.000000,0.027845,0.140475,0.148267,0.169683,0.161689,0.174303,0.070889,0.011453,0.231058,...,0.216196,0.165145,0.060802,0.060134,0.158850,0.113297,0.086049,0.081575,0.309135,0.173917
1,0.027845,1.000000,0.068637,0.041193,0.050882,0.052233,0.046249,0.036156,0.014171,0.062693,...,0.057086,0.030407,0.020654,0.048377,0.002375,0.086999,0.077769,0.035575,0.014810,0.035648
2,0.140475,0.068637,1.000000,0.170834,0.185603,0.185513,0.190638,0.108515,0.021385,0.158045,...,0.238901,0.136610,0.105282,0.086032,0.116710,0.159577,0.116391,0.066082,0.128338,0.127491
3,0.148267,0.041193,0.170834,1.000000,0.237044,0.200169,0.233654,0.146117,0.015847,0.157535,...,0.221895,0.137891,0.123383,0.103697,0.135627,0.206310,0.116227,0.078601,0.136040,0.126068
4,0.169683,0.050882,0.185603,0.237044,1.000000,0.241856,0.238455,0.149048,0.013843,0.186080,...,0.225600,0.185286,0.123255,0.100877,0.122553,0.191390,0.087489,0.102939,0.178753,0.142185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,0.113297,0.086999,0.159577,0.206310,0.191390,0.182484,0.195372,0.112566,0.013602,0.117665,...,0.174643,0.104583,0.090649,0.085192,0.083786,1.000000,0.130461,0.070976,0.115934,0.112132
621,0.086049,0.077769,0.116391,0.116227,0.087489,0.112677,0.184015,0.074296,0.018151,0.074693,...,0.215444,0.076337,0.077828,0.057853,0.064209,0.130461,1.000000,0.049764,0.153619,0.098766
622,0.081575,0.035575,0.066082,0.078601,0.102939,0.047231,0.068465,0.069760,0.009554,0.115702,...,0.068460,0.070663,0.044594,0.047684,0.044076,0.070976,0.049764,1.000000,0.066039,0.086827
623,0.309135,0.014810,0.128338,0.136040,0.178753,0.180866,0.218250,0.103349,0.008645,0.189228,...,0.229583,0.128757,0.055059,0.056411,0.153399,0.115934,0.153619,0.066039,1.000000,0.197594


In [58]:
global k,metric
k=10
metric='cosine'

In [59]:
#This function finds k similar users given the user_id and ratings matrix interactions_matrix
#Note that the similarities are same as obtained via using pairwise_distances
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print ('Cei mai similari {0} utilizatori cu utilizatorul {1}:\n'.format(k,user_id))
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;

        else:
            print ('{0}: Utilizatorul {1}, cu similaritate de {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i]))
            
    return similarities,indices

In [60]:
similar_users_cosine,indices_cosine = findksimilarusers(1,interactions_matrix.fillna(0), metric='cosine',k=10)

Cei mai similari 10 utilizatori cu utilizatorul 1:

1: Utilizatorul 410, cu similaritate de 0.4439726369643511
2: Utilizatorul 310, cu similaritate de 0.4230244258711471
3: Utilizatorul 364, cu similaritate de 0.4122309025292783
4: Utilizatorul 391, cu similaritate de 0.3953625375507299
5: Utilizatorul 446, cu similaritate de 0.38355617375467377
6: Utilizatorul 337, cu similaritate de 0.3533911371445513
7: Utilizatorul 341, cu similaritate de 0.34205683295863887
8: Utilizatorul 452, cu similaritate de 0.33951818939212053
9: Utilizatorul 554, cu similaritate de 0.33702280278396846
10: Utilizatorul 469, cu similaritate de 0.31691077168911885


In [61]:
similar_users_pearson,indices_pearson = findksimilarusers(1,interactions_matrix.fillna(0), metric='correlation', k=10)

Cei mai similari 10 utilizatori cu utilizatorul 1:

1: Utilizatorul 410, cu similaritate de 0.44393870169182914
2: Utilizatorul 310, cu similaritate de 0.4229872997187325
3: Utilizatorul 364, cu similaritate de 0.41219494697723913
4: Utilizatorul 391, cu similaritate de 0.3953247583697499
5: Utilizatorul 446, cu similaritate de 0.38353387323597254
6: Utilizatorul 337, cu similaritate de 0.3533492233943646
7: Utilizatorul 341, cu similaritate de 0.34200623245971107
8: Utilizatorul 452, cu similaritate de 0.33946626785664313
9: Utilizatorul 554, cu similaritate de 0.3369732360457156
10: Utilizatorul 469, cu similaritate de 0.31685554827501416


In [62]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    prediction=0
    similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity
    mean_rating = ratings.loc[user_id,:].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_id-1]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    print ('\nRatingul prezis pentru cartea {1} oferit de utilizatorul {0} -> item {1}: {2}'.format(user_id, item_id, prediction))

    return prediction

In [68]:
predict_userbased(1,113136,interactions_matrix.fillna(0));

Cei mai similari 10 utilizatori cu utilizatorul 1:

1: Utilizatorul 410, cu similaritate de 0.4439726369643511
2: Utilizatorul 310, cu similaritate de 0.4230244258711471
3: Utilizatorul 364, cu similaritate de 0.4122309025292783
4: Utilizatorul 391, cu similaritate de 0.3953625375507299
5: Utilizatorul 446, cu similaritate de 0.38355617375467377
6: Utilizatorul 337, cu similaritate de 0.3533911371445513
7: Utilizatorul 341, cu similaritate de 0.34205683295863887
8: Utilizatorul 452, cu similaritate de 0.33951818939212053
9: Utilizatorul 554, cu similaritate de 0.33702280278396846
10: Utilizatorul 469, cu similaritate de 0.31691077168911885

Ratingul prezis pentru cartea 113136 oferit de utilizatorul 1 -> item 113136: 4


In [69]:
predict_userbased(1,113136,interactions_matrix.fillna(0), "correlation");

Cei mai similari 10 utilizatori cu utilizatorul 1:

1: Utilizatorul 410, cu similaritate de 0.44393870169182914
2: Utilizatorul 310, cu similaritate de 0.4229872997187325
3: Utilizatorul 364, cu similaritate de 0.41219494697723913
4: Utilizatorul 391, cu similaritate de 0.3953247583697499
5: Utilizatorul 446, cu similaritate de 0.38353387323597254
6: Utilizatorul 337, cu similaritate de 0.3533492233943646
7: Utilizatorul 341, cu similaritate de 0.34200623245971107
8: Utilizatorul 452, cu similaritate de 0.33946626785664313
9: Utilizatorul 554, cu similaritate de 0.3369732360457156
10: Utilizatorul 469, cu similaritate de 0.31685554827501416

Ratingul prezis pentru cartea 113136 oferit de utilizatorul 1 -> item 113136: 4


In [None]:
#item-based CF

In [71]:
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]    
    ratings=ratings.T
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[item_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print ('Cele mai similare {0} carti cu cartea {1}:\n'.format(k,item_id))
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;

        else:
            print ('{0}: Cartea cu indexul {1} : are o similaritate de {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i]))


    return similarities,indices

In [74]:
similar_item_cosine,indices_item_cosine=findksimilaritems(113136,interactions_matrix.fillna(0))

Cele mai similare 10 carti cu cartea 113136:

1: Cartea cu indexul 154848 : are o similaritate de 0.8862663565025045
2: Cartea cu indexul 128296 : are o similaritate de 0.8610350188305314
3: Cartea cu indexul 211957 : are o similaritate de 0.8471912545828775
4: Cartea cu indexul 157831 : are o similaritate de 0.8430280441577536
5: Cartea cu indexul 530716 : are o similaritate de 0.8134924332517273
6: Cartea cu indexul 279908 : are o similaritate de 0.7765236429113581
7: Cartea cu indexul 355437 : are o similaritate de 0.7625127319362
8: Cartea cu indexul 315379 : are o similaritate de 0.7272003714060193
9: Cartea cu indexul 64094 : are o similaritate de 0.7217982785637389
10: Cartea cu indexul 64369 : are o similaritate de 0.7213822978217801


In [75]:
similar_item_pearson,indices_item_pearson=findksimilaritems(113136,interactions_matrix.fillna(0), metric="correlation")

Cele mai similare 10 carti cu cartea 113136:

1: Cartea cu indexul 154848 : are o similaritate de 0.740415109425794
2: Cartea cu indexul 128296 : are o similaritate de 0.6310987495881947
3: Cartea cu indexul 157831 : are o similaritate de 0.6304691285259112
4: Cartea cu indexul 211957 : are o similaritate de 0.6227453374005739
5: Cartea cu indexul 279908 : are o similaritate de 0.5617671593151672
6: Cartea cu indexul 355437 : are o similaritate de 0.5196620231765149
7: Cartea cu indexul 530716 : are o similaritate de 0.5053643459886252
8: Cartea cu indexul 64094 : are o similaritate de 0.4369792326404307
9: Cartea cu indexul 64369 : are o similaritate de 0.43484554763285255
10: Cartea cu indexul 167712 : are o similaritate de 0.42921647733440005


In [76]:
#item 9 si 10 sunt diferiti intre cosine si pearson

In [77]:
#This function predicts the rating for specified user-item combination based on item-based approach
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;
        else:
            product = ratings.iloc[user_id-1,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    print ('\nRatingul prezis pentru cartea {1} oferit de utilizatorul {0}: {2}'.format(user_id,item_id,prediction) )     

    return prediction

In [78]:
prediction = predict_itembased(1,113136,interactions_matrix.fillna(0), metric ="cosine")

Cele mai similare 10 carti cu cartea 113136:

1: Cartea cu indexul 154848 : are o similaritate de 0.8862663565025045
2: Cartea cu indexul 128296 : are o similaritate de 0.8610350188305314
3: Cartea cu indexul 211957 : are o similaritate de 0.8471912545828775
4: Cartea cu indexul 157831 : are o similaritate de 0.8430280441577536
5: Cartea cu indexul 530716 : are o similaritate de 0.8134924332517273
6: Cartea cu indexul 279908 : are o similaritate de 0.7765236429113581
7: Cartea cu indexul 355437 : are o similaritate de 0.7625127319362
8: Cartea cu indexul 315379 : are o similaritate de 0.7272003714060193
9: Cartea cu indexul 64094 : are o similaritate de 0.7217982785637389
10: Cartea cu indexul 64369 : are o similaritate de 0.7213822978217801

Ratingul prezis pentru cartea 113136 oferit de utilizatorul 1: 5


In [79]:
prediction = predict_itembased(1,113136,interactions_matrix.fillna(0), metric ="correlation")

Cele mai similare 10 carti cu cartea 113136:

1: Cartea cu indexul 154848 : are o similaritate de 0.8862663565025045
2: Cartea cu indexul 128296 : are o similaritate de 0.8610350188305314
3: Cartea cu indexul 211957 : are o similaritate de 0.8471912545828775
4: Cartea cu indexul 157831 : are o similaritate de 0.8430280441577536
5: Cartea cu indexul 530716 : are o similaritate de 0.8134924332517273
6: Cartea cu indexul 279908 : are o similaritate de 0.7765236429113581
7: Cartea cu indexul 355437 : are o similaritate de 0.7625127319362
8: Cartea cu indexul 315379 : are o similaritate de 0.7272003714060193
9: Cartea cu indexul 64094 : are o similaritate de 0.7217982785637389
10: Cartea cu indexul 64369 : are o similaritate de 0.7213822978217801

Ratingul prezis pentru cartea 113136 oferit de utilizatorul 1: 5


In [80]:
#This function utilizes above function to recommend items for selected approach. Recommendations are made if the predicted
#rating for an item is greater than or equal to 6, and the items has not been rated already
def recommendItem(user_id, item_id, ratings):
    
    if user_id<1 or user_id>6 or type(user_id) is not int:
        print ('Userid does not exist. Enter numbers from 1-625')
    else:    
        ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)']

        approach = widgets.Dropdown(options=ids, value=ids[0],
                               description='Select Approach', width='500px')
        
        def on_change(change):
            prediction = 0
            clear_output(wait=True)
            if change['type'] == 'change' and change['name'] == 'value':            
                if (approach.value == 'User-based CF (cosine)'):
                    metric = 'cosine'
                    prediction = predict_userbased(user_id, item_id, ratings, metric)
                elif (approach.value == 'User-based CF (correlation)')  :                       
                    metric = 'correlation'               
                    prediction = predict_userbased(user_id, item_id, ratings, metric)
                elif (approach.value == 'Item-based CF (cosine)'):
                    prediction = predict_itembased(user_id, item_id, ratings)
                else:
                    prediction = predict_itembased_adjcos(user_id,item_id,ratings)

                if ratings[item_id-1][user_id] != 0: 
                    print ('Item already rated')
                else:
                    if prediction>=3:
                        print ('Carte recomandata')
                    else:
                        print ('Carte nerecomandata')

        approach.observe(on_change)
        display(approach)

In [81]:
#check for incorrect entries
recommendItem(1,113136,interactions_matrix.fillna(0))

Dropdown(description='Select Approach', options=('User-based CF (cosine)', 'User-based CF (correlation)', 'Ite…

In [82]:
#This is a quick way to temporarily suppress stdout in particular code section
@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

In [83]:
#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE
#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print 
# This method will output the following evaluation prediction metrics: RMSE, Recall, Precision, F1 Score
def evaluate_predictions(interactions, target, type, metric):
    # ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)']
    # approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')
    n_users = len(set(target["user_index"]))
    n_items = target[target["user_index"] == list(set(target["user_index"]))[0]].shape[0]
    prediction = np.zeros((n_users, n_items))
    target_prediction = np.zeros((n_users, n_items))

    i = 0
    j = 0
    with suppress_stdout():
        for user in set(target["user_index"]):
            user_target = target[target["user_index"] == user]
            
            for index, row in user_target.iterrows():
                if type == "user_based":
                    prediction[i][j] = predict_userbased(row["user_index"], row["book_index"], interactions, metric)
                else:
                     prediction[i][j] = predict_itembased(row["user_index"], row["book_index"], interactions, metric)
                target_prediction[i][j] = row["rating"]
                j += 1
                
            i += 1
    results = []        
    prediction= pd.DataFrame(prediction)
    target_prediction = pd.DataFrame(target_prediction)
    print(target_prediction)
    print(prediction)
    
    # 1. Compute RMSE
    RS = np.sqrt(np.mean((prediction-target_prediction)**2))
    print("Valoarea RMSE pentru predictia de tip {}, metrica {} is: {}".format(type, metric, RS))
    results.append(RS)

    # 2. Compute MAE
    mae = mean_absolute_error(target_prediction, prediction)
    print("Valoarea MAE pentru predictia de tip {}, metrica {} is: {}".format(type, metric, mae))
    results.append(mae)

    # 3. Compute Precision, Recall, F1 Score @ K
    TPs = 0
    FNs = 0

    # The number of relevant items are the items with actual rating greater or equal to 3.5.
    threshold = 3.5
    
    # total_predictions = target.shape[0] * target.shape[1]

    # Find the relevant items using the threshold
    relevant_items = []
    for i in range(0, target_prediction.shape[0]):
        for j in range(0, target_prediction.shape[1]):
            if target_prediction.values[i, j] > threshold:
                relevant_items.append((i, j))

    # Compute K
    k = len(relevant_items)

    # Recommended items @ k
    recommended_items_at_k = []
    for i in range(0, target_prediction.shape[0]):
        for j in range(0, target_prediction.shape[1]):
            if prediction.values[i, j] > threshold:
                recommended_items_at_k.append((i, j))

    # Recommended and Relevant items @ k (Intersection)
    recomm_and_relevant_items_at_k = list(set(relevant_items) & set(recommended_items_at_k))

    # Compute Precision @ K
    precision_at_k = len(recomm_and_relevant_items_at_k) / len(recommended_items_at_k)

    print("Precizia cu k={}, pentru predictia de tip {}, metrica {}, is: {}".format(k, type, metric, precision_at_k))

    # Compute Recall @ K
    recall_at_k = len(recomm_and_relevant_items_at_k) / len(relevant_items)

    print("Recall at k={}, pentru predictia de tip {}, metrica {}, is: {}".format(k, type, metric, recall_at_k))

    # Compute F1 score @ K
    f1_score_at_k = 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)

    print("Scorul F1 at k={}, pentru predictia de tip {}, metrica {}, is: {}".format(k, type, metric, f1_score_at_k))
            
    return RS

In [84]:
target = interactions[interactions["user_index"] == 1][["book_index", "rating", "user_index", "book_id"]]

In [85]:
# TODO: We need to run these for all types (user + item) and all metrics: cosine, correlation, etc

In [86]:
result = evaluate_predictions(interactions_matrix.fillna(0), target, "item_based", "cosine")

    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  5.0  3.0  3.0  4.0  5.0  5.0  5.0  4.0  4.0  5.0  ...  5.0  4.0  4.0  5.0  \

    19   20   21   22   23   24  
0  5.0  4.0  5.0  5.0  5.0  5.0  

[1 rows x 25 columns]
    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  4.0  1.0  1.0  4.0  4.0  5.0  4.0  1.0  1.0  5.0  ...  4.0  4.0  1.0  4.0  \

    19   20   21   22   23   24  
0  4.0  1.0  5.0  5.0  4.0  1.0  

[1 rows x 25 columns]
Valoarea RMSE pentru predictia de tip item_based, metrica cosine is: 1.8439088914585775
Valoarea MAE pentru predictia de tip item_based, metrica cosine is: 1.32
Precizia cu k=22, pentru predictia de tip item_based, metrica cosine, is: 0.9411764705882353
Recall at k=22, pentru predictia de tip item_based, metrica cosine, is: 0.7272727272727273
Scorul F1 at k=22, pentru predictia de tip item_based, metrica cosine, is: 0.8205128205128205


In [87]:
result = evaluate_predictions(interactions_matrix.fillna(0), target, "item_based", "correlation")

    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  5.0  3.0  3.0  4.0  5.0  5.0  5.0  4.0  4.0  5.0  ...  5.0  4.0  4.0  5.0  \

    19   20   21   22   23   24  
0  5.0  4.0  5.0  5.0  5.0  5.0  

[1 rows x 25 columns]
    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  4.0  1.0  1.0  4.0  4.0  5.0  4.0  1.0  1.0  5.0  ...  4.0  4.0  1.0  4.0  \

    19   20   21   22   23   24  
0  4.0  1.0  5.0  5.0  4.0  1.0  

[1 rows x 25 columns]
Valoarea RMSE pentru predictia de tip item_based, metrica correlation is: 1.8439088914585775
Valoarea MAE pentru predictia de tip item_based, metrica correlation is: 1.32
Precizia cu k=22, pentru predictia de tip item_based, metrica correlation, is: 0.9411764705882353
Recall at k=22, pentru predictia de tip item_based, metrica correlation, is: 0.7272727272727273
Scorul F1 at k=22, pentru predictia de tip item_based, metrica correlation, is: 0.8205128205128205


In [88]:
result = evaluate_predictions(interactions_matrix.fillna(0), target, "user_based", "cosine")

    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  5.0  3.0  3.0  4.0  5.0  5.0  5.0  4.0  4.0  5.0  ...  5.0  4.0  4.0  5.0  \

    19   20   21   22   23   24  
0  5.0  4.0  5.0  5.0  5.0  5.0  

[1 rows x 25 columns]
    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  4.0  0.0  0.0  4.0  3.0  3.0  3.0  0.0  0.0  5.0  ...  4.0  1.0  0.0  4.0  \

    19   20   21   22   23   24  
0  3.0  0.0  4.0  5.0  5.0  0.0  

[1 rows x 25 columns]
Valoarea RMSE pentru predictia de tip user_based, metrica cosine is: 2.4979991993593593
Valoarea MAE pentru predictia de tip user_based, metrica cosine is: 1.92
Precizia cu k=22, pentru predictia de tip user_based, metrica cosine, is: 1.0
Recall at k=22, pentru predictia de tip user_based, metrica cosine, is: 0.45454545454545453
Scorul F1 at k=22, pentru predictia de tip user_based, metrica cosine, is: 0.625


In [89]:
result = evaluate_predictions(interactions_matrix.fillna(0), target, "user_based", "correlation")

    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  5.0  3.0  3.0  4.0  5.0  5.0  5.0  4.0  4.0  5.0  ...  5.0  4.0  4.0  5.0  \

    19   20   21   22   23   24  
0  5.0  4.0  5.0  5.0  5.0  5.0  

[1 rows x 25 columns]
    0    1    2    3    4    5    6    7    8    9   ...   15   16   17   18   
0  4.0  0.0  0.0  4.0  3.0  3.0  3.0  0.0  0.0  5.0  ...  4.0  1.0  0.0  4.0  \

    19   20   21   22   23   24  
0  3.0  0.0  4.0  5.0  5.0  0.0  

[1 rows x 25 columns]
Valoarea RMSE pentru predictia de tip user_based, metrica correlation is: 2.4979991993593593
Valoarea MAE pentru predictia de tip user_based, metrica correlation is: 1.92
Precizia cu k=22, pentru predictia de tip user_based, metrica correlation, is: 1.0
Recall at k=22, pentru predictia de tip user_based, metrica correlation, is: 0.45454545454545453
Scorul F1 at k=22, pentru predictia de tip user_based, metrica correlation, is: 0.625


In [90]:
books_titles = pd.read_json("inputs/parsed_booksv2.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [91]:
recc_items_pearson = pd.concat([pd.DataFrame(indices_item_pearson), pd.DataFrame(similar_item_pearson).T], axis=0)

In [92]:
recc_items_pearson = recc_items_pearson.T

In [93]:
recc_items_pearson.columns= ["book_index", "similarity_score"]

In [94]:
# pd.to_numeric("book_index")
# recc_items_pearson["book_index"] = recc_items_pearson["book_index"].astype(np.int64)

In [95]:
# pd.to_numeric("book_index")
# recc_items_pearson["book_index"] = recc_items_pearson["book_index"].astype(str)

In [96]:
recc_items_pearson

Unnamed: 0,book_index,similarity_score
0,113135.0,1.0
1,154847.0,0.740415
2,128295.0,0.631099
3,157830.0,0.630469
4,211956.0,0.622745
5,279907.0,0.561767
6,355436.0,0.519662
7,530715.0,0.505364
8,64093.0,0.436979
9,64368.0,0.434846


In [97]:
recc_items_pearson_v5 = recc_items_pearson.merge(interactions, how="inner", on="book_index")

In [98]:
set(recc_items_pearson_v5["book_id"])

{'13415552',
 '13419829',
 '1609681',
 '17167159',
 '17927379',
 '18006456',
 '18243697',
 '20613468',
 '23766623',
 '28260585',
 '7896458'}

In [99]:
recc_items_pearson_v5["book_index"] = recc_items_pearson_v5["book_index"].astype(np.int64)
recc_items_pearson_v5["book_id"] = recc_items_pearson_v5["book_id"].astype(str)

In [100]:
recc_items_pearson_v5 = recc_items_pearson_v5.drop_duplicates(subset=["book_index"])

In [101]:
recc_items_pearson_v5

Unnamed: 0,book_index,similarity_score,user_id,book_id,rating,user_index
0,113135,1.0,339532,1609681,0,407
1,154847,0.740415,119660,17927379,0,29
6,128295,0.631099,59545,17167159,0,570
8,157830,0.630469,32978,18006456,0,392
24,211956,0.622745,165733,20613468,0,105
27,279907,0.561767,135938,23766623,5,54
35,355436,0.519662,31908,28260585,4,378
40,530715,0.505364,100887,7896458,0,2
45,64093,0.436979,224854,13415552,0,202
46,64368,0.434846,100887,13419829,3,2


In [102]:
books_info_v = []
with gzip.open("inputs/goodreads_books.json.gz", 'r') as f:
    while line := f.readline():
        for item in recc_items_pearson_v5["book_id"]:
            if bytes(str(item), encoding="UTF8") in line:
                line_v2 = json.loads(line)
                if line_v2["book_id"] == item:
                    books_info_v.append(get_needed_fields(line_v2))
                    break

In [103]:
recc_items_pearson_merge_book_titles = recc_items_pearson_v5.merge(pd.DataFrame(books_info_v), how="inner", on="book_id")

In [104]:
recc_items_pearson_merge_book_titles_v2 = recc_items_pearson_merge_book_titles.drop(columns=["user_id", "rating", "user_index"])
# recc_items_pearson_merge_book_titles_v2 = recc_items_pearson_merge_book_titles

In [105]:
recc_items_pearson_merge_book_titles_v2.drop_duplicates(subset=['book_index'])

Unnamed: 0,book_index,similarity_score,book_id,title,ratings_count,url,image_url,average_rating,authors,publisher
0,113135,1.0,1609681,Georges Méliès,16,https://www.goodreads.com/book/show/1609681.Ge...,https://images.gr-assets.com/books/1311990500m...,3.88,"[{'author_id': '534026', 'role': ''}]",Manchester University Press
1,154847,0.740415,17927379,The Queen of All Magick Elizabeth (Aaron's Kis...,135,https://www.goodreads.com/book/show/17927379-t...,https://images.gr-assets.com/books/1368649115m...,4.16,"[{'author_id': '4787929', 'role': ''}]",
2,128295,0.631099,17167159,Are We Lost? A Raymond and Sheila Story,11,https://www.goodreads.com/book/show/17167159-a...,https://images.gr-assets.com/books/1356387039m...,4.33,"[{'author_id': '303519', 'role': ''}, {'author...",Electric Eggplant
3,157830,0.630469,18006456,"Shards of Time (Nightrunner, #7)",1736,https://www.goodreads.com/book/show/18006456-s...,https://images.gr-assets.com/books/1377657570m...,4.25,"[{'author_id': '42110', 'role': ''}]",Del Rey
4,211956,0.622745,20613468,Independently Wealthy,182,https://www.goodreads.com/book/show/20613468-i...,https://images.gr-assets.com/books/1394629986m...,3.72,"[{'author_id': '4023658', 'role': ''}]",Thomas Dunne Books
5,279907,0.561767,23766623,A Court of Mist and Fury (A Court of Thorns an...,4277,https://www.goodreads.com/book/show/23766623-a...,https://images.gr-assets.com/books/1452783865m...,4.71,"[{'author_id': '3433047', 'role': ''}]",Bloomsbury Childrens Books
6,355436,0.519662,28260585,"Queen of Shadows (Throne of Glass, #4)",670,https://www.goodreads.com/book/show/28260585-q...,https://images.gr-assets.com/books/1459429184m...,4.6,"[{'author_id': '3433047', 'role': ''}]",
7,530715,0.505364,7896458,Heartstone,345,https://www.goodreads.com/book/show/7896458-he...,https://images.gr-assets.com/books/1327174229m...,4.31,"[{'author_id': '80212', 'role': ''}]",Mantle
8,64093,0.436979,13415552,Changed (Origins #2),11,https://www.goodreads.com/book/show/13415552-c...,https://images.gr-assets.com/books/1365619030m...,4.67,"[{'author_id': '4072281', 'role': ''}]",Jean Booth
9,64368,0.434846,13419829,التوأم الشرير,175,https://www.goodreads.com/book/show/13419829,https://s.gr-assets.com/assets/nophoto/book/11...,3.35,"[{'author_id': '13730', 'role': ''}]",


In [106]:
my_liked_books_v2 = my_liked_books.merge(interactions, how="inner", on="book_id")

In [107]:
my_liked_books_v2.drop(columns=["user_id_y", "rating_y", "user_index","rating_x"])

Unnamed: 0,user_id_x,book_id,title,book_index
0,-1,13104080,unravel me shatter me 2,55923
1,-1,13104080,unravel me shatter me 2,55923
2,-1,13104080,unravel me shatter me 2,55923
3,-1,13104080,unravel me shatter me 2,55923
4,-1,13104080,unravel me shatter me 2,55923
...,...,...,...,...
11252,-1,227729,the coffin dancer lincoln rhyme 2,253051
11253,-1,227729,the coffin dancer lincoln rhyme 2,253051
11254,-1,227729,the coffin dancer lincoln rhyme 2,253051
11255,-1,227729,the coffin dancer lincoln rhyme 2,253051


In [108]:
my_liked_books_v2 = my_liked_books_v2.drop_duplicates(subset="book_index")

In [109]:
#de aici pot lua book_index si sa folosesc in item-based similarity 
my_liked_books_v2

Unnamed: 0,user_id_x,book_id,rating_x,title,user_id_y,rating_y,user_index,book_index
0,-1,13104080,5,unravel me shatter me 2,-1,5,1,55923
576,-1,395875,3,dark reunion the vampire diaries 4,-1,3,1,448037
825,-1,1169808,3,the fury the vampire diaries 3,-1,3,1,29683
1083,-1,10429045,4,shatter me shatter me 1,-1,4,1,7556
1651,-1,13565676,5,the assassin and the empire throne of glass 05,-1,5,1,72241
2238,-1,23766634,5,a court of wings and ruin a court of thorns an...,-1,5,1,279908
2804,-1,13419891,5,the assassin and the desert throne of glass 03,-1,5,1,64369
3388,-1,142540,4,the empty chair lincoln rhyme 3,-1,4,1,82700
3439,-1,395922,4,the awakening the vampire diaries 1,-1,4,1,448045
3751,-1,20613470,5,heir of fire throne of glass 3,-1,5,1,211957
