In [None]:
# Mount Google Drive inside the Colab VM; the following cells read the dataset
# from a folder under "My Drive".
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
cd "/content/drive/My Drive/IRS"

/content/drive/My Drive/IRS


In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import nltk

In [None]:
# Load the four goodbooks-10k tables; only the identifying columns of `books` are kept.
books = pd.read_csv('./goodbooks-10k/books.csv')[['book_id', 'goodreads_book_id', 'title', 'authors']]
ratings = pd.read_csv('./goodbooks-10k/ratings.csv')
book_tags = pd.read_csv('./goodbooks-10k/book_tags.csv')
tags = pd.read_csv('./goodbooks-10k/tags.csv')

# Quick sanity check: shape and column names of every table, in load order.
for _frame in (books, ratings, book_tags, tags):
    print(_frame.shape)
    display(_frame.columns)

(10000, 4)


Index(['book_id', 'goodreads_book_id', 'title', 'authors'], dtype='object')

(5976479, 3)


Index(['user_id', 'book_id', 'rating'], dtype='object')

(999912, 3)


Index(['goodreads_book_id', 'tag_id', 'count'], dtype='object')

(34252, 2)


Index(['tag_id', 'tag_name'], dtype='object')

In [None]:
# Largest user id currently in the ratings table (used to pick an unused id for a test user).
ratings['user_id'].max()

53424

In [None]:
# Add one synthetic user (id 53425, i.e. one past the largest existing user_id)
# with four hand-picked ratings so the recommender can be tried on a known profile.
new_user_ratings = pd.DataFrame({
    'user_id' : [53425, 53425, 53425, 53425],
    'book_id' : [2, 18, 23, 25],
    'rating' : [4.5, 4.1, 3.8, 4.2]
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Rows already present for this user are dropped first, so re-running the cell
# replaces the profile instead of stacking duplicate ratings.
ratings = pd.concat(
    [ratings[ratings.user_id != 53425], new_user_ratings],
    ignore_index = True
)

In [None]:
# Attach book_id / title / authors to every (book, tag) row via the Goodreads id.
book_tags = pd.merge(book_tags, books, how = 'inner', on = 'goodreads_book_id')


In [None]:
from scipy.sparse import csr_matrix, dok_matrix, diags

# Build the TF-IDF matrix `books_vector` (one row per distinct title, one column per tag_id).
#
# Row numbers are the positions of the titles in sorted order, recorded in
# `title_mapping` (title -> row). pd.factorize(sort=True) yields exactly that
# numbering in one vectorised pass; iterating `groupby(['title'])` instead would
# hand back 1-tuples as keys on pandas >= 2.0 and silently break every later
# `title_mapping[title]` lookup.
tagged = book_tags.dropna(subset = ['title'])  # groupby ignored NaN titles; keep that behaviour
row_index, sorted_titles = pd.factorize(tagged['title'], sort = True)
col_index = tagged['tag_id'].to_numpy()
indices = np.column_stack((row_index, col_index))
title_mapping = {title: position for position, title in enumerate(sorted_titles)}

# Every (title, tag) occurrence counts once; duplicates are summed by csr_matrix.
values = np.ones(row_index.shape[0])


n_books = books.title.unique().shape[0]
n_tags = tags.tag_id.max() + 1

tf = csr_matrix((values, (row_index, col_index)), shape = (n_books, n_tags))
# Document frequency: number of titles whose row has a stored entry for each tag.
df = np.bincount(tf.indices, minlength=tf.shape[1])


# Normalise each row by its total tag count. Rows without any tag keep weight 0
# rather than triggering a divide-by-zero (their rows are empty either way).
row_totals = np.asarray(tf.sum(axis = 1)).ravel()
inverse_totals = np.divide(1.0, row_totals, out = np.zeros_like(row_totals, dtype = float), where = row_totals > 0)
weights = diags(inverse_totals, offsets = 0, shape = (n_books, n_books))
tf = weights@tf

# Smoothed inverse document frequency, applied as a diagonal column scaling.
idf = np.log(n_books/(df + 1)) + 1

idf = diags(idf, 0, shape=(n_tags, n_tags), format = 'csr')

print(tf.shape, idf.shape)
books_vector = tf@idf

print(type(books_vector))

(9964, 34252) (34252, 34252)
<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# Join every rating with its book's metadata so ratings can be addressed by title.
user_book_ratings = pd.merge(ratings, books, how = 'inner', on = 'book_id')


In [None]:
from scipy.sparse.linalg import norm
from scipy.sparse import csr_matrix, dok_matrix

# Build `user_pref`: a sparse (user_id x title-row) matrix of ratings.
# Several book_ids can share one title, so ratings are averaged per (user, title).
user_ratings = user_book_ratings.groupby(['user_id', 'title']).agg({'rating':'mean'})

user_ratings = user_ratings.reset_index()

# Translate titles to matrix columns in one vectorised lookup instead of looping
# over millions of rows in Python. Unknown titles still fail loudly with KeyError.
title_codes = user_ratings['title'].map(title_mapping)
if title_codes.isna().any():
    raise KeyError(user_ratings.loc[title_codes.isna(), 'title'].iloc[0])

row_index = user_ratings['user_id'].to_numpy()
col_index = title_codes.to_numpy(dtype = np.int64)
indices = np.column_stack((row_index, col_index))
values = user_ratings['rating'].to_numpy()

# user_id is used directly as the row number, hence max id + 1 rows.
n_users = user_ratings.user_id.max() + 1

print(row_index.shape, n_users)
user_pref = csr_matrix((values, (row_index, col_index)), shape=(n_users, n_books))

(5975165,) 53426


In [None]:
from scipy.sparse.linalg import norm

def get_recommendations(user):
  """Print and display content-based recommendations for one user.

  The user's ratings (row `user` of `user_pref`) are projected into tag space
  through `books_vector`; every book is then scored by cosine similarity to
  that profile, already-rated books are removed, and the ten best remaining
  titles are displayed.

  Output, in order: the profile's shape and stored-entry count, the names of
  the profile's 20 highest-weighted tags, the 10 highest-weighted tags of each
  recommended book, the books the user has rated, and the top-10 table.

  Args:
    user: integer user id; used directly as the row index into `user_pref`.

  Returns:
    None. Results are printed / displayed.

  Raises:
    IndexError: if `user` is outside the rows of `user_pref`.
    KeyError: if one of the user's rated titles is missing from `title_mapping`.
  """
  # Dense 1-D copy of the user's ratings; 0 means "not rated".
  rating_row = user_pref[user].toarray().ravel()
  if not (rating_row > 0).any():
    print(f'User {user} has no ratings; nothing to recommend.')
    return

  # NOTE(review): this mean is taken over ALL books, unrated zeros included, so
  # it is close to 0 and the "centering" below barely changes the ratings.
  # Kept as-is because switching to the mean of rated books would change the
  # recommendations; confirm which behaviour is intended.
  mean_rating = rating_row.mean()
  normalized_ratings = csr_matrix(np.where(rating_row > 0, rating_row - mean_rating, rating_row))

  user_profile = normalized_ratings@books_vector

  print(user_profile.shape, user_profile.getnnz())

  profile_norm = norm(user_profile)
  if profile_norm == 0:
    print(f'User {user} has an empty tag profile; nothing to recommend.')
    return

  # Cosine similarity between the profile and every book. Books without tags
  # have a zero norm; their score becomes NaN and sorts to the bottom.
  with np.errstate(divide = 'ignore', invalid = 'ignore'):
    cosines = (user_profile @ books_vector.T).toarray().ravel() / profile_norm / norm(books_vector, axis = 1)

  most_similar = pd.DataFrame({'similarity score' : cosines})

  # Titles the user already rated (mask computed once, reused for the display below).
  rated_titles = user_book_ratings.loc[user_book_ratings.user_id == user, 'title'].to_numpy()
  books_read_by_user = [title_mapping[book] for book in rated_titles]

  # Non-inplace sort: sorting a filtered slice in place triggers SettingWithCopyWarning.
  most_similar = (
      most_similar[~most_similar.index.isin(books_read_by_user)]
      .sort_values(by = 'similarity score', ascending = False)
  )

  # Attach titles by row number; an inner merge keeps the left (sorted) order.
  recommendations = most_similar.merge(right = pd.DataFrame({
      'book title' : list(title_mapping.keys())
  },index = list(title_mapping.values())) , how = 'inner', left_index = True, right_index = True)

  # The 20 strongest tags of the user profile (listed in tag-table order).
  print(tags[tags.tag_id.isin(np.argsort(user_profile.toarray()[0])[::-1][:20])]['tag_name'])
  top10 = recommendations[: 10]

  # For each recommended book, the ids of its 10 highest-weighted tags.
  top_tags = books_vector[top10.index.to_numpy()].toarray().argsort(axis = 1)[:,::-1][:,:10]
  display([tags[tags.tag_id.isin(row)]['tag_name'].values for row in top_tags])
  display(books[books.title.isin(rated_titles)])
  display(top10)

In [None]:
# 53425 is the synthetic user added to `ratings` earlier in the notebook.
get_recommendations(53425)

(1, 34252) 111
2106       all-time-favourites
6838       childhood-favorites
6914            children-s-lit
6920     children-s-literature
11491                    faves
11574                favourite
11579          favourite-books
14017             harry-potter
14024      harry-potter-series
15965              j-k-rowling
16427               jk-rowling
19974             middle-grade
24092                   potter
24964                 re-reads
25234      read-more-than-once
25770                  rereads
30573               to-re-read
32623                  witches
32654                  wizards
33165                    youth
Name: tag_name, dtype: object


[array(['all-time-favourites', 'favourite-books', 'harry-potter',
        'harry-potter-series', 'j-k-rowling', 'jk-rowling', 'magical',
        'potter', 'rereads', 'wizards'], dtype=object),
 array(['all-time-favourites', 'favourite-books', 'harry-potter',
        'harry-potter-series', 'j-k-rowling', 'jk-rowling', 'potter',
        'rereads', 'séries', 'wizards'], dtype=object),
 array(['fairytale', 'fairytales', 'harry-potter', 'harry-potter-series',
        'hp', 'j-k-rowling', 'jk-rowling', 'potter', 'rowling', 'wizards'],
       dtype=object),
 array(['50-books-to-read-before-you-die', 'absolute-favorites',
        'all-time-faves', 'all-time-favs', 'favourite-series', 'harry',
        'harry-potter', 'j-k-rowling', 'my-favorite-books', 'potter'],
       dtype=object),
 array(['companion-books', 'fantasía', 'harry-potter',
        'harry-potter-series', 'hp', 'j-k-rowling', 'jk-rowling', 'potter',
        'rowling', 'wizards'], dtype=object),
 array(['boarding-school', 'harry-po

Unnamed: 0,book_id,goodreads_book_id,title,authors
1,2,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
17,18,5,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck"
22,23,15881,Harry Potter and the Chamber of Secrets (Harry...,"J.K. Rowling, Mary GrandPré"
24,25,136251,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré"


Unnamed: 0,similarity score,book title
2959,0.93701,Harry Potter and the Half-Blood Prince (Harry ...
2958,0.935692,Harry Potter and the Goblet of Fire (Harry Pot...
8666,0.647403,The Tales of Beedle the Bard
2949,0.621682,"Harry Potter Boxset (Harry Potter, #1-7)"
5135,0.618315,Quidditch Through the Ages
2960,0.611508,Harry Potter and the Order of the Phoenix (Har...
2950,0.603957,"Harry Potter Collection (Harry Potter, #1-6)"
2355,0.583565,Fantastic Beasts and Where to Find Them
2961,0.495631,Harry Potter and the Order of the Phoenix (Har...
7293,0.473323,"The Harry Potter Collection 1-4 (Harry Potter,..."


In [None]:
from scipy.sparse import csr_matrix, dok_matrix
from scipy.sparse.linalg import norm

# Scratch data for experimenting with scipy sparse operations.
# 2x3 dense array counting down from 6 to 1.
vector = np.arange(6, 0, -1).reshape(2, 3)
print(vector)

# Small 3x3 sparse matrix whose last row is all zeros.
matrix = csr_matrix(np.array([
    [1, 0, 2],
    [2, 1, 0],
    [0, 0, 0],
]))
matrix.toarray()

[[6 5 4]
 [3 2 1]]


array([[1, 0, 2],
       [2, 1, 0],
       [0, 0, 0]], dtype=int64)

In [None]:
# Set individual entries of the 3x3 scratch matrix.
# Only indices 0..2 are valid: the former writes to (0, 3) and (3, 3) raised
# IndexError and stopped the cell (and any "Run All") before (2, 2) was set.
# Item assignment is done on a LIL copy, which supports it efficiently, and the
# result is converted back to CSR for the arithmetic in the following cells.
editable = matrix.tolil()
editable[0,1] = 1
editable[2,2] = 1

matrix = csr_matrix(editable)

  self._set_intXint(row, col, x.flat[0])


IndexError: ignored

In [None]:
# Show the dense form of the scratch matrix, then its per-row means.
print(matrix.toarray(), matrix.mean(axis = 1), sep = '\n')

In [None]:
# Second 3x3 sparse operand, then the sparse product matrix x matrix2 shown densely.
matrix2 = csr_matrix(np.array([
    [1, 0, 2],
    [2, 1, 0],
    [0, 0, 0],
]))
print(matrix.toarray(),'\n', matrix2.toarray())
matrix.dot(matrix2).toarray()

In [None]:
# Divide the matrix by its per-row sums (axis = 1), i.e. the same row
# normalisation applied to `tf` above. A row whose sum is 0 divides by zero.
matrix/matrix.sum(axis = 1)

In [None]:
# Look up the Harry Potter titles to pick the books rated by the synthetic user.
books[books.title.str.contains('Harry Potter')] 
# book_id (not goodreads_book_id) of the four rated books: 2, 18, 23, 25