In [None]:
# import
import pandas as pd
import numpy as np


LOR_PART_ONE_TITLE = 'the fellowship of the ring (the lord of the rings, part 1)'


# load ratings
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='cp1251', sep=';')
ratings = ratings[ratings['Book-Rating']!=0]

# load books
books = pd.read_csv('BX-Books.csv', encoding='cp1251', sep=';', error_bad_lines=False)

# TOMAS - I would select only relevant columns per each dataframe before joining them.
#         We could reduce the memory usage as well as speed up future operations.

#users_ratigs = pd.merge(ratings, users, on=['User-ID'])
dataset = pd.merge(ratings, books, on=['ISBN'])

# TOMAS - At this point I would suggest re-typing the columns where needed. E.g. for string type we could use "StringDtype".
dataset_lowercase = dataset.apply(lambda x: x.str.lower() if (x.dtype == 'object') else x)

tolkien_readers = dataset_lowercase['User-ID'][(dataset_lowercase['Book-Title']==LOR_PART_ONE_TITLE) & (dataset_lowercase['Book-Author'].str.contains("tolkien"))]
tolkien_readers = tolkien_readers.unique()

# final dataset
books_of_tolkien_readers = dataset_lowercase[(dataset_lowercase['User-ID'].isin(tolkien_readers))]

# Number of ratings per other books in dataset
number_of_rating_per_book = books_of_tolkien_readers.groupby(['Book-Title']).agg('count').reset_index()

# select only books which have actually higher number of ratings than threshold
books_to_compare = number_of_rating_per_book['Book-Title'][number_of_rating_per_book['User-ID'] >= 8]
books_to_compare = books_to_compare.tolist()

ratings_data_raw = books_of_tolkien_readers[['User-ID', 'Book-Rating', 'Book-Title']][books_of_tolkien_readers['Book-Title'].isin(books_to_compare)]

# group by User and Book and compute mean
ratings_data_raw_nodup = ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating'].mean()

# reset index to see User-ID in every row
ratings_data_raw_nodup = ratings_data_raw_nodup.to_frame().reset_index()

dataset_for_corr = ratings_data_raw_nodup.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')


In [24]:
LoR_list = [LOR_PART_ONE_TITLE]

result_list = []
worst_list = []

# for each of the trilogy book compute:
for LoR_book in LoR_list:
    
    #Take out the Lord of the Rings selected book from correlation dataframe
    dataset_of_other_books = dataset_for_corr.copy(deep=False)
    dataset_of_other_books.drop([LoR_book], axis=1, inplace=True)
      
    # empty lists
    book_titles = []
    correlations = []
    avgrating = []

    # TOMAS - TODO - could be vectorized?
    # corr computation
    for book_title in list(dataset_of_other_books.columns.values):
        book_titles.append(book_title)
        correlations.append(dataset_for_corr[LoR_book].corr(dataset_of_other_books[book_title]))

        # tab = ratings_data_raw[ratings_data_raw['Book-Title']==book_title].groupby(ratings_data_raw['Book-Title']).mean()
        # avgrating.append(tab['Book-Rating'].min())
        # TOMAS - same thing
        book_avg = ratings_data_raw[ratings_data_raw['Book-Title']==book_title]['Book-Rating'].mean()
        avgrating.append(book_avg)

    # final dataframe of all correlation of each book   
    corr_fellowship = pd.DataFrame(list(zip(book_titles, correlations, avgrating)), columns=['book', 'corr', 'avg_rating'])
    corr_fellowship.head()

    # top 10 books with highest corr
    result_list.append(corr_fellowship.sort_values('corr', ascending = False).head(10))
    
    # worst 10 books
    worst_list.append(corr_fellowship.sort_values('corr', ascending = False).tail(10))
    
print("Correlation for book:", LoR_list[0])
avg_lor_rating = ratings_data_raw[ratings_data_raw['Book-Title']==LOR_PART_ONE_TITLE].groupby(ratings_data_raw['Book-Title']).mean()
print(f"Average rating of LOR: {avg_lor_rating}")
rslt = result_list[0]

Correlation for book: the fellowship of the ring (the lord of the rings, part 1)
Average rating of LOR:                                                           User-ID  Book-Rating
Book-Title                                                                    
the fellowship of the ring (the lord of the rin...  143720.521951     8.882927


In [25]:
result_list

[                                                 book      corr  avg_rating
 33                                           stardust  0.909450    7.500000
 38  the drawing of the three (the dark tower, book 2)  0.907758    8.000000
 53                              the phantom tollbooth  0.896262    8.500000
 24                                                 it  0.887229    8.333333
 66                                      the testament  0.852803    8.000000
 4                                   a wrinkle in time  0.848478    8.357143
 9       ender's game (ender wiggins saga (paperback))  0.836660    9.307692
 29                                       pet sematary  0.830264    7.636364
 47                the hobbit: or there and back again  0.820499    8.800000
 40                                           the gift  0.808511    7.750000]

# Vectorization

### Partially vectorized version of computing correlation+avg_rating matrix

In [98]:
LoR_list = [LOR_PART_ONE_TITLE]

result_list = []
worst_list = []

# for each of the trilogy book compute:
for LoR_book in LoR_list:
    
    # take out the Lord of the Rings selected book from correlation dataframe
    dataset_of_other_books = dataset_for_corr.copy(deep=False)
    dataset_of_other_books.drop([LoR_book], axis=1, inplace=True)

    titles = dataset_of_other_books.columns.values
    correlations = dataset_of_other_books.corrwith(dataset_for_corr[LoR_book])
    avg_ratings = ratings_data_raw.groupby('Book-Title')['Book-Rating'].mean().drop(LoR_book)

    corr_fellowship = pd.DataFrame({'book': titles, 'corr': correlations, 'avg_rating': avg_ratings}).reset_index(drop=True)

    # TOMAS - In case of multiple LoR_books, we might receive duplicates. Also, I am not sure about the exact specification,
    #         but returning +10 books per each LoR_book seems weird. Maybe it should be a fixed amount of top books?
    #         We could join the results, average the duplicate book reccomendations corr coefs, sort, and return top-k.
    
    # top 10 books with highest corr
    result_list.append(corr_fellowship.sort_values('corr', ascending = False).head(10))

    # worst 10 books
    worst_list.append(corr_fellowship.sort_values('corr', ascending = False).tail(10))

print("Correlation for book:", LoR_list[0])
avg_lor_rating = ratings_data_raw[ratings_data_raw['Book-Title']==LOR_PART_ONE_TITLE].groupby(ratings_data_raw['Book-Title']).mean()
print(f"Average rating of LOR: {avg_lor_rating}")
rslt = result_list[0]

Correlation for book: the fellowship of the ring (the lord of the rings, part 1)
Average rating of LOR:                                                           User-ID  Book-Rating
Book-Title                                                                    
the fellowship of the ring (the lord of the rin...  143720.521951     8.882927


In [100]:
result_list

[                                                 book      corr  avg_rating
 33                                           stardust  0.909450    7.500000
 38  the drawing of the three (the dark tower, book 2)  0.907758    8.000000
 53                              the phantom tollbooth  0.896262    8.500000
 24                                                 it  0.887229    8.333333
 66                                      the testament  0.852803    8.000000
 4                                   a wrinkle in time  0.848478    8.357143
 9       ender's game (ender wiggins saga (paperback))  0.836660    9.307692
 29                                       pet sematary  0.830264    7.636364
 47                the hobbit: or there and back again  0.820499    8.800000
 40                                           the gift  0.808511    7.750000]