In [None]:
!pip install efficient-apriori 
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from efficient_apriori import apriori # the apriori algorithm (finds association rules in a series of transactions)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

First, we import all of our data into Pandas dataframes.

In [None]:
# Read every raw CSV into its own DataFrame. The generator is consumed in order
# by the unpacking, so the files are read in the order they are listed.
booksRaw, tagsRaw, bookTagsRaw, ratingsRaw, to_readRaw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/books.csv",
        "../input/tags.csv",
        "../input/book_tags.csv",
        "../input/ratings.csv",
        "../input/to_read.csv",
    )
)

We merge bookTags with tags, which associates tag names with the book tags. Then, we group the tags by book_id to streamline the process of making a keyword list for each book.

In [None]:
# Attach each tag's name to the (book, tag) rows by joining on the shared
# tag_id column; rows without a match on either side are dropped (inner join).
all_tags = bookTagsRaw.merge(
    tagsRaw,
    how='inner',
    left_on='tag_id',
    right_on='tag_id',
)
# NOTE: an earlier experiment also concatenated booksRaw's (book_id, authors)
# columns into all_tags; that step is disabled here.

# One group per goodreads_book_id, for per-book tag lookups.
all_tags_grouped = all_tags.groupby("goodreads_book_id")

# Preview the last 100 merged rows (shown as the cell output).
all_tags.tail(100)

We group the ratings by users in preparation for making the user profiles.

In [None]:
# Sort the ratings by user and group them, so that one user's ratings can be
# fetched with ratingsGrouped.get_group(user_id).
ratingsGrouped = ratingsRaw.sort_values('user_id').groupby(by='user_id')
# Preview the ratings of user 1 (shown as the cell output).
ratingsGrouped.get_group(1)

Time to make the user profiles. Each profile has a list of keyword lists for all rated books. We also add a special "like" or "dislike" word to each list of keywords that depends on the user's rating of the book. 

We also keep two aggregate sets: all keywords of high-rated books and all keywords of low-rated books. We remove the intersection of the two from every keyword list. If a 5-star book and a 1-star book are both tagged with "sci-fi", we can't say much about how the user rates sci-fi books.

In [None]:
class UserProfile:
    """Keyword profile of a single user, built from the books that user rated.

    Reads the module-level ``ratingsGrouped``, ``all_tags`` and ``booksRaw``
    objects, which must exist before ``makeUserProfile`` is called.

    Attributes:
        userID: key used to fetch the user's ratings from ``ratingsGrouped``.
        allTags: one keyword list per rated book (tag names plus author
            names), each ending with the marker "like" or "dislike".
        highWords: every keyword found on a book rated 3 or higher.
        lowWords: every keyword found on a book rated below 3.
    """

    def __init__(self, user_id):
        self.userID = user_id
        self.allTags = []
        self.highWords = set()
        self.lowWords = set()

    def _keywordsFor(self, book_id):
        """Return the tag names and author names recorded for ``book_id``."""
        # NOTE(review): the ratings' book_id is matched against
        # all_tags.goodreads_book_id here -- confirm that both columns use the
        # same id scheme in the input data.
        keywords = all_tags[all_tags.goodreads_book_id == book_id].tag_name.tolist()

        authors = booksRaw[booksRaw.book_id == book_id].authors.tolist()
        if authors and isinstance(authors[0], str):
            # Strip the blank that follows each comma so that an author is the
            # same keyword regardless of their position in the author list.
            keywords.extend(name.strip() for name in authors[0].split(","))
        return keywords

    def makeUserProfile(self):
        """Populate ``allTags``, ``highWords`` and ``lowWords`` for this user.

        Books rated 3 or higher get the marker "like", the others "dislike".
        Keywords that occur on both liked and disliked books carry no signal
        about the user's taste and are removed from every keyword list.

        Raises:
            KeyError: if ``ratingsGrouped`` has no group for ``self.userID``.
        """
        ratedBooks = ratingsGrouped.get_group(self.userID)
        high = ratedBooks[ratedBooks.rating >= 3]
        low = ratedBooks[ratedBooks.rating < 3]

        print("Highly rated: ", high)
        print("Didn't think much of these: ", low)

        buckets = (
            (high, self.highWords, "like"),
            (low, self.lowWords, "dislike"),
        )
        for books, seen_words, marker in buckets:
            for book_id in books['book_id']:
                keywords = self._keywordsFor(book_id)
                # update() mutates the set in place; union() would build a new
                # set and leave seen_words untouched.
                seen_words.update(keywords)
                keywords.append(marker)
                self.allTags.append(keywords)

        ambiguous = self.highWords & self.lowWords
        if ambiguous:
            # Rebuild each list instead of popping while indexing, which would
            # skip the element that follows every removal.
            self.allTags = [
                [word for word in keywords if word not in ambiguous]
                for keywords in self.allTags
            ]

# Build the keyword profile for one example user (id 17566) and print the
# resulting per-book keyword lists.
user2 = UserProfile(17566)
user2.makeUserProfile()
print(user2.allTags)

We use the apriori algorithm to find association rules across all the keyword lists. It essentially works by finding frequent sets of words and keeping track of how often they appear together.

In [None]:
# Total number of keywords across all of the user's per-book keyword lists.
print("Length of highTags: ", sum(map(len, user2.allTags)))

# apriori returns an (itemsets, rules) pair; each keyword list is one transaction.
allItemsets, allRules = apriori(
    user2.allTags,
    min_support=0.13,
    min_confidence=0.3,
)

print(
    "Without minimum confidence and support, there would be over rules! Here we narrow it down to just",
    len(allRules),
    ".",
)
print(allRules)

We only keep the rules that associate sets of keywords with high or low ratings, and we use these rules to find relevant books (books that fire many rules).

In [None]:
# Keep only the rules whose right-hand side is exactly the rating marker:
# "these keywords imply like" and "these keywords imply dislike".
highRules = {rule for rule in allRules if len(rule.rhs) == 1 and 'like' in rule.rhs}
lowRules = {rule for rule in allRules if len(rule.rhs) == 1 and 'dislike' in rule.rhs}

# Each rule's left-hand side as a set, built once instead of once per book.
high_lhs = [(rule.lhs, frozenset(rule.lhs)) for rule in highRules]
low_lhs = [(rule.lhs, frozenset(rule.lhs)) for rule in lowRules]

# goodreads_book_id -> title, built once so that scoring a book does not scan
# booksRaw. setdefault keeps the first title if an id appears more than once.
titles_by_book = {}
for goodreads_id, title in zip(booksRaw["goodreads_book_id"], booksRaw["title"]):
    titles_by_book.setdefault(goodreads_id, title)

predicted_ratings = []
for book_id, book_tags in all_tags_grouped:
    tags = set(book_tags['tag_name'])
    explanation = []  # keywords of every rule that fired for this book
    total = 0         # +1 per fired "like" rule, -1 per fired "dislike" rule

    for lhs, lhs_words in high_lhs:
        if lhs_words <= tags:
            explanation.extend(lhs)
            total += 1

    for lhs, lhs_words in low_lhs:
        if lhs_words <= tags:
            explanation.extend(lhs)
            total -= 1

    # Store the title itself (None when the book is missing from booksRaw)
    # rather than a one-row Series.
    predicted_ratings.append([titles_by_book.get(book_id), explanation, total])

predicted_ratings = pd.DataFrame(
    predicted_ratings,
    columns=["book_name", "explanation", "predicted_rating"],
)
predicted_ratings

All that's left is to rescale the predicted ratings and sort them in descending order. To get the top-k recommendations, just return the highest k entries of this list.

In [None]:
# Bounds of the raw scores. They are kept under their own names so that the
# builtins min() and max() stay usable in later cells.
rating_min = predicted_ratings["predicted_rating"].min()
rating_max = predicted_ratings["predicted_rating"].max()
rating_span = rating_max - rating_min


def minmax(x):
    """Rescale a raw predicted rating linearly into the range [0, 1].

    Returns 0.0 when every book has the same raw score, because there is no
    range to scale over and dividing by the zero span would fail.
    """
    if rating_span == 0:
        return 0.0
    return (x - rating_min) / rating_span


predicted_ratings["rescaled_rating"] = predicted_ratings["predicted_rating"].apply(minmax)
# Best recommendations first (shown as the cell output).
predicted_ratings.sort_values("rescaled_rating", ascending=False)