In [2]:
import graphlab as gl
import numpy as np
import math

In [4]:
book_data = gl.load_sframe("./book_data_clean/")

In [6]:
book_data

book_id,title,author,year,publisher
195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
374157065,Flu: The Story of the Great Influenza Pandemic ...,Gina Bari Kolata,1999,Farrar Straus Giroux
393045218,The Mummies of Urumchi,E. J. W. Barber,1999,"W. W. Norton &amp, Company ..."
399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
425176428,What If?: The World's Foremost Military ...,Robert Cowley,2000,Berkley Publishing Group
671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
679425608,Under the Black Flag: The Romance and the Reality ...,David Cordingly,1996,Random House
074322678X,Where You'll Find Me: And Other Stories ...,Ann Beattie,2002,Scribner


In [4]:
rating_data = gl.load_sframe("./explicit_rating_data/")

In [5]:
rating_data = rating_data.unstack(["book_id", "ratings"], "book_rating")

In [6]:
rating_data

user_id,book_rating
21855,{'0525464689': 5}
237313,"{'0061031844': 4, '0380775123': 10, ..."
144280,"{'0385497466': 9, '0671004549': 8, ..."
219673,{'0440212359': 7}
26319,"{'8881123339': 8, '8880000012': 10, ..."
66110,{'0689831285': 10}
179149,{'089087333X': 7}
171625,{'0515121843': 10}
262565,"{'0486406547': 8, '0679734775': 8} ..."
115341,"{'0440217512': 8, '0679781587': 4} ..."


In [None]:
"""
#code to convert rating SFrame to a dictionary with user_id as key and another dictionary as value.
#Other dictionary contain key:value pair as book_id:rating corresponding to ratings given by a user to resp. books
count = 0
critics = {}
for user in rating_data["user_id"]:
    if count % 100 == 0: 
        print 1,
    count += 1
    critics[str(user)] = rating_data[rating_data["user_id"] == user]["book_rating"][0]
     
#Computing critics dictionary takes considerable time, therefore it is better to save dictionary after computation
#so that threre is no need to compute it again

np.save("rating_dictionary.npy", critics)
"""

In [2]:
critics = np.load("rating_dictionary.npy").item()

In [4]:
# Returns a distance based similarity score based for user1 and user2
# Score between (0-1) score 1 means distance zero, higher the score more similar the users are
def euclid(ratings, user1, user2):
    flag = 0
    for item in ratings[user1]:
        if item in ratings[user2]:
            flag = 1; break
            
    # if no ratings in common, return 0
    if flag == 0: return 0
    
    # Add up the squares of all differences
    sum_squares = sum([pow(ratings[user1][item]-ratings[user2][item],2) 
                   for item in ratings[user1] if item in ratings[user2]])
    
    return 1/(1+sum_squares) 

In [None]:
critics.items()

In [5]:
# Returns pearson corelation coefficient for user1 and user2
# Score between -1 and 1 more score means more similarity b/w users 
def pearson(rats, user1, user2):
    # List of rated items
    shared_items = {}
    for item in rats[user1]:
        if item in rats[user2]:
            shared_items[item] = 1
            
    n = len(shared_items)
    # if no common item, return 0
    if n == 0: return 0
    
    # Add up all the ratings
    sum1 = sum([rats[user1][item] for item in shared_items])
    sum2 = sum([rats[user2][item] for item in shared_items])
    
    # Sum up all the squares of ratings
    sum1Sq = sum([pow(rats[user1][item],2) for item in shared_items])
    sum2Sq = sum([pow(rats[user2][item],2) for item in shared_items])
    
    # Sum up all the products
    prodSum = sum([rats[user1][item]*rats[user2][item] for item in shared_items]) 
    
    # Calculate pearson score
    num = prodSum - (sum1*sum2/n)
    temp = math.sqrt((sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n))
    if temp == 0: return 0
    
    score = num/temp
    return score
    

In [18]:
count = -1
for key in critics.keys():
    if count == 5: break
    res = euclid(critics, "36606", key)
    if res != 0:
        count += 1
        print key, res, count
        

36606 1 0
245827 1 1
60905 1 2
57661 1 3
21937 1 4
201404 1 5


In [89]:
"""
Computing similarity of one user to every other user in dataset
This function will return a list of tuples with tuples containing similarity and id of the user
NOTICE THAT THE FUNCTION IS CALLED BY USING ONE OF THE ARGUMENT (NOT THE NAME OF FUNCTION ITSELF) 
ALSO NOTE THAT [PEARSON] IS [NOT] PASSED AS A STRING BUT AS A [FUNCTION] TO METHOD

This function returns (n) most similar users where n is the number of movies we want our recommender to recommend,
(n) here can be increased to get even better results
"""

from operator import itemgetter
def getSimilarUsers(ratings, user, n=50, method=pearson):
    sim = [(other, method(ratings, user, other)) for other in ratings if other!=user]
    
    # Sort list so that more similar users appear at top
    sim = sorted(sim, key=itemgetter(1), reverse=True)
    
    # If method used is pearson and first similarity is 0 means no similar user found, use euclid in such case
    if method == pearson and sim[0][1] == 0:
        sim = similarity_list(ratings, user, n, method=euclid)
    
    # n denotes number of results to be returned
    return sim[0:n]
        

In [90]:
def getRecommendations(ratings, user, n=5, method=pearson):
    totals = {}
    simSums = {}
    # Get a list of n most similar users
    similar_users = getSimilarUsers(ratings, user, n*10, method)
    
    # For every similar user in similar_users rate the movie that user has'nt rated yet
    for similar in similar_users:
        other = similar[0]
        sim = similar[1]
        # if similarity less than 0, ignore
        if(sim <= 0): continue
            
        for item in ratings[other]:
            # only score movies user hasn't seen yet
            if item not in ratings[user] or ratings[user][item] == 0:
                # similarity * other user rating
                totals.setdefault(item, 0)
                totals[item] += ratings[other][item]*sim
                # sum of similarities
                simSums.setdefault(item, 0)
                simSums[item] += sim
    
    # Normalize predicted ratings and store then as tuples in a list
    rankings = [(item, total/simSums[item]) for item,total in totals.items()]
    rankings = sorted(rankings, key=itemgetter(1), reverse=True)
    return rankings[0:n]
    

In [98]:
def recommend(ratings, user, n=5, method=pearson):
    ids_ratings = getRecommendations(ratings, user, n+50, method)
    #list storing details of recommended books
    list_of_books = []
    
    # Serach a book via its id in book_data and append all its details along with rating to list_of_books
    count = 0
    for item in ids_ratings:
        if count == n: break
        # if book details not present in book_data, skip over to next until (n) books are appended to list
        if item[0] not in book_data["ISBN"]: continue
            
        count += 1
        book = book_data[book_data["ISBN"] == item[0]][0]
        if item[1] > 10:
            book["rating"] = 10
        else:
            book["rating"] = item[1]
        # delete book id from dctionary
        del(book["ISBN"])
        list_of_books.append(book)
        
    return list_of_books[0:n]
    
    

In [71]:
book_data[book_data["ISBN"] == "195153448"][0]

{'Book-Author': 'Mark P. O. Morford',
 'Book-Title': 'Classical Mythology',
 'ISBN': '195153448',
 'Publisher': 'Oxford University Press',
 'Year-Of-Publication': 2002}

In [99]:
lis = recommend(critics, "36606", method=pearson)

In [100]:
lis

[{'Book-Author': 'Chaim Potok',
  'Book-Title': 'My Name Is Asher Lev',
  'Publisher': 'Anchor Books/Doubleday',
  'Year-Of-Publication': 2003,
  'rating': 10},
 {'Book-Author': 'James Mellon',
  'Book-Title': 'Bullwhip Days: The Slaves Remember',
  'Publisher': 'Pub Group West',
  'Year-Of-Publication': 1988,
  'rating': 10},
 {'Book-Author': 'Pearl S. Buck',
  'Book-Title': 'Pavilion of Women (Oriental Novels of Pearl S. Buck)',
  'Publisher': 'Alliance House Inc',
  'Year-Of-Publication': 1990,
  'rating': 10},
 {'Book-Author': 'Robin. Morgan',
  'Book-Title': 'Going Too Far: The Personal Chronicle of a Feminist',
  'Publisher': 'Random House Inc',
  'Year-Of-Publication': 1978,
  'rating': 10},
 {'Book-Author': 'Janette Oke',
  'Book-Title': 'Return to Harmony',
  'Publisher': 'Bethany House Publishers',
  'Year-Of-Publication': 1996,
  'rating': 10}]