# Computing cooccurrence matrix

In [5]:
import graphlab as gl
import numpy as np

In [8]:
explicit_data = gl.load_sframe("explicit_rating_data")

[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1470115028.log


This non-commercial license of GraphLab Create for academic use is assigned to rahultyagirt7@outlook.com and will expire on June 03, 2017.


In [9]:
explicit_data = explicit_data.unstack(["user_id", "ratings"], "user/rating")

In [10]:
explicit_data

book_id,user/rating
445200456,"{225968: 8, 41589: 8, 165758: 5, 252695: 8} ..."
1863735313,{225232: 8}
966828917,{135778: 8}
142180106,"{208619: 10, 19981: 8}"
918668034,{123094: 7}
9722214683,{78973: 7}
345274563,"{81560: 8, 7346: 9, 239748: 6, 83685: 7, ..."
1885843070,{194600: 8}
60952938,"{59656: 8, 32773: 6}"
1573228516,"{30257: 9, 260183: 6, 263460: 10, 154543: 5} ..."


In [13]:
big_data, small_data = explicit_data.random_split(.95, seed=0)
small_data.head(10)

book_id,user/rating
192824392,{227422: 7}
199102058,{140758: 10}
30595746,{108997: 8}
8426430294,{217740: 4}
712666745,{256688: 7}
3442120772,"{57435: 10, 167166: 10}"
2130490921,{139480: 6}
2744147737,{192068: 1}
465050522,"{238890: 10, 43500: 8}"
897211499,{180802: 8}


In [62]:
# This dictionary will store count of total users that liked  value for each book
normalize_dict = {}
for book in small_data:
    res = sum([1 for rate in book["user/rating"].values() if rate > 5])
    normalize_dict.setdefault(book["book_id"], 0)
    normalize_dict[book["book_id"]] = res
        

In [75]:
"""
Cooccurrence matrix bulit but TOO SLOW... It took almost 1 hour to compute matrix on 5% of original data 

Dictionary is built with key as book1 id and value as another dictionary conataing book2 id as key and common 
readers b/w books as value
"""
master_dict = {}
for book1 in small_data:
    # flag used to skip master_dict to add empty temp_dict(with no common users)
    flag1 = 0
    temp_dict= {}
    
    for book2 in small_data:
        if book1 == book2: continue
        # To assert, at least one user is found in common b/w book1 and book2
        flag2 = 0
            
        # Check if user rated both the movies, if yes increase the count for these two movies
        for user in book2["user/rating"].keys():
            # users that likes book1 OR book2
            book1_or_book2 = normalize_dict[book1["book_id"]] + normalize_dict[book2["book_id"]]
            
            if user in book1["user/rating"].keys():
                #if rating <= 5 skip the book(user don't like the book)
                if book1["user/rating"][user] <= 5: continue 
                        
                flag1 = 1
                if book2["book_id"] not in temp_dict:
                    temp_dict.setdefault(book2["book_id"], 0)
                temp_dict[book2["book_id"]] += 1
                flag2 = 1
                
        # Normalizing values of common users through JACCARD SIMILARITY
        if flag2 == 1: temp_dict[book2["book_id"]] /= float(book1_or_book2)
    
    if flag1 == 1:
        master_dict[book1["book_id"]] = temp_dict

In [23]:
small_data[0]["user/rating"].values()

[7]

In [18]:
np.save("cooccurrence dict.npy", master_dict)

# Recommending books via cooccurence matrix

In [1]:
import graphlab as gl
import numpy as np

A newer version of GraphLab Create (v2.1) is available! Your current version is v2.0.1.
You can use pip to upgrade the graphlab-create package. For more information see https://turi.com/products/create/upgrade.


In [2]:
co_dict = np.load("cooccurrence dict.npy").item()

In [4]:
key_list = co_dict.keys() 
value_list = co_dict.values()

In [5]:
arr1 = gl.SArray(key_list)
arr2 = gl.SArray(value_list)

This non-commercial license of GraphLab Create for academic use is assigned to rahultyagirt7@outlook.com and will expire on June 03, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.0.1 started. Logging: /tmp/graphlab_server_1470116190.log


In [15]:
matrix = gl.SFrame({"book1": arr1, "common": arr2})

In [16]:
matrix.head(3)

book1,common
756401836,"{'0843953055': 0.25, '0440241871': 0.25, ..."
8804522224,"{'0891909540': 0.3333333333333333, ..."
205131999,{'0888995903': 0.5}


In [17]:
matrix = matrix.stack("common", new_column_name=["book2", "similarity"])

In [20]:
rating_data = gl.load_sframe("explicit_rating_data/")

In [23]:
rating_data = rating_data[rating_data["ratings"] > 5]
rating_data.materialize()

In [24]:
rating_data

user_id,book_id,ratings
276729,0521795028,6
276736,3257224281,8
276737,0600570967,6
276744,038550120X,7
276745,342310538,10
276747,0060517794,9
276747,0671537458,9
276747,0679776818,8
276747,0943066433,7
276747,1885408226,7


In [29]:
unique_user = rating_data[rating_data["user_id"] == 276747]
unique_user.materialize()

In [30]:
unique_user

user_id,book_id,ratings
276747,60517794,9
276747,671537458,9
276747,679776818,8
276747,943066433,7
276747,1885408226,7


In [31]:
bought_books = list(unique_user["book_id"])

In [32]:
bought_books

['0060517794', '0671537458', '0679776818', '0943066433', '1885408226']

## Idea of unique user from rating data failed, see if rating dictionary works

In [3]:
import graphlab as gl
import numpy as np
from operator import itemgetter

In [4]:
rating = np.load("rating_dictionary.npy").item()
cooccur = np.load("cooccurrence dict.npy").item()

In [15]:
"""
Using co_dict rather than matrix SFrame (constructed using co_dict), this will make computation much more efficient 
score list store keys in the corpus and scores on the basis for reading history of user

Our cooccurrence dictionary is really sparse (5% of original data) hence we are only able to find recommendation
just for 15 users out of 100 users(for which we tried to compute recommendation).
To increase the number of users which get recommendations, cooccur dictionary must be computed for other 95% data

This function will loops over all the users present in rating dictionary and will SKIP those user for which no 
similar movies are found.

n-> denotes the maximum number of books to be recommended to a user
"""
def co_recommender(rating_dict, co_dict, userId=None, n=5):
    recom_books = {}
    
    # Rating dictionary stores user as keys and another dictionary as values
    # containing (book/corresponding ratings give by user) as key/value pair
    if userId in rating_dict.keys():
        user_rating = rating_dict[userId]
        score = []
        flag = 0
    
        # co_dict contains book_ids as keys and another dict as values containing
        # book_ids and normalized similarity between those books(as key/value pair)
        # Loop over all the books in the inventory
        for bookId,book_sim in co_dict.items():
            temp = 0
            
            # Loop over all the previouly rated book by a user and add the similarity b/w 
            # current book and EACH of the previously rated book.
            # Compute final score by dividing total number of books user has already rated
            for prev_rated in user_rating.keys(): 
                if prev_rated in book_sim.keys():
                    temp += book_sim[prev_rated]
                    
            if temp != 0:
                # To NORMALIZE score, divide score by total number of previouly rated books 
                temp /= len(user_rating)
                flag = 1
                score.append((bookId, temp))
        score = sorted(score, key=itemgetter(1), reverse=True)[0:n]
    
        if flag == 1:
            recom_books.setdefault(userId, 0)
            recom_books[userId] = score
    return recom_books[userId]
    

In [64]:
recom = co_recommender(rating, cooccur, userId="103541")

In [65]:
recom

[('3453096428', 0.007745726495726496),
 ('3453109546', 0.007745726495726496),
 ('3453126777', 0.004084967320261438),
 ('0552146161', 0.0022153092006033186),
 ('3499135566', 0.001736111111111111)]

In [None]:
book_data = gl.SFrame("./csv_files/BX-Books.csv")

In [57]:
book_data = book_data["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"]
book_data.rename({"ISBN":"book_id", "Book-Title":"title", "Book-Author":"author", "Year-Of-Publication":"year",
                      "Publisher":"publisher"})

book_id,title,author,year,publisher
195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
374157065,Flu: The Story of the Great Influenza Pandemic ...,Gina Bari Kolata,1999,Farrar Straus Giroux
393045218,The Mummies of Urumchi,E. J. W. Barber,1999,"W. W. Norton &amp, Company ..."
399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
425176428,What If?: The World's Foremost Military ...,Robert Cowley,2000,Berkley Publishing Group
671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
679425608,Under the Black Flag: The Romance and the Reality ...,David Cordingly,1996,Random House
074322678X,Where You'll Find Me: And Other Stories ...,Ann Beattie,2002,Scribner


In [66]:
total_list_books = []
total_list_ids = []
if recom:
    for item in recom:
        bookId = item[0]
        if bookId in book_data["book_id"]:
            book_info = book_data[book_data["book_id"] == bookId][0]
            total_list_ids.append(book_info["book_id"])
            del(book_info["book_id"])
            total_list_books.append(book_info)

In [67]:
total_list_books

[{'author': 'Terry Pratchett',
  'publisher': 'Heyne',
  'title': 'Ein gutes Omen.',
  'year': 1997},
 {'author': 'Toni Morrison',
  'publisher': "Distribooks Int'l+inc",
  'title': 'Lazz',
  'year': 0}]