In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import re
import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt

'''#import plotly for interactive chart
import plotly.plotly as py
import plotly
plotly.tools.set_credentials_file(username='richwolff', api_key='v0qPC120X33yPvAMDQXi')
from plotly.graph_objs import * '''

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from collections import defaultdict
%matplotlib inline

<h2>Load books into dataframe</h2>

In [2]:
d = defaultdict(list)

strtofind = r'";"'
with open('../data/raw/BX-Books.csv','r',encoding='8859') as file:
    for i,line in enumerate(file):
        d[i] = re.sub(strtofind,'||',line.replace('&amp;','&')).replace('"','').replace('\n','').split('||')
        
books_df = pd.DataFrame(data=list(d.values())[1:],index=list(d.keys())[1:],columns=d[0])
books_df.head()

del d

<H2> Load Users Into DF </H2>

In [3]:
## Load users file and display first 5 rows
users_df = pd.read_csv('../data/raw/BX-Users.csv',sep=';',encoding='8859')
users_df.tail(5)

Unnamed: 0,User-ID,Location,Age
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",
278857,278858,"dublin, n/a, ireland",


<h2>Load User Ratings Of Books</h2>

In [4]:
ratings_df = pd.read_csv('../data/raw/BX-Book-Ratings.csv',sep=';',encoding='8859',dtype={'Book-Rating':np.int}).sort_values('User-ID')
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
9561,2,195153448,0
9562,7,34542252,0
9572,8,771025661,0
9580,8,1881320189,7
9579,8,1575663937,6


In [5]:
## Join user and book data to ratings data
ratings = ratings_df.set_index('User-ID').join(users_df.set_index('User-ID')).reset_index().set_index('ISBN').join(books_df.set_index('ISBN'))

## Split out users from the USA
us_ratings = ratings[(ratings['Location'].str.lower().str.contains('usa')) | (ratings['Location'].str.lower().str.contains('states'))].reset_index()
us_ratings.head()

Unnamed: 0,ISBN,User-ID,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0375404120,266865,0,"reston, virginia, usa",33.0,,,,,,,
1,)440206529,238681,0,"milford, ohio, usa",,,,,,,,
2,)452273056,111422,8,"avon, massachusetts, usa",59.0,,,,,,,
3,*0515128325,190925,0,"hobe sound, florida, usa",51.0,,,,,,,
4,/8741060773,52796,9,"sumner, iowa, usa",0.0,,,,,,,


<h2>Split data into training and test sets</h2>

In [6]:
ratings_us_training,ratings_us_test = train_test_split(us_ratings,test_size=.20,random_state=21)

<h2>Insert data into graph data structure</h2>

In [7]:
# Build the graph structure
G = nx.from_pandas_edgelist(ratings_us_training,'User-ID','ISBN',['Book-Rating'])

# Add Meta Data
for i,row in ratings_us_training.iterrows():
    user_node = G.node[row['User-ID']]
    book_node = G.node[row['ISBN']]
    
    user_node['Age'] = row['Age']
    user_node['Location'] = row['Location']
    user_node['bipartite'] = 'user'
    book_node['bipartite'] = 'book'
    book_node['Book-Title'] = row['Book-Title']
    book_node['Book-Author'] = row['Book-Author']
    book_node['Publisher'] = row['Publisher']
    book_node['Publication_Year'] = row['Year-Of-Publication']

#Add user nodes
user_nodes = set(ratings_us_training['User-ID'].unique())
book_nodes = set(ratings_us_training['ISBN'].unique())

<h2> Add Degree Centrality To nodes </h2>

In [8]:
dcs = nx.bipartite.degree_centrality(G,user_nodes)
for key,value in dcs.items():
    G.node[key]['dcs'] = value

<h2>Create a user and book biadjacency matrix with users as rows and books as columns </h2>

In [9]:
#using the rating as weight to add weight to explicit reviews to similarity scores
user_arr = np.array(list(user_nodes))
books_arr = np.array(list(book_nodes))
user_adj_matrix = nx.bipartite.biadjacency_matrix(G,row_order=user_nodes,column_order=book_nodes,weight='rating')
book_adj_matrix = user_adj_matrix.T

<h2>Create a user x user matrix with the cosine similarities as their intersection value</h2>

In [10]:
# Take cosine similarities of users based on ratings they've given each book (column)
user_sims = cosine_similarity(user_adj_matrix,dense_output=False)
user_sims.setdiag(0)
user_sims_coo = user_sims.tocoo()

<h2>Create a book x book matrix with cosine similarities as their intersection values</h2>

In [11]:
book_sims = cosine_similarity(book_adj_matrix,dense_output=False)
book_sims.setdiag(0)
book_sims_coo = book_sims.tocoo()

In [None]:
def bk_mtx_ind(coo_mtx):
    '''Loop through coordinate matrix rows and store the idx location of values in a dictionary
       dict[row].append(idx) 
       
       This will allow for fast lookups later vs looping through a 300MM list thousands of times later
       
       Move complexity from O(n^2) to O(N) (iterate matrix rows once, N every lookup is O(1) in python dicts)
    '''  
    bk_mtx_lkup = defaultdict(list)
    
    for i,bk in enumerate(coo_mtx.row):
        bk_mtx_lkup[bk].append(i)
        
    return bk_mtx_lkup

bk_mtx_lkup = bk_mtx_ind(coo_mtx=book_sims_coo)

In [12]:
def collaborative_filter(selected_user,node_list,user_matrix,top_n_similarities):
    
    def node_similiarities(node, node_list, matrix):
        '''Creates a numpy array of node similiarities (user or books)'''
        indices = np.where(matrix.row == np.where(node_list==node)[0])[0]
        matrix_sims_node = []
        matrix_sims_score = []
        nodes_sim = defaultdict(list)
        for idx in indices:
            cos_sim = (matrix.data[idx])
            nodes_sim[cos_sim].append(node_list[matrix.col[idx]])
        return nodes_sim
    
    def user_neighbor_books(selected_user, user_similarity_dict,top_n_similarities):
        '''accepts a 2d array with users in the first column and similarities in the 2nd
           returns top 10 books with scores'''
        books = defaultdict(lambda: defaultdict(float))
        for key in sorted(user_similarity_dict.keys(),reverse=True)[:top_n_similarities]:
            for usr_lookup in user_similarity_dict[key]:
                for bk in set(G.neighbors(usr_lookup)).difference(G.neighbors(selected_user)):
                    book_rating = G[usr_lookup][bk]['Book-Rating']
                    books[bk]['count'] += 1
                    books[bk]['cosine'] += key
                    books[bk]['rating'] += book_rating
                    books[bk]['implicit_ratings'] += 1 if book_rating == 0 else 0
                    books[bk]['explicit_ratings'] += 1 if book_rating > 0 else 0
                    books[bk]['avg_cosine'] = books[bk]['cosine']/books[bk]['count']  
                    books[bk]['avg_rating'] = books[bk]['rating']/books[bk]['count']
                    if books[bk]['explicit_ratings'] > 0:
                        books[bk]['avg_explicit_rating'] = books[bk]['rating']/books[bk]['explicit_ratings']
        
        return books
    
    def books_dict_to_df(books_list):
        books_list = [(b,
               d['avg_rating'],
               d['avg_explicit_rating'],        
               d['avg_cosine'],
               d['count'],
               d['cosine'],
               d['rating'],
               d['implicit_ratings']) for b,d in zip(user_books_df.keys(),user_books_df.values())]
        df_columns = ['book','avg_rating','avg_explicit_rating','avg_cosine','user_count','cosines','ratings','implicit_ratings']
        return pd.DataFrame(books_list,columns=df_columns)


    user_sims_nodes = node_similiarities(selected_user,user_arr,user_sims_coo)
    
    user_books_df = user_neighbor_books(selected_user=selected_user,
                                        user_similarity_dict=user_sims_nodes,
                                        top_n_similarities=top_n_similarities)
    
    return books_dict_to_df(user_books_df).set_index('book')

<h2>Test 1 user</h2>

In [13]:
selected_user = 69697
recommended_books = collaborative_filter(selected_user,user_arr,user_sims_coo,100)

In [14]:
# Pull actual purchases
sel_user_actual = ratings_us_test[ratings_us_test['User-ID']==selected_user]['ISBN'].to_frame()
sel_user_actual['read'] = 1

In [15]:
# TOP N Recommendation based on cosines
n_recommendations = 10
reco_test = recommended_books.sort_values(['cosines','avg_explicit_rating'],ascending=False).head(n_recommendations).join(sel_user_actual.set_index('ISBN'))
reco_test


Unnamed: 0_level_0,avg_rating,avg_explicit_rating,avg_cosine,user_count,cosines,ratings,implicit_ratings,read
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
515131229,2.84375,8.272727,0.073082,32.0,2.338618,91.0,21.0,1.0
515128554,4.821429,8.4375,0.07675,28.0,2.149009,135.0,12.0,1.0
515124214,3.041667,7.3,0.079607,24.0,1.910558,73.0,14.0,
515130389,0.916667,7.333333,0.076691,24.0,1.840573,22.0,21.0,
515127833,2.458333,7.375,0.076458,24.0,1.834995,59.0,16.0,
515121843,2.782609,8.0,0.078686,23.0,1.809771,64.0,15.0,
425178579,2.761905,8.285714,0.084919,21.0,1.783294,58.0,14.0,1.0
316666343,4.125,9.0,0.073985,24.0,1.775636,99.0,13.0,
373243790,1.714286,7.2,0.083506,21.0,1.753624,36.0,16.0,
553265741,2.05,6.833333,0.085245,20.0,1.704899,41.0,14.0,


<h2>Bring back scores for multiple users in test file</h2>

In [None]:
top_n_cosines = 50 
n_recommendations = 15
test_ratings_user_count = 50

test_ratings_user_valcount = ratings_us_test['User-ID'].value_counts()
filtered_test_ratings = test_ratings_user_valcount[test_ratings_user_valcount>test_ratings_user_count]
users_to_test = filtered_test_ratings.index.values

metrics = np.empty(len(users_to_test))

for i,sel_user in enumerate(users_to_test):
    recommended_books = collaborative_filter(sel_user,user_arr,user_sims_coo,top_n_cosines)
    sel_user_actual = ratings_us_test[ratings_us_test['User-ID']==sel_user]['ISBN'].to_frame()
    sel_user_actual['read'] = 1
    reco_test = recommended_books.sort_values(['cosines','avg_rating'],ascending=False).head(n_recommendations).join(sel_user_actual.set_index('ISBN'))
    metrics[i] = np.sum(reco_test['read'])/n_recommendations

In [None]:
x = sorted(metrics)
n = len(x)
y = np.arange(1, n+1)/n

plt.plot(x,y);
plt.title('Median: {}'.format(np.median(metrics)));

In [16]:
arr1 = np.array([12,56,77,3])
arr1

array([12, 56, 77,  3])

In [17]:
arr2 = np.array([11,2,4,3,5,12])
arr2

array([11,  2,  4,  3,  5, 12])

In [26]:
np.i(arr1,arr2,assume_unique=True)

array([ 3, 12])

In [30]:
np.where(np.isin(arr1,arr2))

((array([0, 3]),), (array([0]),))

In [23]:
np.where(arr112)

(array([0]),)

<H1>DONT DELETE AFTER THIS. FOR BOOK SIMILARITIES</H1>

In [59]:
def book_collab_filter(selected_user,node_list,book_matrix,book_matrix_lookup,top_n_similarities):
    
    def node_similiarities(nodes, node_list, matrix,book_matrix_lookup): #### O(n^2), can i reduce this?
        '''Creates a numpy array of node similiarities (user or books)'''
        nodes_sim = defaultdict(list)
        
        # find the positions of all the books read by the user
        book_positions = np.where(np.isin(node_list,nodes))[0]
        
        #For each data point from node, append value book to dictionary key cosine similarity
        for i,pos in enumerate(book_position):
            for idx in book_matrix_lookup[pos]:
                bk = node_list[matrix.col[idx]]
                if not bk in nodes:
                    nodes_sim[matrix.data[idx]].append(bk)
                
        return nodes_sim
    
    def user_neighbor_books(book_similarity_dict, top_n_similarities):
        '''accepts a 2d array with users in the first column and similarities in the 2nd
           returns top 10 books with scores'''
        books = defaultdict(lambda: defaultdict(float))
        for key in sorted(book_similarity_dict.keys(),reverse=True)[:top_n_similarities]:
            for bk in book_similarity_dict[key]:
                books[bk]['count'] += 1
                books[bk]['cosine'] += key
                books[bk]['avg_cosine'] = books[bk]['cosine']/books[bk]['count']  
                books[bk]['avg_rating'] = books[bk]['rating']/books[bk]['count']
        return books
    
    def books_dict_to_df(books_list):
        books_list = [(b,
               d['avg_rating'],        
               d['avg_cosine'],
               d['count'],
               d['cosine']) for b,d in zip(book_books_df.keys(),book_books_df.values())]
        df_columns = ['book','avg_rating','avg_cosine','book_count','cosines']
        return pd.DataFrame(books_list,columns=df_columns)
    
    ## return top cosine similarities books based on books user read
    
    ## For books user read
        # Get books with high cosine similarities to each book
        # count and store in dict/data frame

    ### GET BOOK LIST
    dfa = defaultdict(list)
    tst = set()
    for x in list(G.neighbors(selected_user)):
        dfa[G[selected_user][x]['Book-Rating']].append(x)  
        
    top_5_keys = sorted(dfa.keys())[-5::]
    top_5_rated_books = []
    for x in top_5_keys:
        top_5_rated_books += dfa[x]
    
    
    book_sims_nodes = node_similiarities(np.array(top_5_rated_books),node_list,book_matrix,book_matrix_lookup)
    
    book_books_df = user_neighbor_books(book_similarity_dict=book_sims_nodes,
                                        top_n_similarities=top_n_similarities)
    
    return books_dict_to_df(book_books_df).set_index('book')

In [55]:
len(list(G.neighbors(selected_user)))

1534

In [56]:
selected_user = 69697
res = book_collab_filter(selected_user,books_arr,book_sims_coo,book_matrix_indices,20)

In [57]:
res[:10]

Unnamed: 0_level_0,avg_rating,avg_cosine,book_count,cosines
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0380787164,0.0,0.823069,245.0,201.651804
1551660628,0.0,0.711733,117.0,83.272731
037327212X,0.0,0.866025,1.0,0.866025
0821720589,0.0,0.766346,2.0,1.532692
0373763158,0.0,0.816497,1.0,0.816497
0373259832,0.0,0.743524,17.0,12.639903
0843951699,0.0,0.718686,3.0,2.156059
0515133663,0.0,0.718686,3.0,2.156059
0451209885,0.0,0.718686,3.0,2.156059
0451208838,0.0,0.718686,3.0,2.156059


In [58]:
res.sort_values('avg_cosine',ascending=False)[:100]

Unnamed: 0_level_0,avg_rating,avg_cosine,book_count,cosines
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
037327212X,0.0,0.866025,1.0,0.866025
0380787164,0.0,0.823069,245.0,201.651804
0373763158,0.0,0.816497,1.0,0.816497
0426204425,0.0,0.816497,1.0,0.816497
0195073541,0.0,0.816497,1.0,0.816497
1585790044,0.0,0.816497,1.0,0.816497
3822894532,0.0,0.816497,1.0,0.816497
0345339517,0.0,0.816497,1.0,0.816497
0571147267,0.0,0.816497,1.0,0.816497
0517527413,0.0,0.816497,1.0,0.816497


In [60]:
# Pull actual purchases
sel_user_actual = ratings_us_test[ratings_us_test['User-ID']==selected_user]['ISBN'].to_frame()
sel_user_actual['read'] = 1

In [66]:
# TOP N Recommendation based on cosines
n_recommendations = 20
reco_test = res.sort_values(['book_count','avg_cosine'],ascending=False).head(n_recommendations).join(sel_user_actual.set_index('ISBN'))
reco_test

Unnamed: 0_level_0,avg_rating,avg_cosine,book_count,cosines,read
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0380787164,0.0,0.823069,245.0,201.651804,
1551660628,0.0,0.711733,117.0,83.272731,
0373258755,0.0,0.707795,101.0,71.487262,
0373259832,0.0,0.743524,17.0,12.639903,
0821713280,0.0,0.707107,10.0,7.071068,
0821765272,0.0,0.707107,10.0,7.071068,
037381013X,0.0,0.707107,10.0,7.071068,
0373219490,0.0,0.707107,10.0,7.071068,
0373251211,0.0,0.707107,10.0,7.071068,
0843940735,0.0,0.707107,10.0,7.071068,


In [None]:
top_n_cosines = 50 
n_recommendations = 15
test_ratings_user_count = 50

test_ratings_user_valcount = ratings_us_test['User-ID'].value_counts()
filtered_test_ratings = test_ratings_user_valcount[test_ratings_user_valcount>test_ratings_user_count]
users_to_test = filtered_test_ratings.index.values

metrics = np.empty(len(users_to_test))

for i,sel_user in enumerate(users_to_test):
    recommended_books = collaborative_filter(sel_user,user_arr,user_sims_coo,top_n_cosines)
    sel_user_actual = ratings_us_test[ratings_us_test['User-ID']==sel_user]['ISBN'].to_frame()
    sel_user_actual['read'] = 1
    reco_test = recommended_books.sort_values(['cosines','avg_rating'],ascending=False).head(n_recommendations).join(sel_user_actual.set_index('ISBN'))
    metrics[i] = np.sum(reco_test['read'])/n_recommendations

In [None]:
def user_book_cosines(usr):
    read_books = list(G[usr])
    books = defaultdict(lambda: defaultdict(float))
    for bk in read_books:
        book_nodes,book_scores = node_similiarities(bk,books_arr,book_sims_coo)
        for book,score in zip(book_nodes,book_scores):
            books[book]['cosine_total'] +=score
            books[book]['count'] += 1
            books[book]['avg_cosine'] = books[book]['cosine_total']/books[book]['count']
        
    return books

book_list = user_book_cosines(selected_user)
book_list = [(b,
               d['cosine_total'],
               d['count'],
               d['avg_cosine']) for b,d in zip(book_list.keys(),book_list.values())]
book_book_df = pd.DataFrame(book_list,columns=['book','book_book_cosine','book_book_count','book_book_avgcosine'])
book_book_df.sort_values('book_book_count',ascending=False,inplace=True)
book_book_df.set_index('book',inplace=True)


In [None]:
list(G[selected_user])[0]

In [None]:
list(G.neighbors('034542252'))

In [None]:
len(list(G.neighbors(166409)))

In [None]:
user_scores[user_scores>np.percentile(user_scores,1)]

In [None]:
book_reco

<h2>Build books into dataframe</h2>

In [None]:
book_book_df.head()

In [None]:
tst = book_reco.join(book_book_df,how='outer')

np.percentile(tst['avg_user_cosines'].fillna(0),99.9) #book_book_avgcosine, #avg_user_cosines

In [None]:
sel_user_actual = ratings_us_test[ratings_us_test['User-ID']==selected_user]['ISBN'].to_frame()
sel_user_actual['read'] = 1
tst1 = tst.join(sel_user_actual.set_index('ISBN'),how='outer')
np.sum(tst1['read']>0)/len(tst1)

In [None]:

tst1['book_book_avgcosine'].hist()

In [None]:
tst1['avg_user_cosines'].hist()

In [None]:
ratings_df['User-ID'].value_counts()[ratings_df['User-ID'].value_counts()<3].sum()/len(ratings_df['User-ID'].unique())

In [None]:
ratings_us_test['User-ID'].unique()

In [None]:
book_reco_filtered = book_reco[book_reco['user_count'] >= 10]
book_reco_filtered['avg_user_cosine'] = book_reco_filtered['user_cosines']/book_reco_filtered['user_count']
top_10_books = set(book_reco_filtered.sort_values('avg_user_cosine',ascending=False).iloc[:10]['book'])
top_10_books.intersection(set(ratings_us_test[ratings_us_test['User-ID']==selected_user].sort_values('ISBN')['ISBN']))
#top_10_books