In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from nxviz.plots import ArcPlot,CircosPlot,MatrixPlot
import re
import datetime as dt

import matplotlib.pyplot as plt

#import plotly for interactive chart
#import plotly.plotly as py
#from plotly.graph_objs import *

from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
%matplotlib inline

<h2>Load books into dataframe</h2>

In [2]:
d = defaultdict(list)

strtofind = r'";"'
with open('../data/raw/BX-Books.csv','r',encoding='8859') as file:
    for i,line in enumerate(file):
        d[i] = re.sub(strtofind,'||',line.replace('&amp;','&')).replace('"','').replace('\n','').split('||')
        
books_df = pd.DataFrame(data=list(d.values())[1:],index=list(d.keys())[1:],columns=d[0])
books_df.head()

del d

<H2> Load Users Into DF </H2>

In [3]:
## Load users file and display first 5 rows
users_df = pd.read_csv('../data/raw/BX-Users.csv',sep=';',encoding='8859')
users_df.tail(5)

Unnamed: 0,User-ID,Location,Age
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",
278857,278858,"dublin, n/a, ireland",


<h2>Load User Ratings Of Books</h2>

In [4]:
ratings_df = pd.read_csv('../data/raw/BX-Book-Ratings.csv',sep=';',encoding='8859',dtype={'Book-Rating':np.int}).sort_values('User-ID')
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
9561,2,195153448,0
9562,7,34542252,0
9572,8,771025661,0
9580,8,1881320189,7
9579,8,1575663937,6


In [5]:
print('Unique users with reviews: {}'.format(len(ratings_df['User-ID'].unique())))
print('Unique books with reviews: {}\n'.format(len(ratings_df['ISBN'].unique())))
print('Total number of nodes: {}'.format(len(ratings_df['User-ID'].unique())+len(ratings_df['ISBN'].unique())))
print('Total number of edges: {}'.format(len(ratings_df)))

Unique users with reviews: 105283
Unique books with reviews: 340556

Total number of nodes: 445839
Total number of edges: 1149780


<h2>Users from different parts of the world can have very different book tastes (also books in differnet languages). In an attempt to bring the complexity of this graph down, I'll filter for users in the USA</h2>

In [6]:
usa_users = users_df[(users_df['Location'].str.contains('united states')) |(users_df['Location'].str.contains('usa'))]
ratings_usa = usa_users.set_index('User-ID').join(ratings_df.set_index('User-ID'),how='inner').reset_index()
ratings_usa.head()

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
0,2,"stockton, california, usa",18.0,195153448,0
1,7,"washington, dc, usa",,34542252,0
2,9,"germantown, tennessee, usa",,609804618,0
3,9,"germantown, tennessee, usa",,440234743,0
4,9,"germantown, tennessee, usa",,452264464,6


In [8]:
def training_test_df(df,test_size=.2,random_state=None):
    '''Split a dataframe into two data frames split by sample_size
       Input: df - pandas dataframe object 
              test_size - size of training set. Must be between 0 and 1'''
    assert type(df) is pd.DataFrame, "Argument 1 must be a dataframe"
    assert test_size >= 0 and test_size <=1, "test size must be between 0 and 1"
    training_mask = ~df.index.isin(df.sample(frac=test_size,random_state=random_state).index)
    test_mask = df.index.isin(df.sample(frac=test_size,random_state=random_state).index)
    
    training_df = df[training_mask]
    test_df = df[test_mask]
    
    return training_df,test_df
    
ratings_us_training, ratings_us_test = training_test_df(ratings_usa)

In [9]:
#Init Graph
G = nx.Graph()

#Add book nodes. Since some ratings contain books not stored in the books data frame, we'll update the set of books 
#from the books dataframe with the unique list of books in the ratings df.
#books = set(books_df['ISBN'])
books = set(ratings_us_training['ISBN'].unique())
G.add_nodes_from(books,bipartite='books')

#Add user nodes
#users = set(users_df['User-ID'])
users = set(ratings_us_training['User-ID'].unique())
G.add_nodes_from(users,bipartite='users')
for i,row in ratings_us_training.iterrows():
    G.node[row['User-ID']]['Age']=row['Age']

#Add User-book
for i, row in ratings_us_training.iterrows():
    G.add_edge(row[0],row[3],rating=row[4])

In [11]:
# Create lists of user nodes and book nodes from graph
nodes_from_attr = lambda G,attr,attr_value: [n for n,d in G.nodes(data=True) if d[attr] == attr_value]
users = nodes_from_attr(G,'bipartite','users')
books = nodes_from_attr(G,'bipartite','books')

In [12]:
#user projection graph
G_user = nx.bipartite.projected_graph(G,users)
#G_books = nx.bipartite.projected_graph(G,books)

In [49]:
#User projection graph weighted by number of neighbors (books) they share 
G_weighted_user = nx.bipartite.weighted_projected_graph(G,users)

In [70]:
weights = defaultdict(int)
for i,e in enumerate(G_weighted_user.edges(data=True)):
    weights[e[2]['weight']] +=1

In [79]:
print('Possible edges if all users connected to all users {}'.format(len(G_weighted_user.nodes())*len(G_weighted_user.nodes())))
print('Edges between users with books in common: {}'.format(len(G_weighted_user.edges())))
print('Edges between users with only 1 book in common: {}'.format(weights[1]))

Possible edges if all users connected to all users 2795871376
Edges between users with books in common: 6831609
Edges between users with only 1 book in common: 5739131


In [77]:
len(users)*len(users)

2795871376

In [43]:
#Get the bipartite degree centrality of the users and store that as an attribute on their nodes
dcs = nx.bipartite.degree_centrality(G,users)
for n in G.nodes():
    G.node[n]['bipartite_dcs']= dcs[n]

In [None]:
## Function to add cosine similarity of users to Graph

def add_cosine_similarity(G,proj_G,row_order,column_order=None):
    
    ## Create a biadjacency matrix using graph, row order(list of nodes), and column_order (list of nodes)
    biadjacency_matrix = nx.bipartite.biadjacency_matrix(G,row_order=row_order,column_order=column_order)
    
    ## Convert biadjacency_matrix to a data frame
    df = pd.DataFrame(biadjacency_matrix.toarray(),index=row_order,columns=column_order)

In [None]:
## Select a user and get the cosine similarity score between selected user and all user edges
from sklearn.metrics.pairwise import cosine_similarity
usr1 = 558
for i,usr2 in enumerate(G_user[usr1]):
    G_user[usr1][usr2]['cosine_similarity'] = cosine_similarity(df.loc[usr1].values.reshape(1,-1),df.loc[usr2].values.reshape(1,-1))

In [None]:
# Get a list of users and cosine score for selected user
user_cosines = np.empty((len(G_user.edges(usr1)),2),dtype=np.object)
for i,(u,v) in enumerate(G_user.edges(usr1)):
    user_cosines[i][0] = v
    user_cosines[i][1] = G_user.get_edge_data(u,v)['cosine_similarity'][0][0]

#Top 10 cosine Similarities
top_10_users = user_cosines[user_cosines[:,1].argsort()][-20:,0]
top_10_users

In [None]:
bks_dif = defaultdict(lambda: defaultdict(float))
for usr in top_10_users:
    for bk in set(G.neighbors(usr)).difference(set(G.neighbors(usr1))):
        bks_dif[bk]['count'] +=1
        bks_dif[bk]['implicit'] += 1 if G[usr][bk]['rating'] == 0 else 0
        bks_dif[bk]['explicit'] += 1 if G[usr][bk]['rating'] > 0 else 0
        bks_dif[bk]['rating'] += G[usr][bk]['rating']
        bks_dif[bk]['avg_rating'] = bks_dif[bk]['rating']/bks_dif[bk]['count']
        try:
            bks_dif[bk]['avg_explicit_rating'] = bks_dif[bk]['rating']/bks_dif[bk]['explicit']
        except:
            bks_dif[bk]['avg_explicit_rating'] = 0

pd.DataFrame(bks_dif).T.sort_values('count',ascending=False).join(books_df.set_index('ISBN')).iloc[:20].sort_values('avg_rating',ascending=False)