## Amazon Book Recommendation

#### Table of Contents

* Introduction
* Data Imported and Formatting
* Data Exploration
* Graph Processing and Analysis

### Imports

In [32]:
import cudf
import cugraph
import numpy as np

import pandas as pd
import string
import nvstrings, nvtext
try:
    import nltk
except ModuleNotFoundError:
    os.system('pip install nltk')
    import nltk
    nltk.download('stopwords')

from collections import OrderedDict

### Introduction

Dataset is the processed version of Amazon Product co-purchasing network metadata taken from SNAP http://snap.stanford.edu/data/amazon-meta.html. 
The original dataset includes about 548,552 different products (Books, music CDs, DVDs, and VHS video tapes)
The dataset used below includes only Book.

### Load and Explore Dataset

In [33]:
dataset_path = '../data/amazon/books/amazon-books.csv'

In [34]:
%%time
gdf = cudf.DataFrame()
gdf = cudf.read_csv(dataset_path)

CPU times: user 128 ms, sys: 132 ms, total: 261 ms
Wall time: 264 ms


In [35]:
%%time
gdf.head().to_pandas()

CPU times: user 22 ms, sys: 779 µs, total: 22.8 ms
Wall time: 21.5 ms


Unnamed: 0,Id,ASIN,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating
0,1,827229534,Patterns of Preaching: A Sermon Sampler,subjects religion preaching clergy spiritualit...,Book,0804215715 156101074X 0687023955 0687074231 08...,396585,2,2.0
1,2,738700797,Candlemas: Feast of Flames,subjects witchcraft earth religion based spiri...,Book,0738700827 1567184960 1567182836 0738700525 07...,168596,12,12.0
2,3,486287785,World War II Allied Fighter Planes Trading Cards,general subjects hobbies home garden crafts books,Book,,1270652,1,1.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,subjects life bibles christian general history...,Book,0842328130 0842330313 0842328610 0842328572,631289,1,1.0
4,5,1577943082,Prayers That Avail Much for Business: Executive,subjects religion prayerbooks devotion worship...,Book,157794349X 0892749504 1577941829 0892749563,455160,0,0.0


#### Explore Books

Dataset contains 393535 book titles.

In [36]:
%%time
gdf.shape

CPU times: user 8 µs, sys: 4 µs, total: 12 µs
Wall time: 16.2 µs


(392966, 9)

In [37]:
%%time
gdf.ASIN.unique().shape[0]

CPU times: user 4.6 ms, sys: 24.3 ms, total: 28.9 ms
Wall time: 27.6 ms


392966

In [38]:
%%time
query = gdf[gdf.ASIN == "1577943082"]
print(query)

   Id        ASIN                                            Title                                                                                                Categories  Group                                  Copurchased  SalesRank ...  AvgRating
4   5  1577943082  Prayers That Avail Much for Business: Executive  subjects religion prayerbooks devotion worship business spirituality living books christianity christian   Book  157794349X 0892749504 1577941829 0892749563     455160 ...        0.0
[1 more columns]
CPU times: user 166 ms, sys: 125 ms, total: 291 ms
Wall time: 296 ms


#### Preprocessing

In [39]:
# convert cudf to pandas since the APIs needed to split (explode) a column into multiple rows is not supported yet
pd_df = gdf.to_pandas()

In [40]:
pd_df.Copurchased = pd_df.Copurchased.fillna('').astype(str)

In [41]:
pd_df.head(6)

Unnamed: 0,Id,ASIN,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating
0,1,827229534,Patterns of Preaching: A Sermon Sampler,subjects religion preaching clergy spiritualit...,Book,0804215715 156101074X 0687023955 0687074231 08...,396585,2,2.0
1,2,738700797,Candlemas: Feast of Flames,subjects witchcraft earth religion based spiri...,Book,0738700827 1567184960 1567182836 0738700525 07...,168596,12,12.0
2,3,486287785,World War II Allied Fighter Planes Trading Cards,general subjects hobbies home garden crafts books,Book,,1270652,1,1.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,subjects life bibles christian general history...,Book,0842328130 0842330313 0842328610 0842328572,631289,1,1.0
4,5,1577943082,Prayers That Avail Much for Business: Executive,subjects religion prayerbooks devotion worship...,Book,157794349X 0892749504 1577941829 0892749563,455160,0,0.0
5,6,486220125,How the Other Half Lives: Studies Among the Te...,general social subjects history jewish nonfict...,Book,0486401960 0452283612 0486229076 0714840343,188784,17,17.0


In [42]:
# create new dataframe from the series with ASIN as the index
new_pd_df = pd.DataFrame(pd_df.Copurchased.str.split(' ').tolist(), index=pd_df.ASIN).stack()

In [43]:
new_pd_df.head(6)

ASIN         
0827229534  0    0804215715
            1    156101074X
            2    0687023955
            3    0687074231
            4    082721619X
0738700797  0    0738700827
dtype: object

In [44]:
# get rid of secondary index
# make ASIN as a column (it can't be an index since the values will be duplicate)
new_pd_df = new_pd_df.reset_index([0, 'ASIN'])

In [45]:
# to save memory, select only the columns we need for our graph
# ASIN will become our nodes
new_pd_df.columns = ['ASIN', 'Copurchase_ASIN']

In [46]:
new_pd_df.head(10)

Unnamed: 0,ASIN,Copurchase_ASIN
0,827229534,0804215715
1,827229534,156101074X
2,827229534,0687023955
3,827229534,0687074231
4,827229534,082721619X
5,738700797,0738700827
6,738700797,1567184960
7,738700797,1567182836
8,738700797,0738700525
9,738700797,0738700940


In [47]:
%%time
sorted_pd_df = new_pd_df.sort_values(by=['ASIN'])

CPU times: user 1.59 s, sys: 1.26 ms, total: 1.59 s
Wall time: 1.59 s


In [48]:
sorted_pd_df

Unnamed: 0,ASIN,Copurchase_ASIN
700446,0000037931,
511570,0001047655,0061007358
511569,0001047655,0061007129
511571,0001047655,0061007137
511572,0001047655,0061099341
511573,0001047655,0061007161
886962,0001053388,
596758,0001053736,0345336062
596759,0001053736,0140380531
596757,0001053736,0440905605


Construct Book Graph

In [49]:
%%time
graph_gdf = cudf.from_pandas(sorted_pd_df)

CPU times: user 304 ms, sys: 51.9 ms, total: 356 ms
Wall time: 355 ms


In [50]:
graph_gdf

<cudf.DataFrame ncols=2 nrows=1037401 >

In [51]:
graph_gdf.dtypes

ASIN               object
Copurchase_ASIN    object
dtype: object

#### Graph Processing

convert to edge list format

In [108]:
#graph_gdf.ASIN = graph_gdf.ASIN.astype('int32')
#graph_gdf.Copurchase_ASIN = graph_gdf.Copurchase_ASIN.astype('int32')

In [109]:
graph_gdf.ASIN.shape[0]

1037401

In [110]:
graph_gdf.Copurchase_ASIN.shape[0]

1037401

In [129]:
#src_edges = cudf.concat([graph_gdf.ASIN]).reset_index()['ASIN']
#dest_edges = cudf.concat([graph_gdf.Copurchase_ASIN]).reset_index()['Copurchase_ASIN']
#weights = cudf.concat([cudf.Series(np.ones(graph_gdf.ASIN.shape[0]))])


In [130]:
#new_graph_gdf = cudf.DataFrame()
#new_graph_gdf["src"] = src_edges
#new_graph_gdf["dest"] = dest_edges
#new_graph_gdf["weights"] = weights

#new_graph_gdf.head(10).to_pandas()

In [131]:
%%time
graph_gdf["src_id"], graph_gdf["dest_id"], numbering = cugraph.Graph().renumber(graph_gdf.ASIN, graph_gdf.Copurchase_ASIN)

CPU times: user 5.42 ms, sys: 4.48 ms, total: 9.9 ms
Wall time: 9.76 ms


In [132]:
graph_gdf.to_pandas().sort_values(by=['src_id'])

Unnamed: 0,ASIN,Copurchase_ASIN,src_id,dest_id
208371,007011496X,,1,0
234217,0060621591,0520222288,2,175960
234216,0060621591,0060556102,2,1768
300315,0062505521,1586210467,3,332369
300316,0062505521,0451527534,3,328301
300314,0062505521,0446676500,3,234316
300312,0062505521,0800634497,3,249409
300313,0062505521,0060646918,3,35350
1026993,076790625X,0743212754,4,111887
1026994,076790625X,0385231261,4,15976


Create a Graph

In [134]:
graph = cugraph.Graph()
graph.add_edge_list(graph_gdf["src_id"], graph_gdf["dest_id"])

In [135]:
graph.degree()

<cudf.DataFrame ncols=2 nrows=392970 >

In [136]:
graph.number_of_vertices()

392970

In [152]:
# Call cugraph.pagerank to get the pagerank scores
cu_count = cugraph.triangles(graph)

In [153]:
cu_count

610705

In [137]:
%time jac_df = cugraph.jaccard(graph)

CPU times: user 8.43 ms, sys: 4.16 ms, total: 12.6 ms
Wall time: 11.4 ms


In [138]:
jac_df.sort_values("source", ascending=False).head(25).to_pandas()

Unnamed: 0,source,destination,jaccard_coeff
1037400,392969,0,0.0
1037399,392968,0,0.0
1037395,392967,11359,0.166667
1037396,392967,23190,0.0
1037397,392967,354285,0.166667
1037398,392967,392952,0.0
1037394,392966,36354,0.0
1037393,392965,0,0.0
1037392,392964,0,0.0
1037389,392963,148036,0.0


In [140]:
jac_df.sort_values("jaccard_coeff", ascending=False).head(25).to_pandas()

Unnamed: 0,source,destination,jaccard_coeff
936,364,1105,0.8
937,364,40906,0.8
938,364,41662,0.8
939,364,159142,0.8
940,364,160327,0.8
1689,650,228009,0.8
1879,725,48138,0.8
1880,725,75056,0.8
1882,725,241792,0.8
1980,763,105414,0.8


In [87]:
# define a function for printing the top most similar vertices
def print_most_similar_jaccard(df):
    
    jmax = df['jaccard_coeff'].max()
    dm = df.query('jaccard_coeff >= @jmax')    
    
    #find the best
    for i in range(len(dm)):    
        print("Vertices " + str(dm['source'][i]) + " and " + 
              str(dm['destination'][i]) + " are most similar with score: " 
              + str(dm['jaccard_coeff'][i]))
    del jmax
    del dm

# define a function for printing the top most similar vertices
def print_most_similar_overlap(df):
    
    smax = df['overlap_coeff'].max()
    dm = df.query('overlap_coeff >= @smax')      
    
    for i in range(len(dm)):
        print("Vertices " + str(dm['source'][i]) + " and " + 
          str(dm['destination'][i]) + " are most similar with score: " 
          + str(dm['overlap_coeff'][i]))
        
    del smax
    del dm


In [None]:
print_most_similar_jaccard(jac_df)

In [146]:
# numbering map that maps the new source ids to the original ASIN.
ASIN = numbering[1480]
ASIN

914525182

In [151]:
print(graph_gdf[graph_gdf.source==364])

AttributeError: 'DataFrame' object has no attribute 'source'