In [4]:
import tarfile
import ast
import pandas as pd

### Load data

In [5]:
with tarfile.open("lthing_data.tar.gz") as tar:
    print("Files in tar archive:")
    tar.list()

    with tar.extractfile("lthing_data/reviews.json") as file:
        count = 0
        for line in file:
            print(line)
            count += 1
            if count > 3:
                break

Files in tar archive:
?rwxr-xr-x julian/julian          0 2016-10-01 00:58:55 lthing_data/ 
?rw-r--r-- julian/julian    4824989 2014-01-02 20:55:12 lthing_data/edges.txt 
?rw-rw-r-- julian/julian 1604368260 2016-10-01 00:58:25 lthing_data/reviews.json 
b"{'work': '3206242', 'flags': [], 'unixtime': 1194393600, 'stars': 5.0, 'nhelpful': 0, 'time': 'Nov 7, 2007', 'comment': 'This a great book for young readers to be introduced to the world of Middle Earth. ', 'user': 'van_stef'}\n"
b"{'work': '12198649', 'flags': [], 'unixtime': 1333756800, 'stars': 5.0, 'nhelpful': 0, 'time': 'Apr 7, 2012', 'comment': 'Help Wanted: Tales of On The Job Terror from Evil Jester Press is a fun and scary read. This book is edited by Peter Giglio and has short stories by Joe McKinney, Gary Brandner, Henry Snider and many more. As if work wasnt already scary enough, this book gives you more reasons to be scared. Help Wanted is an excellent anthology that includes some great stories by some master storytellers.

In [6]:
reviews = []
with tarfile.open("lthing_data.tar.gz") as tar:
    with tar.extractfile("lthing_data/reviews.json") as file:
        for line in file:
            record = ast.literal_eval(line.decode("utf8"))
            if any(x not in record for x in ['user', 'work', 'stars']):
                continue
            reviews.append([record['user'], record['work'], record['stars']])
print(len(reviews), "records retrieved")

1387209 records retrieved


### make a matrix of how different users rate each book.

In [13]:
reviews = pd.DataFrame(reviews, columns=["user", "work", "stars"])
print(reviews.head())
print("#####")
print(reviews.info())

            user      work  stars
0       van_stef   3206242    5.0
1       dwatson2  12198649    5.0
2       amdrane2  12981302    4.0
3  Lila_Gustavus   5231009    3.0
4      skinglist    184318    2.0
#####
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387209 entries, 0 to 1387208
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1387209 non-null  object 
 1   work    1387209 non-null  object 
 2   stars   1387209 non-null  float64
dtypes: float64(1), object(2)
memory usage: 31.8+ MB
None


### Use small sample size at first
#### consider only those users who reviewed more than 50 books
#### and also those books who are reviewed by more than 50 users

In [20]:
usercount = reviews[["work","user"]].groupby("user").count()
usercount

Unnamed: 0_level_0,work
user,Unnamed: 1_level_1
,84
%C3%90ark-Angel,1
---fan,1
-AlyssaE-,41
-Carmen-,1
...,...
zzfhickling,2
zzin,9
zzpperetin,1
zzrude,3


In [18]:
usercount[usercount["work"] >= 50]

Unnamed: 0_level_0,work
user,Unnamed: 1_level_1
,84
-Eva-,602
06nwingert,370
1983mk,63
1dragones,194
...,...
zjakkelien,121
zmagic69,148
zquilts,67
zwaantje,121


In [23]:
usercount = reviews[["work","user"]].groupby("user").count() # count amount of books reviewed for each user
usercount = usercount[usercount["work"] >= 50] # filter users reviewed more than 50 books
print(usercount.head())

            work
user            
              84
-Eva-        602
06nwingert   370
1983mk        63
1dragones    194


In [25]:
# Look for the books who reviewed by more than 50 users
workcount = reviews[["work","user"]].groupby("work").count()
workcount

Unnamed: 0_level_0,user
work,Unnamed: 1_level_1
1000,9
10000,106
10000001,1
1000009,2
1000019,1
...,...
999973,1
999974,1
9999742,1
9999773,1


In [27]:
workcount = workcount[workcount["user"] >= 50]
workcount.head()

Unnamed: 0_level_0,user
work,Unnamed: 1_level_1
10000,106
10001,53
1000167,186
10001797,53
10005525,134


In [29]:
# Keep only the popular books and active users
reviews = reviews[reviews["user"].isin(usercount.index) & reviews["work"].isin(workcount.index)]
print(reviews)

                user     work  stars
0           van_stef  3206242    5.0
6            justine     3067    4.5
18           stephmo  1594925    4.0
19         Eyejaybee  2849559    5.0
35       LisaMaria_C   452949    4.5
...              ...      ...    ...
1387161     connie53     1653    4.0
1387177   BruderBane    24623    4.5
1387192  StuartAston  8282225    4.0
1387202      danielx  9759186    4.0
1387206     jclark88  8253945    3.0

[205110 rows x 3 columns]


In [33]:
# convert  top 50 work reviewers and reviewed works
reviewmatrix = reviews.pivot(index="user", columns="work", values="stars").fillna(0)
print(reviewmatrix.shape)
print(reviewmatrix)

(5593, 2898)
work        10000  10001  1000167  10001797  10005525  10007394  10007399  \
user                                                                        
              0.0    0.0      0.0       0.0       0.0       0.0       0.0   
-Eva-         0.0    0.0      0.0       0.0       0.0       0.0       0.0   
06nwingert    0.0    0.0      0.0       0.0       0.0       0.0       0.0   
1983mk        0.0    0.0      0.0       0.0       0.0       0.0       0.0   
1dragones     5.0    4.0      0.0       0.0       0.0       0.0       0.0   
...           ...    ...      ...       ...       ...       ...       ...   
zjakkelien    0.0    0.0      0.0       0.0       0.0       0.0       0.0   
zmagic69      0.0    0.0      0.0       0.0       0.0       0.0       0.0   
zquilts       0.0    0.0      0.0       0.0       0.0       0.0       0.0   
zwaantje      0.0    0.0      0.0       0.0       0.0       0.0       0.0   
zzshupinga    0.0    0.0      0.0       0.0       0.0       0.0

### Apply SVD to the matrix

In [34]:
from numpy.linalg import svd

In [35]:
matrix = reviewmatrix.values

In [37]:
# svd() returns a full singular value decomposition
u, s, vh = svd(matrix, full_matrices=False)

columns of vh: correspond to the books
base on vector space model to find which book are most similar to one we look at
try to find the book that is best match to to first column

In [47]:
import numpy as np
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))

highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1,vh.shape[1]):
    similarity = cosine_similarity(vh[:,0], vh[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col

print("Column %d is most similar to column 0" % highest_sim_col)

Column 2092 is most similar to column 0


when a user picked a book
we may show her a few other books that are similar to the one she picked based on the cosine distance as calculated above

we may use truncated SVD to reduce the dimension of matrix vh
this means we are removing several rows on vh that the corresponding singular values in s are small,
before we use it to compute the similarity.

This would likely make the prediction more accurate as those less important features of a book are removed from consideration.

in the decomposition M = U * Σ * V^T;
the rows of U are the users and columns of V^T are books
can't identify the meanings of the columns of U or rows of V^T; genre, what?
but we can use them as **features** in recommendation system