In [0]:
import numpy as np
import pandas as pd
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
rating = pd.read_csv(os.path.join("drive","My Drive","Colab Notebooks", "goodbooks-10k","ratings.csv"))
rating.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [5]:
nr_books = len(rating['book_id'].unique())
nr_users = len(rating['user_id'].unique())
print(nr_books, nr_users)

10000 53424


In [6]:
from scipy.sparse import coo_matrix

matrix = coo_matrix((rating['rating'].astype(float),
                     (rating['book_id'], rating['user_id'])))
print(matrix)

  (1, 314)	5.0
  (1, 439)	3.0
  (1, 588)	5.0
  (1, 1169)	4.0
  (1, 1185)	4.0
  (1, 2077)	4.0
  (1, 2487)	4.0
  (1, 2900)	5.0
  (1, 3662)	4.0
  (1, 3922)	5.0
  (1, 5379)	5.0
  (1, 5461)	3.0
  (1, 5885)	5.0
  (1, 6630)	5.0
  (1, 7563)	3.0
  (1, 9246)	1.0
  (1, 10140)	4.0
  (1, 10146)	5.0
  (1, 10246)	4.0
  (1, 10335)	4.0
  (1, 10610)	5.0
  (1, 10944)	5.0
  (1, 11854)	4.0
  (1, 11927)	4.0
  (1, 12471)	5.0
  :	:
  (10000, 37777)	5.0
  (10000, 38663)	4.0
  (10000, 39251)	4.0
  (10000, 39997)	4.0
  (10000, 42257)	5.0
  (10000, 42537)	4.0
  (10000, 42810)	3.0
  (10000, 43068)	3.0
  (10000, 43318)	4.0
  (10000, 43319)	5.0
  (10000, 43744)	4.0
  (10000, 44206)	4.0
  (10000, 44655)	3.0
  (10000, 44889)	4.0
  (10000, 46337)	5.0
  (10000, 47069)	4.0
  (10000, 47326)	3.0
  (10000, 47515)	4.0
  (10000, 48201)	5.0
  (10000, 48281)	4.0
  (10000, 48386)	5.0
  (10000, 49007)	4.0
  (10000, 49383)	5.0
  (10000, 50124)	5.0
  (10000, 51328)	1.0


In [7]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2000, n_iter=10, random_state=23)
sigma = svd.fit_transform(matrix)
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(sigma.shape)

[0.00774261 0.00342315 0.00406367 ... 0.00012635 0.00012628 0.00012621]
0.5716303993829324
(10001, 2000)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Create similarities for every book
udf = pd.DataFrame(cosine_similarity(sigma))
udf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
0,1.0,0.114501,0.13433,0.107397,0.119608,0.127664,-0.009242,0.107984,0.141741,0.10572,...,0.009914,-0.071013,-0.01068,-0.027268,0.002716,-0.093198,-0.002125,-0.008321,-0.005731,0.002781
1,0.114501,1.0,0.653566,0.604738,0.555059,0.573728,0.306774,0.500669,0.476754,0.576653,...,0.018074,-0.003581,0.00466,0.000217,0.016205,-2.9e-05,0.002547,0.000703,-0.000668,0.008048
2,0.13433,0.653566,1.0,0.599797,0.714114,0.700147,0.201772,0.676989,0.593163,0.614667,...,0.010831,0.006798,-0.005394,0.000191,0.018183,0.002648,-0.002565,-0.001626,-0.004395,0.012878
3,0.107397,0.604738,0.599797,1.0,0.438341,0.434718,0.356972,0.441128,0.570992,0.44785,...,0.01698,0.002504,-0.002089,0.000337,-0.00131,0.001515,0.000996,-0.001917,-0.002802,0.008698
4,0.119608,0.555059,0.714114,0.438341,1.0,0.81768,0.144365,0.624983,0.73366,0.56085,...,0.02894,0.000149,-0.007943,-0.001194,0.036656,0.001065,-0.000311,-0.000605,0.00076,0.000696


In [9]:
# With similarities of every book, we could search similar books for book_id==968, which is Dan Brown's 'The Da Vinci Code'

associate_books = udf.iloc[968].sort_values(ascending = False)

books = pd.read_csv(os.path.join("drive","My Drive","Colab Notebooks", "goodbooks-10k","books.csv"))

list = []
for idx in associate_books.index[:100]:
    res = books.loc[books['book_id'] == idx]
    if not res.empty:
        print(res[['title', 'authors']])

                                     title    authors
25  The Da Vinci Code (Robert Langdon, #2)  Dan Brown
                        title        authors
4982  The Heart of the Matter  Graham Greene
                                      title          authors
5305  The Confusion (The Baroque Cycle, #2)  Neal Stephenson
                                                  title            authors
8914  Breaking the Spell: Religion as a Natural Phen...  Daniel C. Dennett
                      title         authors
3467  The Rapture of Canaan  Sheri Reynolds
                                                 title  \
892  Fullmetal Alchemist, Vol. 1 (Fullmetal Alchemi...   

                            authors  
892  Hiromu Arakawa, Akira Watanabe  
            title      authors
496  Black Beauty  Anna Sewell
             title        authors
6726  Burmese Days  George Orwell
                                                  title       authors
1223  The Elegant Universe: Superstrings, Hidden 