In [1]:
import numpy as np
import pandas as pd

import os
print(os.listdir('./data'))

['books.csv', 'book_tags.csv', 'ratings.csv', 'sample_book.xml', 'tags.csv', 'to_read.csv']


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#### Load the data from csv files

In [3]:
#전체 책 데이터가 들어가있는 books
books = pd.read_csv('./data/books.csv', encoding='ISO-8859-1')
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [4]:
books.shape

(10000, 23)

In [5]:
books.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')

In [6]:
#유저가 원하는 책에 준 평점
ratings = pd.read_csv('./data/ratings.csv', encoding='ISO-8859-1')
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [7]:
#goodreads_book_id와 book_id의 차이점을 모르겠음
book_tags = pd.read_csv('./data/book_tags.csv')
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [8]:
#책 태그,종류 분류
tags = pd.read_csv('./data/tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [9]:
#goodreads_book에 있는 tag_id를 tag_name과 연결시켜줌
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id',
how='inner')
tags_join_DF.tail(10)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
999902,30364931,21304,7,neighbour
999903,30839185,14049,9,hate-love
999904,31140847,5499,13,boss-employee
999905,31140847,32776,12,work-romance
999906,31194270,17222,3,kindle-books-to-read
999907,31538635,14690,6,hogwarts
999908,32848471,16149,21,jan-2017
999909,33288638,27821,9,single-mom
999910,33288638,11478,7,fave-author
999911,33288638,27939,7,slowburn


In [10]:
to_read = pd.read_csv('./data/to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


**TfidfVectorizer** function from scikit-learn, which transforms text to feature vectors that can be used as input to estimator.

**Cosine Similarity** to calculate a numeric value that denotes the similarity between two books.

In [40]:
#TfidfVectorizer 생성 후 작가 백터라이즈 적용
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0 
, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
tfidf_matrix

<812x2289 sparse matrix of type '<class 'numpy.float64'>'
	with 3286 stored elements in Compressed Sparse Row format>

In [12]:
#linear_kernel을 통해서 작가 word 사이에서 유사도 구하기
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [13]:
titles = books['title']
indices = pd.Series(books.index, index = books['title'])

indices


title
The Hunger Games (The Hunger Games, #1)                                                         0
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                                        1
Twilight (Twilight, #1)                                                                         2
To Kill a Mockingbird                                                                           3
The Great Gatsby                                                                                4
                                                                                             ... 
Bayou Moon (The Edge, #2)                                                                    9995
Means of Ascent (The Years of Lyndon Johnson, #2)                                            9996
The Mauritius Command                                                                        9997
Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture    9998
The First Worl

In [14]:
#유사도가 높은 작가 함수 생성
def authors_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [15]:
authors_recommendation('The Hobbit').head(20)

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
465                             The Hobbit: Graphic Novel
0                 The Hunger Games (The Hunger Games, #1)
1       Harry Potter and the Sorcerer's Stone (Harry P...
2                                 Twilight (Twilight, #1)
3                                   To Kill a Mockingbird
4                                        The Great Gatsby
5                                  The Fault in Our Stars
7             

Recommend books using the tags provided to the books.

In [29]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id',
how='inner')
books_with_tags

Unnamed: 0,book_id,goodreads_book_id_x,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id_y,tag_id,count,tag_name
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,30574,167697,to-read
1,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,11305,37174,fantasy
2,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,11557,34173,favorites
3,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,8717,12986,currently-reading
4,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1,33114,12716,young-adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81195,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,9998,21601,5,nippon
81196,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,9998,34241,5,日本語
81197,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,9998,16220,5,japanese-novels
81198,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,9998,11793,4,fiction-japan


In [20]:
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [22]:
cosine_sim1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
title1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [28]:
tags_recommendations('The Hobbit').head(20)


106                                    A Walk to Remember
206                One for the Money (Stephanie Plum, #1)
306     The Wise Man's Fear (The Kingkiller Chronicle,...
404                                Breakfast of Champions
506     The Hunger Games Trilogy Boxset (The Hunger Ga...
606     City of Heavenly Fire (The Mortal Instruments,...
2805                              The Rules of Attraction
54                                        Brave New World
136                             Outlander (Outlander, #1)
255     Three Cups of Tea: One Man's Mission to Promot...
354                       Graceling (Graceling Realm, #1)
449                   Storm Front (The Dresden Files, #1)
542                  Last Sacrifice (Vampire Academy, #6)
647               Inheritance (The Inheritance Cycle, #4)
571                        Oryx and Crake (MaddAddam, #1)
680      Little House in the Big Woods (Little House, #1)
99                                   The Poisonwood Bible
168           

In [30]:
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(" ".join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [31]:
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')

In [32]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read fantasy favorites currently-reading yo...
4,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...,to-read fantasy young-adult fiction harry-pott...


In [33]:
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                    .fillna('')
                    .values.tolist()
                    ).str.join(' '))

In [35]:
tf_corpus = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

titles = books['title']
indices = pd.Series(books.index, index=books['title'])

def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

corpus_recommendations('The Hobbit')

0                The Hunger Games (The Hunger Games, #1)
2                                Twilight (Twilight, #1)
3                                       The Great Gatsby
4                                 The Fault in Our Stars
5                                 The Catcher in the Rye
1      Harry Potter and the Sorcerer's Stone (Harry P...
159                              The Death of Ivan Ilych
683                     Visions in Death (In Death, #19)
680                  Visions of Heat (Psy-Changeling #2)
524                           Private Games (Private #3)
681                    The Hydrogen Sonata (Culture #10)
682                      Secret History (Mistborn, #3.5)
48                                          The Namesake
679                     Cold Fire (The Circle Opens, #3)
755                                                 Adam
677                                          Born to Run
678                              Primates of Park Avenue
507                            

In [36]:
corpus_recommendations("Twilight (Twilight, #1)")

3                                       The Great Gatsby
0                The Hunger Games (The Hunger Games, #1)
4                                 The Fault in Our Stars
6                                    Pride and Prejudice
1      Harry Potter and the Sorcerer's Stone (Harry P...
159                              The Death of Ivan Ilych
5                                 The Catcher in the Rye
524                           Private Games (Private #3)
48                                          The Namesake
528    The Nine: Inside the Secret World of the Supre...
683                     Visions in Death (In Death, #19)
507                                    Our Mutual Friend
690                          Life and Times of Michael K
679                     Cold Fire (The Circle Opens, #3)
755                                                 Adam
680                  Visions of Heat (Psy-Changeling #2)
682                      Secret History (Mistborn, #3.5)
681                    The Hydr

In [37]:
corpus_recommendations("Romeo and Juliet")

19                             The Giver (The Giver, #1)
615                                    We Need New Names
18               Fifty Shades of Grey (Fifty Shades, #1)
17                                   Memoirs of a Geisha
616         Wicca: A Guide for the Solitary Practitioner
507                                    Our Mutual Friend
16                                             Gone Girl
662                 Succubus Heat (Georgina Kincaid, #4)
465                                    Eon (The Way, #1)
207                                      Decision Points
765    The First Confessor (The Legend of Magda Searu...
204    Everyday Italian: 125 Simple and Delicious Rec...
683                     Visions in Death (In Death, #19)
7                                        The Kite Runner
2                                Twilight (Twilight, #1)
4                                 The Fault in Our Stars
3                                       The Great Gatsby
0                The Hunger Gam