# Book Recommendations: Content-Based System

In [1]:
import os
import re
import string

import numpy as np
import pandas as pd

from rake_nltk import Rake

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tabulate import tabulate

## Data

In [2]:
# read the raw book metadata from disk and preview the first two records
df = pd.read_csv(filepath_or_buffer='data/books_content.csv')
df.head(n=2)

Unnamed: 0,Id,Name,PublishYear,Authors,PagesNumber,Description
0,1000014,Haroun And The Sea Of Stories,1991,Salman Rushdie,219.0,The author of The Satanic Verses returns with ...
1,1000030,Anne Of Green Gables,2003,L.M. Montgomery,0.0,When Marilla and Matthew Cuthbert of Green Gab...


In [3]:
# keep only the columns the recommender uses; .copy() makes this an independent
# frame, so the column assignments in the following cells act on it directly
# instead of on a slice of the loaded data (avoids SettingWithCopyWarning)
df = df[['Name', 'Authors', 'Description']].copy()
df.head(2)

Unnamed: 0,Name,Authors,Description
0,Haroun And The Sea Of Stories,Salman Rushdie,The author of The Satanic Verses returns with ...
1,Anne Of Green Gables,L.M. Montgomery,When Marilla and Matthew Cuthbert of Green Gab...


## Clean Author

In [4]:
# normalise every author name to lower case
df.loc[:, 'Authors'] = df['Authors'].str.lower()

In [5]:
# remove punctuation in author's names
# the translation table is built once and applied through the .str accessor,
# which passes missing values through as NaN (calling x.translate row by row
# raises on a NaN author)
df.loc[:, 'Authors'] = df['Authors'].str.translate(str.maketrans('', '', string.punctuation))

In [6]:
# remove spaces in author's names so each author becomes a single token
# .str.replace with regex=False does a literal replacement and leaves missing
# values as NaN (calling x.replace row by row raises on a NaN author)
df.loc[:, 'Authors'] = df['Authors'].str.replace(' ', '', regex=False)

In [7]:
# check the result: author names are now lower case with punctuation and spaces removed
df.head(2)

Unnamed: 0,Name,Authors,Description
0,Haroun And The Sea Of Stories,salmanrushdie,The author of The Satanic Verses returns with ...
1,Anne Of Green Gables,lmmontgomery,When Marilla and Matthew Cuthbert of Green Gab...


## Clean Description Text

In [8]:
# duplicate the description into 'plot' so the cleaning steps below leave 'Description' untouched
df.loc[:, 'plot'] = df.loc[:, 'Description']

In [9]:
# remove html tags
# Matches '<' up to the nearest '>'. [^>] also matches newlines, so tags that
# wrap across lines are stripped too. Compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def remove_html(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML tags. Any non-string value (for example
        ``None`` or the float NaN that stands for a missing description)
        is treated as empty text.

    Returns
    -------
    str
        The text without tags; ``''`` for non-string input.
    """
    if not isinstance(raw_html, str):
        return ''
    return _HTML_TAG_RE.sub('', raw_html)

# run every description through remove_html; str() guarantees a text value per row
df.loc[:, 'plot'] = df['plot'].map(lambda text: str(remove_html(text)))

In [10]:
# strip all punctuation characters from the tag-free description text
df.loc[:, 'plot'] = df['plot'].str.translate(str.maketrans('', '', string.punctuation))

In [11]:
# create new column key_words

# initiate Rake once and reuse it for every book
# (removes english stopwords, punctuation)
r = Rake()


def extract_key_words(plot):
    """Return the list of key words Rake extracts from one description."""
    # extract keywords by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() gives a dictionary with key words as keys and their
    # scores as values; only the words themselves are kept
    return list(r.get_word_degrees().keys())


# Assign the whole column in one step. Writing to the `row` object yielded by
# DataFrame.iterrows() is not guaranteed to reach the frame (pandas may hand
# back a copy), so the key words are computed with apply and stored explicitly.
df['key_words'] = df['plot'].apply(extract_key_words)

In [12]:
# the raw and cleaned description text are not needed once the key words exist
df = df.drop(columns=['Description', 'plot'])

In [13]:
# use the book title as the row label, then preview the result
df = df.set_index('Name')
df.head(2)

Unnamed: 0_level_0,Authors,key_words
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Haroun And The Sea Of Stories,salmanrushdie,"[forgotten, story, city, print, humorous, sata..."
Anne Of Green Gables,lmmontgomery,"[vanilla, cake, memorable, adventure, tragicom..."


In [14]:
# Build one text "document" per book: the author token followed by the book's
# key words joined with spaces. 'Authors' is already a single string, so it is
# concatenated as is (joining it would split the name into letters), while
# 'key_words' holds a list and is joined.
#
# The column is assigned in one vectorised step. Writing to the `row` object
# yielded by DataFrame.iterrows() is not guaranteed to reach the frame
# (pandas may hand back a copy).
df['all_words'] = df['Authors'] + ' ' + df['key_words'].map(' '.join)

In [15]:
# preview the combined text next to the columns it was built from
df.head()

Unnamed: 0_level_0,Authors,key_words,all_words
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Haroun And The Sea Of Stories,salmanrushdie,"[forgotten, story, city, print, humorous, sata...",salmanrushdie forgotten story city print humor...
Anne Of Green Gables,lmmontgomery,"[vanilla, cake, memorable, adventure, tragicom...",lmmontgomery vanilla cake memorable adventure ...
"Jackie & Me (A Baseball Card Adventure, #2)",dangutman,"[like, every, meet, one, african, american, wh...",dangutman like every meet one african american...
Shenzhen: A Travelogue From China,guydelisle,"[north, koreashenzhen, simple, freedoms, easte...",guydelisle north koreashenzhen simple freedoms...
Cold Fire,deankoontz,"[killed, plane, crash, terrifying, monsters, c...",deankoontz killed plane crash terrifying monst...


In [16]:
# only the combined text is needed from here on
df = df.drop(columns=['Authors', 'key_words'])

In [17]:
# the frame now holds a single text column, indexed by book title
df.head()

Unnamed: 0_level_0,all_words
Name,Unnamed: 1_level_1
Haroun And The Sea Of Stories,salmanrushdie forgotten story city print humor...
Anne Of Green Gables,lmmontgomery vanilla cake memorable adventure ...
"Jackie & Me (A Baseball Card Adventure, #2)",dangutman like every meet one african american...
Shenzhen: A Travelogue From China,guydelisle north koreashenzhen simple freedoms...
Cold Fire,deankoontz killed plane crash terrifying monst...


## Count Vectorizer

In [18]:
# bag-of-words model using CountVectorizer's default settings
cv = CountVectorizer()

# learn the vocabulary from all_words and count each term per book (one row per book)
cv_matrix = cv.fit_transform(df['all_words'])

In [19]:
# (number of books, number of distinct terms in the vocabulary)
cv_matrix.shape

(23979, 112407)

In [20]:
# positional lookup table: row number -> book title, in the same order as df
indices = df.index.to_series().reset_index(drop=True)
indices[:10]

0                        Haroun And The Sea Of Stories
1                                 Anne Of Green Gables
2          Jackie & Me (A Baseball Card Adventure, #2)
3                    Shenzhen: A Travelogue From China
4                                            Cold Fire
5                                             Whispers
6                                 The Door To December
7    My Secret War: The World War Ii Diary Of Madel...
8    The Power Of One (Young Readers Condensed Edit...
9    Healthy Sleep Habits, Happy Child: A Step-By-S...
Name: Name, dtype: object

## Cosine Similarity

In [21]:
# pairwise cosine similarity between all books; with no second argument
# scikit-learn compares the rows of cv_matrix against themselves
cosine_sim = cosine_similarity(cv_matrix)

In [22]:
# inspect the sparse term-count matrix that was passed to cosine_similarity
cv_matrix

<23979x112407 sparse matrix of type '<class 'numpy.int64'>'
	with 1534682 stored elements in Compressed Sparse Row format>

## Content Based Book Recommender System

In [49]:
def recommend_books(title, cosine_sim = cosine_sim):
    """Print, and save as CSV, the 10 books most similar to ``title``.

    Recommendations are based on the author and description of a book.

    Parameters
    ----------
    title : str
        Title of the book to find recommendations for. It is compared to the
        stored titles exactly, so the first letter of each word must be
        capitalized.
    cosine_sim : array-like
        Square matrix of pairwise similarity scores; row ``i`` must belong to
        the title at position ``i`` of ``indices``. Defaults to the matrix
        computed from the count vectors.

    Returns
    -------
    None
        The ranked titles are printed as a table and written to
        ``results/content_book_recs_<first 10 characters of title>.csv``
        (characters that are invalid in file names are replaced by ``_``).
        If the title is unknown, only a help message is printed.
    """
    # getting the index of the book that matches the title
    matches = indices[indices == title]
    if matches.empty:
        # only the "title not found" case produces this message; other errors
        # are no longer disguised as a bad title
        print('Unable to gather recommendations for the entered title. \nPlease try entering a different title. \nPlease make sure the first letter of each word is capitalized.')
        return
    idx = matches.index[0]

    # sort the similarity scores in descending order; the queried book is
    # removed by position instead of assuming it always sorts first (a book
    # with identical text ties with it at the top score). mergesort is a
    # stable sort, which keeps the ordering of tied scores deterministic.
    scores = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending=False, kind='mergesort')

    # getting the positions of the top 10 and looking up their titles in one step
    top_10_index = scores.index[:10].tolist()
    books = df.index[top_10_index].tolist()

    # create dataframe of recommended books
    titles = pd.DataFrame(books, columns=['Title'])
    titles.insert(0, 'Rank', range(1, 1 + len(titles)))

    print('The following books are recommended as a similiar book to "{}":'.format(title))
    print("")

    # the file name is built from the title, so characters that are invalid in
    # file names (e.g. '/' or ':') are replaced before writing
    file_stub = re.sub(r'[\\/:*?"<>|]', '_', title[:10])
    try:
        os.makedirs('results', exist_ok=True)
        titles.to_csv('results/content_book_recs_{}.csv'.format(file_stub), index=False)
    except OSError as err:
        # a failed save should not hide the recommendations themselves
        print('Could not save the recommendations to disk: {}'.format(err))

    print(tabulate(titles, headers='keys', tablefmt='psql', showindex=False))
    

In [50]:
# title typed exactly as stored (first letter of each word capitalized)
recommend_books('Something Blue')

The following books are recommended as a similiar book to "Something Blue":

+--------+-----------------------------------------+
|   Rank | Title                                   |
|--------+-----------------------------------------|
|      1 | Love The One You'Re With                |
|      2 | Something Borrowed (Darcy & Rachel, #1) |
|      3 | Someone Like You                        |
|      4 | The Dive From Clausen'S Pier            |
|      5 | My Sweet Audrina                        |
|      6 | Baby Proof                              |
|      7 | Just As Long As We'Re Together          |
|      8 | Magic For Marigold                      |
|      9 | Tempted (Alex Kennedy, #1)              |
|     10 | A Whole New Light                       |
+--------+-----------------------------------------+


In [51]:
# same book with a lower-case "blue": titles are matched exactly, so the lookup fails
recommend_books('Something blue')

Unable to gather recommendations for the entered title. 
Please try entering a different title. 
Please make sure the first letter of each word is capitalized.


In [52]:
# another exact-title lookup
recommend_books('Jemima J')

The following books are recommended as a similiar book to "Jemima J":

+--------+-----------------------------------------------------+
|   Rank | Title                                               |
|--------+-----------------------------------------------------|
|      1 | Awol On The Appalachian Trail                       |
|      2 | Something Blue                                      |
|      3 | Pack Challenge (Magnus Pack, #1)                    |
|      4 | Once A Cowboy (The Cowboys, #3)                     |
|      5 | April Lady                                          |
|      6 | Wherever Nina Lies                                  |
|      7 | The Valley Of The Wolves (Crónicas De La Torre, #1) |
|      8 | The Country Girls                                   |
|      9 | Beauty And The Beast                                |
|     10 | Moonlight (Dark Guardian, #1)                       |
+--------+-----------------------------------------------------+


In [53]:
# a volume from a series: the top-ranked results are other volumes of the same series
recommend_books('The Walking Dead, Vol. 7: The Calm Before')

The following books are recommended as a similiar book to "The Walking Dead, Vol. 7: The Calm Before":

+--------+----------------------------------------------------+
|   Rank | Title                                              |
|--------+----------------------------------------------------|
|      1 | The Walking Dead, Vol. 8: Made To Suffer           |
|      2 | The Walking Dead, Vol. 03: Safety Behind Bars      |
|      3 | The Walking Dead, Volume 1: Days Gone Bye          |
|      4 | Alas, Babylon                                      |
|      5 | Cities Of The Red Night                            |
|      6 | Timeline                                           |
|      7 | The Power Of One (Young Readers Condensed Edition) |
|      8 | Gone To Soldiers                                   |
|      9 | Have A Nice Day: A Tale Of Blood And Sweatsocks    |
|     10 | Centaur Aisle (Xanth, #4)                          |
+--------+----------------------------------------------------+
