<a href="https://colab.research.google.com/github/Sahil-Chhabra-09/Book-Recommendation-System/blob/main/Books_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Books Recommendation System based on Cosine Similarity

## Getting the data

In [192]:
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/Book Recommendation System/BooksData.xlsx")

## Cleaning and Analyzing data

In [193]:
print(data.shape)
data.head()

(16559, 7)


Unnamed: 0.1,Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary
0,0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,1,2,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,2,3,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,3,4,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,4,5,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [194]:
# counting null or empty values

data.isna().sum()

Unnamed: 0       0
Index            0
ID               0
BookTitle        0
Author        2382
Genre         3718
Summary          0
dtype: int64

In [195]:
# dropping columns with null values in Author or Genre

data = data.drop(data[data['Genre'].isna()].index)
data = data.drop(data[data['Author'].isna()].index)

In [196]:
data.isna().sum()

Unnamed: 0    0
Index         0
ID            0
BookTitle     0
Author        0
Genre         0
Summary       0
dtype: int64

In [197]:
data.shape

(12055, 7)

In [198]:
print(data.Genre)
data.Genre[0]   #Genre was in the form of a json object

0        {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...
1        {"/m/06n90": "Science Fiction", "/m/0l67h": "N...
2        {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...
4        {"/m/03lrw": "Hard science fiction", "/m/06n90...
5        {"/m/098tmk": "War novel", "/m/016lj8": "Roman...
                               ...                        
16549                      {"/m/06n90": "Science Fiction"}
16551    {"/m/01jfsb": "Thriller", "/m/02xlf": "Fiction...
16555     {"/m/01jfsb": "Thriller", "/m/02xlf": "Fiction"}
16556                         {"/m/0xdf": "Autobiography"}
16558    {"/m/02ql9": "Epistolary novel", "/m/014dfn": ...
Name: Genre, Length: 12055, dtype: object


'{"/m/016lj8": "Roman \\u00e0 clef", "/m/06nbt": "Satire", "/m/0dwly": "Children\'s literature", "/m/014dfn": "Speculative fiction", "/m/02xlf": "Fiction"}'

In [199]:
import json
genres_cleaned = []
for i in data['Genre']:
    genres_cleaned.append(list(json.loads(i).values()))   #converts json string to dictionary which is further converted into list
genres_cleaned[0:10]

[['Roman à clef',
  'Satire',
  "Children's literature",
  'Speculative fiction',
  'Fiction'],
 ['Science Fiction',
  'Novella',
  'Speculative fiction',
  'Utopian and dystopian fiction',
  'Satire',
  'Fiction'],
 ['Existentialism', 'Fiction', 'Absurdist fiction', 'Novel'],
 ['Hard science fiction',
  'Science Fiction',
  'Speculative fiction',
  'Fantasy',
  'Fiction'],
 ['War novel', 'Roman à clef'],
 ["Children's literature",
  'Fantasy',
  'Speculative fiction',
  'Bildungsroman',
  'Fiction'],
 ['Science Fiction', 'Speculative fiction'],
 ['Science Fiction', 'Speculative fiction'],
 ['Speculative fiction', 'Fiction', 'Novel'],
 ['Science Fiction',
  'Speculative fiction',
  "Children's literature",
  'Fiction']]

In [200]:
# Generating another columns in data containing cleaned genre
data['Genre'] = genres_cleaned

In [201]:
data.Summary[0][:200]

" Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, 'Beasts of England'. Wh"

## Preprocessing Text
  * removing some punctuatins, converting everything to lowercase
  * removing stopwords
  * stemming words

In [202]:
# Preprocessing Data by removing some punctuations and converting everything to lower case

import re   #regular expression matching
def clean_summary(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)     #means any character that IS NOT a-z OR A-Z
    text = ' '.join(text.split())
    text = text.lower()
    return text

data['Summary'] = data['Summary'].apply(lambda x: clean_summary(x))
data['Summary'][0][:200]

'old major the old boar on the manor farm calls the animals on the farm for a meeting where he compares the humans to parasites and teaches the animals a revolutionary song beasts of england when major'

In [203]:
# from natural language toolkit downloading stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [204]:
# converting genre values which are of type list to a single space seperated string
data['GenreString'] = data['Genre'].apply(lambda x: ' '.join(x))

In [205]:
data['GenreString'][0]

"Roman à clef Satire Children's literature Speculative fiction Fiction"

In [206]:
#get a combined text that includes author's name and associated genres
data["combined_text"] = data["Summary"] + " " + data["Author"] + " " + data["GenreString"]

In [207]:
data["combined_text"]

0        old major the old boar on the manor farm calls...
1        alex a teenager living in near future england ...
2        the text of the plague is divided into five pa...
4        the novel posits that space around the milky w...
5        the book tells the story of paul b umer a germ...
                               ...                        
16549    the story starts with former government agent ...
16551    the series follows the character of nick stone...
16555    the reader first meets rapp while he is doing ...
16556    the book follows very rough chronological orde...
16558    makar devushkin and varvara dobroselova are se...
Name: combined_text, Length: 12055, dtype: object

In [208]:
# removing stopwords
stopwords_english = stopwords.words("english")
# data['text_without_stopwords'] = data['combined_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_english)]))
data['text_without_stopwords'] = data['combined_text'].apply(lambda x: [word for word in x.split() if word not in (stopwords_english)])

In [209]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_list(text):
  # Create an empty list to store the stems
  text_stem = [] 

  for word in text:
      stem_word = stemmer.stem(word)  # stemming word
      text_stem.append(stem_word)  # append to the list
  return ' '.join(text_stem)

In [210]:
data['text_without_stopwords'] = data['text_without_stopwords'].apply(stem_list)

In [211]:
data['text_without_stopwords'][0]

"old major old boar manor farm call anim farm meet compar human parasit teach anim revolutionari song beast england major die two young pig snowbal napoleon assum command turn dream philosophi anim revolt drive drunken irrespons mr jone farm renam anim farm adopt seven command anim ism import anim equal snowbal attempt teach anim read write food plenti farm run smoothli pig elev posit leadership set asid special food item ostens person health napoleon take pup farm dog train privat napoleon snowbal struggl leadership snowbal announc plan build windmil napoleon dog chase snowbal away declar leader napoleon enact chang govern structur farm replac meet committe pig run farm use young pig name squealer mouthpiec napoleon claim credit windmil idea anim work harder promis easier live windmil violent storm anim find windmil annihil napoleon squealer convinc anim snowbal destroy although scorn neighbour farmer suggest wall thin snowbal becom scapegoat napoleon begin purg farm dog kill anim acc

## Building a document term matrix

In [212]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Building a document term matrix
count_matrix = cv.fit_transform(data['text_without_stopwords'])

In [213]:
count_matrix.shape

(12055, 75823)

## creating Cosine Similarity matrix

In [214]:
from sklearn.metrics.pairwise import cosine_similarity
cosine = cosine_similarity(count_matrix, count_matrix)

## Recommendation system based on cosine similarity

In [215]:
def get_title_from_index(Index):
    return data[data.Index == Index].BookTitle.values[0]
def get_index_from_title(BookTitle):
    return data[data.BookTitle == BookTitle]["Index"].values[0]

def get_recommendations(book):
    book_index = get_index_from_title(book)
    similar_books = list(enumerate(cosine[book_index]))
    # print(similar_books)
    sortedbooks = sorted(similar_books, key = lambda x:x[1], reverse=True)[1:]
    # print(sortedbooks)
    i = 0
    for book in sortedbooks:
        if data[data.Index==book[0]].shape[0]==0:
          continue;
        title = data[data.Index==book[0]].BookTitle
        author = data[data.Index==book[0]].Author
        print(title + " by " + author)
        i = i+1
        if i>10:
            break

## Getting Recommendations

In [216]:
print(get_recommendations('Dune'))

1937    Idoru by William Gibson
dtype: object
4651    Sassinak by Anne McCaffrey
dtype: object
11272    My Booky Wook by Russell Brand
dtype: object
2179    Snow Country by Yasunari Kawabata
dtype: object
7355    King, Queen, Knave by Vladimir Vladimirovich N...
dtype: object
823    Family Matters by Rohinton Mistry
dtype: object
10743    The Murderer is a Fox by Frederic Dannay
dtype: object
7937    Star-Begotten by H. G. Wells
dtype: object
10626    Misquoting Jesus by Bart D. Ehrman
dtype: object
2354    The Gold Bug Variations by Richard Powers
dtype: object
5708    Last of the Gaderene by Mark Gatiss
dtype: object
None


In [217]:
print(get_recommendations('A Fire Upon the Deep'))

667    Trainspotting by Irvine Welsh
dtype: object
254    The Body by Stephen King
dtype: object
727    Trumps of Doom by Roger Zelazny
dtype: object
9380    The Strategy Paradox by Michael E. Raynor
dtype: object
1347    Congo by Michael Crichton
dtype: object
426    War and Peace by Leo Tolstoy
dtype: object
863    The Master and Margarita by Mikhail Bulgakov
dtype: object
6541    Tragedy Day by Gareth Roberts
dtype: object
1937    Idoru by William Gibson
dtype: object
5593    1633 by Eric Flint
dtype: object
9063    Appley Dapply's Nursery Rhymes by Beatrix Potter
dtype: object
None
