# Importing the libraries

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
import warnings
warnings.filterwarnings("ignore")

# Importing the dataset

In [34]:
data=pd.read_csv("/kaggle/input/book-recommendation-dataset/Books.csv")

# Checking the columns for merging

In [35]:
data.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [36]:
data.duplicated().sum()

0

# Keeping the necessary columns

In [37]:
data=data[["Book-Title", "Book-Author", "Publisher"]]

In [38]:
data

Unnamed: 0,Book-Title,Book-Author,Publisher
0,Classical Mythology,Mark P. O. Morford,Oxford University Press
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada
2,Decision in Normandy,Carlo D'Este,HarperPerennial
3,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux
4,The Mummies of Urumchi,E. J. W. Barber,W. W. Norton &amp; Company
...,...,...,...
271355,There's a Bat in Bunk Five,Paula Danziger,Random House Childrens Pub (Mm)
271356,From One to One Hundred,Teri Sloat,Dutton Books
271357,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,HarperSanFrancisco
271358,Republic (World's Classics),Plato,Oxford University Press


# Accounting for null values

In [39]:
data.isnull().sum()

Book-Title     0
Book-Author    2
Publisher      2
dtype: int64

In [40]:
data.dropna(axis=0, inplace=True)

In [41]:
data.isnull().sum()

Book-Title     0
Book-Author    0
Publisher      0
dtype: int64

# Splitting the dataset into a smaller dataset

In [42]:
data=data.iloc[0:5000,:]

In [43]:
data.shape

(5000, 3)

In [44]:
data.head()

Unnamed: 0,Book-Title,Book-Author,Publisher
0,Classical Mythology,Mark P. O. Morford,Oxford University Press
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada
2,Decision in Normandy,Carlo D'Este,HarperPerennial
3,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux
4,The Mummies of Urumchi,E. J. W. Barber,W. W. Norton &amp; Company


# Accounting for spaces between the words

In [45]:
def collapse(text):
    l=[]
    l.append(text.replace(" ", ""))
    return l
        

In [46]:
data["Book-Title"]=data["Book-Title"].apply(collapse)
data["Book-Author"]=data["Book-Author"].apply(collapse)
data["Publisher"]=data["Publisher"].apply(collapse)

In [47]:
data.head()

Unnamed: 0,Book-Title,Book-Author,Publisher
0,[ClassicalMythology],[MarkP.O.Morford],[OxfordUniversityPress]
1,[ClaraCallan],[RichardBruceWright],[HarperFlamingoCanada]
2,[DecisioninNormandy],[CarloD'Este],[HarperPerennial]
3,[Flu:TheStoryoftheGreatInfluenzaPandemicof1918...,[GinaBariKolata],[FarrarStrausGiroux]
4,[TheMummiesofUrumchi],[E.J.W.Barber],[W.W.Norton&amp;Company]


# Converting list to string

In [48]:
data["Book-Title"]=data["Book-Title"].apply(lambda x: " ".join(x))
data["Book-Author"]=data["Book-Author"].apply(lambda x: " ".join(x))
data["Publisher"]=data["Publisher"].apply(lambda x: " ".join(x))

In [49]:
data.head()

Unnamed: 0,Book-Title,Book-Author,Publisher
0,ClassicalMythology,MarkP.O.Morford,OxfordUniversityPress
1,ClaraCallan,RichardBruceWright,HarperFlamingoCanada
2,DecisioninNormandy,CarloD'Este,HarperPerennial
3,Flu:TheStoryoftheGreatInfluenzaPandemicof1918a...,GinaBariKolata,FarrarStrausGiroux
4,TheMummiesofUrumchi,E.J.W.Barber,W.W.Norton&amp;Company


# Merging the related columns to one column

In [50]:
data['book-details']=data["Book-Author"]+data["Publisher"]
data.drop(columns=["Book-Author", "Publisher"], inplace=True)

In [51]:
data.head()

Unnamed: 0,Book-Title,book-details
0,ClassicalMythology,MarkP.O.MorfordOxfordUniversityPress
1,ClaraCallan,RichardBruceWrightHarperFlamingoCanada
2,DecisioninNormandy,CarloD'EsteHarperPerennial
3,Flu:TheStoryoftheGreatInfluenzaPandemicof1918a...,GinaBariKolataFarrarStrausGiroux
4,TheMummiesofUrumchi,E.J.W.BarberW.W.Norton&amp;Company


# Converting text to number

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words="english")
vector=cv.fit_transform(data["book-details"]).toarray()
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [53]:
vector.shape

(5000, 4566)

# Building cosine similarity between the number array

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

similarity=cosine_similarity(vector)

In [55]:
data[data["Book-Title"]=="ClassicalMythology"].index[0]

0

# Building the book recommendation function

In [56]:
def book_recommendation_system(book_title):
    book_title=book_title.replace(" ", "")
    index=data[data["Book-Title"]==book_title].index[0]
    distance=sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x:x[1])
    for i in distance[1:6]:
        print(data.iloc[i[0]]["Book-Title"])

In [57]:
book_recommendation_system("PLEADING GUILTY")

ClassicalMythology
ClaraCallan
DecisioninNormandy
Flu:TheStoryoftheGreatInfluenzaPandemicof1918andtheSearchfortheVirusThatCausedIt
TheMummiesofUrumchi
