# Book Recommender System

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from PIL import Image
import requests
from io import BytesIO

In [None]:
# Load the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")
# Preview the first rows to check that the columns were read as expected.
df.head()

In [None]:
# Move the contents of the raw 'Desc' column into a 'description' column;
# pop() removes 'Desc' from the frame and hands back its values.
df['description'] = df.pop('Desc')

In [None]:
# Whitespace-delimited token count of each description. str() lets cells
# that are not strings be counted instead of raising.
df['word_count'] = df['description'].map(lambda text: len(str(text).split()))

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

### Visualization

In [None]:
# Number of rows for each distinct value of the 'genre' column.
df['genre'].value_counts()

In [None]:
# Bar chart of how many rows fall into each genre. The column is passed as
# ``x=`` explicitly: seaborn 0.12+ interprets a lone positional argument as
# ``data`` rather than as the variable to count.
sns.countplot(x=df['genre'], label='Count')


In [None]:
# Open a new 12x6 inch figure for the histogram.
plt.figure(figsize=(12,6))
# Distribution of the per-description word counts, split into 50 bins.
sns.histplot(df['word_count'], label='Count', bins=50)

In [None]:
def get_top_text_ngrams(corpus, n, g):
    """Return the ``n`` highest-weighted ``g``-grams found in ``corpus``.

    Every document is vectorised with TF-IDF restricted to n-grams of exactly
    ``g`` tokens (English stop words removed, original casing kept), and the
    weights of each n-gram are summed over all documents.

    Args:
        corpus: Iterable of text documents.
        n: Maximum number of (ngram, score) pairs to return.
        g: Number of tokens in each extracted n-gram.

    Returns:
        A list of at most ``n`` ``(ngram, summed_weight)`` tuples, ordered
        from the highest summed weight to the lowest.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(g, g), stop_words='english', lowercase=False
    )
    weights = vectorizer.fit_transform(corpus)
    # Column totals: one summed TF-IDF weight per vocabulary entry.
    totals = weights.sum(axis=0)
    scored = [
        (ngram, totals[0, column])
        for ngram, column in vectorizer.vocabulary_.items()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]

def show_ngrams(n, title):
    """Plot the 20 top-scoring ``n``-grams of ``df['description']``.

    Draws a bar chart on a new 16x9 inch figure with the n-grams on the
    y axis and their summed TF-IDF weights on the x axis.

    Args:
        n: Number of tokens per n-gram (1 for unigrams, 2 for bigrams, ...).
        title: Title placed above the chart.
    """
    plt.figure(figsize=(16, 9))
    ranked = get_top_text_ngrams(df['description'], 20, n)
    labels = [ngram for ngram, _ in ranked]
    scores = [score for _, score in ranked]
    plt.title(title)
    sns.barplot(x=scores, y=labels)

In [None]:
# Top 20 single words in the descriptions.
show_ngrams(1, 'Unigram')

In [None]:
# Top 20 two-word sequences in the descriptions.
show_ngrams(2, 'Bigram')

In [None]:
# Top 20 three-word sequences in the descriptions.
show_ngrams(3, 'Trigram')

### Preprocess

In [None]:
_ENGLISH_STOPWORDS = None  # NLTK English stopword set, filled on first use


def _english_stopwords():
    """Return the NLTK English stopword set, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """Drop English stopwords from whitespace-separated ``text``.

    Tokens are compared exactly (case-sensitively) against NLTK's English
    stopword list. The list is cached after the first call so that applying
    this function to every row does not reload the corpus each time.

    Args:
        text: String to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stops = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stops)
def remove_punctuation(text):
    """Keep only the runs of word characters in ``text``.

    The text is tokenised with the regular expression ``\\w+``; everything
    between matches is discarded and the tokens are joined by single spaces.
    """
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy and does not cross line breaks, so a tag split
    over several lines is left in place.
    """
    return re.sub('<.*?>', r'', text)
def clean_text(text):
    """Normalise a raw description string.

    Steps, in order: drop non-ASCII characters, strip HTML tags, lowercase,
    remove English stopwords, then remove punctuation.

    Args:
        text: Raw description. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as an empty
            description instead of raising.

    Returns:
        The cleaned, lowercase text with tokens separated by single spaces,
        or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Encoding with errors="ignore" silently discards every non-ASCII char.
    text = text.encode("ascii", "ignore").decode("ascii")
    # Reuse the shared helper rather than repeating its regular expression.
    text = remove_html(text)
    text = text.lower()
    # Stopwords go first, while tokens are still intact; the punctuation
    # pass would otherwise split them (e.g. at apostrophes).
    text = remove_stopwords(text)
    return remove_punctuation(text)

# Replace every description with its cleaned form (overwrites the raw text).
df['description'] = df['description'].apply(clean_text)

### Recommender

In [None]:
# Vectorise the descriptions as TF-IDF weighted unigrams and bigrams.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_matrix = tf.fit_transform(df['description'])

# Pairwise cosine similarity of every description with every other one;
# with a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
def get_recommendations(title):
    """Rank every other row of ``df`` by description similarity to ``title``.

    Reads the module-level ``df`` and ``cosine_sim``. Rows are addressed by
    position throughout, so the result does not depend on ``df`` having a
    default 0..n-1 index; row ``i`` of ``cosine_sim`` must correspond to the
    ``i``-th row of ``df``.

    Args:
        title: Exact value to look up in ``df['title']``. When several rows
            share the title, the first one is used as the query.

    Returns:
        A DataFrame with the columns ``title``, ``author``, ``description``
        and ``image_link`` for all rows except the query row, ordered from
        most to least similar.

    Raises:
        IndexError: If no row of ``df`` has the given title.
    """
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no book titled {title!r} in the dataset")
    query_pos = int(matches[0])

    scores = np.asarray(cosine_sim[query_pos])
    # Descending order; the stable sort keeps tied rows in dataset order.
    ranking = np.argsort(-scores, kind="stable")
    # Exclude the query row by position instead of assuming it sorts first:
    # a tie, or an all-zero vector, can rank another row above it.
    ranking = ranking[ranking != query_pos]
    return df.iloc[ranking][['title', 'author', 'description', 'image_link']]

In [None]:
def show_images(df):
    """Download and display the images linked from the first five rows.

    Creates a figure with one row of five axes and draws one image per axis;
    axes are left empty when ``df`` has fewer than five rows.

    Args:
        df: DataFrame with an ``image_link`` column holding image URLs.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with an HTTP error status.
    """
    top_rows = df.head()
    _fig, axs = plt.subplots(1, 5, figsize=(20, 15))
    for ax, (_, row) in zip(axs, top_rows.iterrows()):
        # Without a timeout an unresponsive host would block indefinitely.
        response = requests.get(row.image_link, timeout=10)
        # Fail with the HTTP error itself rather than letting PIL choke on
        # an error page that is not image data.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        ax.imshow(img, interpolation='none')

In [None]:
# Example query: rank the dataset by similarity to the row titled "Steve Jobs".
rec = get_recommendations("Steve Jobs")
# Show the ten most similar rows.
rec.head(10)

In [None]:
# Display the images of the top five recommendations.
show_images(rec)

In [None]:
# Credits: article that inspired this notebook.
'''
Inspiration
1. https://www.kdnuggets.com/2020/07/building-content-based-book-recommendation-engine.html
'''