Disclaimer: The books used for this project where webscrapped from pdf drive.
Here is the link to the webscrapping notebook - https://github.com/Preshel/Webscrap/blob/main/Recommendation%20System%20-%20WebScrap.ipynb

In [1]:
import pandas as pd

In [2]:
#reading and merging the datasets

personal_development = pd.read_csv(r"C:\Users\diede\OneDrive\Desktop\webscrapping 1\Personal Development.csv")
lifestyle = pd.read_csv(r"C:\Users\diede\OneDrive\Desktop\webscrapping 1\Lifestyle.csv")
biography = pd.read_csv(r"C:\Users\diede\OneDrive\Desktop\webscrapping 1\Biography.csv")
technology = pd.read_csv(r"C:\Users\diede\OneDrive\Desktop\webscrapping 1\Technology.csv")
fiction = pd.read_csv(r"C:\Users\diede\OneDrive\Desktop\webscrapping 1\Fiction.csv")

books=pd.concat([personal_development, lifestyle, biography, technology, fiction])

# NLP preprocessing

In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diede\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diede\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\diede\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
#merging the relevant features and dropping the irrelevant columns

books["search"] = books["title"] + " " + books["Genre"]
books.drop(columns = ["title", "Number of Pages", "Year Published", "Genre", "Unnamed: 0"], inplace=True)

#### We use only the title and genre features to build our recommender system

In [5]:
books_stopwords=stopwords.words("english") 
lemmatizer = WordNetLemmatizer()

In [6]:
#creating the function for the cleaning process

def preprocessor(search):
    search=re.sub("[^A-Za-z1-9 ]", "", search)
    search = search.lower()
    tokens = word_tokenize(search)
    clean_search=[]
    for token in tokens:
        if token not in books_stopwords:
            clean_search.append(lemmatizer.lemmatize(token))
    return " ".join(clean_search)

### The preprocessing process included:
1. Removal of regular expressions
2. Tokenization
3. Removing the stopwords
4. Getting the basewords using lematization

In [7]:
#testing the preprocessor function

test = books.iloc[5,0]
preprocessor(test)

'living light guide personal transformation self development'

In [8]:
#Applying the function to all the rows in the dataframe

books.search = books.search.apply(preprocessor)

# Model Building - Using NearestNeighbors model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
#Using the Tfidfvectorizer method to transform the data to numerical data

vectorizer = TfidfVectorizer() 

In [11]:
#training the data and transforming it to an array

search_matrix = vectorizer.fit_transform(books.search)
search_matrix = search_matrix.toarray()

In [12]:
#setting the model to use cosine similiarity to select the nearest neighbours

nearestneighbors = NearestNeighbors(metric="cosine")

In [13]:
nearestneighbors.fit(search_matrix)

In [14]:
#defining a function that will be used for the test data

def infer(search):
    search = preprocessor(search)
    search_matrix = vectorizer.transform([search])
    return nearestneighbors.kneighbors(n_neighbors=5, X = search_matrix, return_distance=False)

In [15]:
def final(search):
    search = infer(search)
    for book in search:
        result = books.iloc[book]
        return result

In [17]:
print(final("web"))

                                      search
73                     web design technology
35  modern web design development technology
71      mastering web application technology
31            cs web design dummy technology
98     new perspective web design technology


# An attempt on model deployment using Gradio

In [18]:
import gradio as gr

In [19]:
demo = gr.Interface(
    fn=final,
    inputs=gr.Textbox(lines=1, placeholder="Find a book..."),
    outputs="text",
)
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


