In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv('books.csv')

In [3]:
df.shape

(12372, 6)

In [4]:
pd.set_option('display.max_columns', df.shape[1])

In [5]:
df.head(5)

Unnamed: 0,url,title,description,genre,publishing house,author
0,https://alinino.az//product/where-is-little-fi...,Where Is Little Fish?,Play hide-and-seek with Lucy Cousins's Little ...,Activity books,Walker Books Ltd,Lucy Cousins
1,https://alinino.az//product/uncommon-type-some...,Uncommon Type: Some Stories,A gentle Eastern European immigrant arrives in...,Romance,Audiobooks,Tom Hanks
2,https://alinino.az//product/trumpocracy-the-co...,Trumpocracy: The Corruption of the American Re...,"""From Russia to South Africa, from Turkey to t...",Politics,Harper,David Frum
3,https://alinino.az//product/the-storyteller-of...,The Storyteller of Casablanca,"Morocco, 1941. With France having fallen to Na...",Romance,Lake Union Publishing,Fiona Valpy
4,https://alinino.az//product/the-secret-commonw...,The Secret Commonwealth: The Book of Dust Volu...,It is twenty years since the events of La Bell...,Dedective,Penguin and David Fickling Books,Philip Pullman


In [6]:
df.isnull().mean()

url                 0.000000
title               0.000000
description         0.005092
genre               0.063126
publishing house    0.053750
author              0.131992
dtype: float64

In [7]:
df.duplicated().sum()

0

In [8]:
df = df.drop_duplicates()

In [9]:
df.shape

(12372, 6)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12372 entries, 0 to 12371
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               12372 non-null  object
 1   title             12372 non-null  object
 2   description       12309 non-null  object
 3   genre             11591 non-null  object
 4   publishing house  11707 non-null  object
 5   author            10739 non-null  object
dtypes: object(6)
memory usage: 676.6+ KB


In [11]:
df.columns

Index(['url', 'title', 'description', 'genre', 'publishing house', 'author'], dtype='object')

In [12]:
columns_to_delete = ['url']
df = df.drop(columns=columns_to_delete)

In [13]:
df.isna().sum()

title                  0
description           63
genre                781
publishing house     665
author              1633
dtype: int64

In [14]:
df.shape

(12372, 5)

In [15]:
df.rename(columns={'publishing house': 'publisher'}, inplace=True)

In [16]:
df

Unnamed: 0,title,description,genre,publisher,author
0,Where Is Little Fish?,Play hide-and-seek with Lucy Cousins's Little ...,Activity books,Walker Books Ltd,Lucy Cousins
1,Uncommon Type: Some Stories,A gentle Eastern European immigrant arrives in...,Romance,Audiobooks,Tom Hanks
2,Trumpocracy: The Corruption of the American Re...,"""From Russia to South Africa, from Turkey to t...",Politics,Harper,David Frum
3,The Storyteller of Casablanca,"Morocco, 1941. With France having fallen to Na...",Romance,Lake Union Publishing,Fiona Valpy
4,The Secret Commonwealth: The Book of Dust Volu...,It is twenty years since the events of La Bell...,Dedective,Penguin and David Fickling Books,Philip Pullman
...,...,...,...,...,...
12367,Charlie and the Chocolate Factory Audio CD,Puffin Audiobooks presents Roald Dahl's classi...,,Puffin,Roald Dahl
12368,Case Histories Audio CD,It’s a sweltering summer in Cambridge as forme...,Детектив/Detektiv,Audiobooks,Kate Atkinson
12369,The Cobra Audio CD,The Cocaine industry is worth billions of doll...,Фантастика/Fantastika,Audiobooks,Frederick Forsyth
12370,Boy in Striped Pyjamas Audio CD,,Художественная литература/Bədii ədəbiyyat,Random House,Michael Maloney (read by) John Boyne (author)


In [17]:
#combining columns
df['Description'] = df.apply(lambda row: ' '.join(str(val) for val in row if not pd.isna(val)), axis=1)

#removing unnecessary columns
df.drop(['description', 'genre', 'publisher','author'], axis=1, inplace=True)

In [18]:
df.isna().sum()

title          0
Description    0
dtype: int64

In [19]:
df

Unnamed: 0,title,Description
0,Where Is Little Fish?,Where Is Little Fish? Play hide-and-seek with ...
1,Uncommon Type: Some Stories,Uncommon Type: Some Stories A gentle Eastern E...
2,Trumpocracy: The Corruption of the American Re...,Trumpocracy: The Corruption of the American Re...
3,The Storyteller of Casablanca,"The Storyteller of Casablanca Morocco, 1941. W..."
4,The Secret Commonwealth: The Book of Dust Volu...,The Secret Commonwealth: The Book of Dust Volu...
...,...,...
12367,Charlie and the Chocolate Factory Audio CD,Charlie and the Chocolate Factory Audio CD Puf...
12368,Case Histories Audio CD,Case Histories Audio CD It’s a sweltering summ...
12369,The Cobra Audio CD,The Cobra Audio CD The Cocaine industry is wor...
12370,Boy in Striped Pyjamas Audio CD,Boy in Striped Pyjamas Audio CD Художественная...


In [20]:
df.isna().sum()

title          0
Description    0
dtype: int64

In [21]:
def get_recommendations(input_text, df, cosine_similarities=None):
    # Use the existing cosine similarities if available
    if cosine_similarities is None:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Compute similarity with the input text
    input_vector = tfidf_vectorizer.transform([input_text])
    sim_scores = linear_kernel(input_vector, tfidf_matrix).flatten()

    # Get top 5 similar books
    sim_scores = sorted(enumerate(sim_scores), key=lambda x: x[1], reverse=True)[1:6]
    book_indices = [i[0] for i in sim_scores]

    # Return recommended books
    return df['title'].iloc[book_indices]

In [25]:
# Example usage:
input_text = "Artificial intelligence"
recommended_books = get_recommendations(input_text, df)
print(recommended_books)

8259    Artificial Intelligence: A Guide for Thinking ...
7678               Beginners Plus Artificial Intelligence
3871                                           TrooFriend
8778                   The Power of Physical Intelligence
5575                   Artificial Intelligence Revolution
Name: title, dtype: object
