In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
#it is designed to convert a collection of raw text documents into numerical feature vectors,
# representing the relative importance of each word or term in the documents.
from sklearn.metrics.pairwise import cosine_similarity
#it is commonly used to assess the similarity between two text documents represented as numerical vectors, 
# such as TF-IDF vectors.
import matplotlib.pyplot as plt
import seaborn as sns
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/book-recommendation-dataset/Ratings.csv
/kaggle/input/book-recommendation-dataset/Users.csv
/kaggle/input/book-recommendation-dataset/Books.csv
/kaggle/input/book-recommendation-dataset/recsys_taxonomy2.png
/kaggle/input/7k-books-with-metadata/books.csv


In [2]:
books_content= pd.read_csv('/kaggle/input/7k-books-with-metadata/books.csv')
book= pd.read_csv('/kaggle/input/book-recommendation-dataset/Books.csv', low_memory=False)
users=pd.read_csv('/kaggle/input/book-recommendation-dataset/Users.csv')
rating=pd.read_csv('/kaggle/input/book-recommendation-dataset/Ratings.csv')

In [3]:
books_content["description"].head()
books_content[books_content["description"].isnull()].head()
books_content['description'] = books_content['description'].fillna('') 

In [4]:
print(books_content)

             isbn13      isbn10                      title  \
0     9780002005883  0002005883                     Gilead   
1     9780002261982  0002261987               Spider's Web   
2     9780006163831  0006163831               The One Tree   
3     9780006178736  0006178731             Rage of angels   
4     9780006280897  0006280897             The Four Loves   
...             ...         ...                        ...   
6805  9788185300535  8185300534                  I Am that   
6806  9788185944609  8185944601       Secrets Of The Heart   
6807  9788445074879  8445074873             Fahrenheit 451   
6808  9789027712059  9027712050   The Berlin Phenomenology   
6809  9789042003408  9042003405  'I'm Telling You Stories'   

                                            subtitle  \
0                                                NaN   
1                                            A Novel   
2                                                NaN   
3                              

In [5]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(books_content['description'])
tfidf_matrix.shape
tfidf.get_feature_names_out()[0:10]
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)
indices = pd.Series(books_content.index, index=books_content['title'])
indices.index.value_counts()
indices = indices[~indices.index.duplicated(keep='last')]
book_index = indices["Star Wars"]
similarity_scores = pd.DataFrame(cosine_sim[book_index], columns=["score"])

In [7]:
book_indices = similarity_scores.sort_values("score", ascending=False)[1:6].index

In [8]:
books_content['title'].iloc[book_indices]

4623                 You Can Draw Star Wars
5151              The Star Wars Poster Book
6003    The Sideways Guide to Wine and Life
4476                       Grand Theft Auto
6146               Star Wars Tales Volume 1
Name: title, dtype: object

In [9]:
def content_based_recommender(title, cosine_sim, dataframe):
    indices = pd.Series(dataframe.index, index=dataframe['title'])
    indices = indices[~indices.index.duplicated(keep='last')]
    book_index = indices[title]
    similarity_scores = pd.DataFrame(cosine_sim[book_index], columns=["score"])
    book_indices = similarity_scores.sort_values("score", ascending=False)[1:6].index
    return dataframe['title'].iloc[book_indices]
content_based_recommender("Hannibal", cosine_sim, books_content)

2656    The Hannibal Lecter Trilogy
6499      The Patron Saint of Liars
1192           Agricola and Germany
11            Assassin's Apprentice
3364                         Strata
Name: title, dtype: object

In [10]:
book.rename(columns={
    "ISBN":"ID",
    "Book-Title":"Title",
    "Book-Author":"Author",
    "Year-Of-Publication":"Year",
    "Publisher":"Publisher",
    "Image-URL-S":"Image-url"
    },inplace=True)

In [11]:
rating.columns


Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [12]:
rating.rename(columns={
    "User-ID":"user_id",
    "ISBN":"ID",
    "Book-Rating":"Rating"
}, inplace=True)

In [13]:
x=rating['user_id'].value_counts()>200

In [14]:
y=x[x].index
ratings=rating[rating['user_id'].isin(y)]
rating_with_books=rating.merge(book, on ="ID")

In [15]:
num_rating= rating_with_books.groupby('Title')['Rating'].count().reset_index()
num_rating.rename(columns={
    "Rating":"Number_of_rating",
}, inplace=True)
num_rating

Unnamed: 0,Title,Number_of_rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [16]:
final_rating=rating_with_books.merge(num_rating,on ='Title')
final_rating=final_rating[final_rating['Number_of_rating']>=50]
final_rating=final_rating.drop_duplicates({'user_id','Title'})


In [17]:
book_pivot=final_rating.pivot_table(columns='user_id', index='Title',values='Rating')


In [18]:
book_pivot.fillna(0,inplace=True)


In [19]:
from scipy.sparse import csr_matrix
book_sparse=csr_matrix(book_pivot)
from sklearn.neighbors import NearestNeighbors
model=NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [20]:
distance,suggestion=model.kneighbors(book_pivot.iloc[237,:].values.reshape(1,-1),n_neighbors=6)

In [21]:
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id, :].values.reshape(1, -1), n_neighbors=6)
    for i in range(len(suggestion)):
        books_indices = suggestion[i]
        for index in books_indices:
            book = book_pivot.index[index]
            print(book)
def content_based_recommender(title, cosine_sim, dataframe):
    indices = pd.Series(dataframe.index, index=dataframe['title'])
    indices = indices[~indices.index.duplicated(keep='last')]
    book_index = indices[title]
    similarity_scores = pd.DataFrame(cosine_sim[book_index], columns=["score"])
    book_indices = similarity_scores.sort_values("score", ascending=False)[1:6].index
    return dataframe['title'].iloc[book_indices]
def hybrid_recommendation(book_name, cosine_sim, dataframe):
    if book_name in book_pivot.index:
        num_ratings = final_rating[final_rating['Title'] == book_name]['Number_of_rating'].values[0]
        if num_ratings >= 50:
            return recommend_book(book_name)
    return content_based_recommender(book_name,cosine_sim,books_content)
book_name = 'Grand Theft Auto'
hybrid_recommendation(book_name,cosine_sim, books_content)

4622                 Star Wars
4623    You Can Draw Star Wars
4481               Castlevania
6783    Basics Illustration 01
6274        M is for Mayflower
Name: title, dtype: object

In [22]:
import  pickle
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(book_name,open('books_content.pkl','wb'))
pickle.dump(final_rating,open('final_rating.pkl','wb'))
pickle.dump(book_pivot,open('book_pivot.pkl','wb'))
pickle.dump(cosine_sim,open('cosine_sim.pkl','wb'))