In [None]:
pip install pandas

In [None]:
pip install matplotlib

In [None]:
pip install scikit-learn

In [None]:
pip install -U sentence-transformers

In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import array 
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer as sentran

In [6]:
def isbn10_to_isbn13(isbn10):
    """Convert an ISBN-10 into its "978"-prefixed ISBN-13 form.

    Args:
        isbn10: The ISBN-10, normally a string. It is passed through ``str()``
            first, and hyphens and whitespace are ignored, so values such as
            ``"0-306-40615-2"`` are accepted. Values shorter than 10 characters
            are left-padded with zeros (an empty string is therefore treated
            as ``"0000000000"``).

    Returns:
        str: ``"978"`` + the ISBN-10 without its final (check) character + the
        freshly computed ISBN-13 check digit.

    Raises:
        ValueError: If any character that ends up in the ISBN-13 body is not
            a decimal digit.
    """
    # Normalise: tolerate non-str input and the separators commonly printed in ISBNs.
    isbn10 = "".join(ch for ch in str(isbn10) if ch != "-" and not ch.isspace())

    # zfill is a no-op for values that already have 10+ characters.
    isbn10 = isbn10.zfill(10)

    # Drop the ISBN-10 check character (may be 'X') and add the '978' prefix.
    isbn13_body = '978' + isbn10[:-1]

    # ISBN-13 check digit: weights alternate 1, 3, 1, 3, ... from the left.
    check_sum = sum(
        int(digit) * (1 if position % 2 == 0 else 3)
        for position, digit in enumerate(isbn13_body)
    )
    check_digit = (10 - (check_sum % 10)) % 10

    return isbn13_body + str(check_digit)



In [7]:
#function to clean up genre information in the data frame
def genre_str_to_arr(genre):
    """Turn a stringified genre list into a real list of genre names.

    Args:
        genre: Text of the form ``"['fantasy', 'fiction']"``.

    Returns:
        list[str]: The genre names in their original order; an empty list
        when the text holds no genres (e.g. ``"[]"``).
    """
    # Strip the leading "['" and trailing "']", then any remaining quotes.
    inner = genre[2:-2].replace("'", "")

    # "".split(", ") would give [''], i.e. one bogus empty genre.
    if not inner:
        return []

    return inner.split(", ")

In [9]:
#function to find cosine similarity
def sim_score_(emb1, emb2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``emb1`` and ``emb2`` divided by the
    product of their Euclidean norms.
    """
    dot_product = np.dot(emb1, emb2)
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return dot_product / norm_product


In [10]:
# Load the raw CSV files and build `booksdf`, the cleaned books frame the rest
# of the notebook works with.
# NOTE: DATA_DIR is machine-specific; point it at your own copy of the raw data.
DATA_DIR = "/Users/oreoluwaala/Documents/Book Recommendation Project/rawdata"
books = f"{DATA_DIR}/books.csv"
ratings = f"{DATA_DIR}/ratings.csv"
tbr = f"{DATA_DIR}/to_read.csv"

# Missing ISBNs become empty strings so isbn10_to_isbn13 always receives text.
books_dataframe = pd.read_csv(books)
books_dataframe['isbn'] = books_dataframe['isbn'].fillna('')

# Keep only the columns used downstream; .copy() gives an independent frame so
# the column assignments below do not write through to books_dataframe.
booksdf = books_dataframe[["title", "authors", "average_rating", "book_id", "original_publication_year", "genres", "pages", "description"]].copy()

# Publication year as a plain integer, with -1 marking a missing year.
# The column is assigned directly: on pandas >= 2.0, `booksdf.loc[:, col] = ...`
# writes into the existing float column, so the integer cast would be lost.
booksdf['original_publication_year'] = booksdf['original_publication_year'].fillna(-1).astype(int)

# Derive the ISBN-13 from the (ISBN-10) `isbn` column of the raw frame.
booksdf['isbn13'] = books_dataframe['isbn'].apply(isbn10_to_isbn13)

# Parse the stringified genre lists into real Python lists.
booksdf['genres'] = booksdf['genres'].apply(genre_str_to_arr)

# Final column order, now including isbn13.
booksdf = booksdf[["title", "authors", "book_id", "isbn13", "average_rating", "original_publication_year", "genres", "pages", "description"]]

# ratings dataframe
ratingsdf = pd.read_csv(ratings)

# to-be-read dataframe
tbrdf = pd.read_csv(tbr)


In [None]:
# Recommendation based on similar books via description.
# Restrict the working set to the first 1000 rows of booksdf; note that this
# rebinds `books_dataframe`, which previously held the raw books.csv frame.
books_dataframe = booksdf.head(1000)
books_dataframe.head()

In [28]:
# Load the pretrained sentence-transformers model used to embed descriptions.
sbert_model_name = 'bert-base-nli-mean-tokens'
sbert_model = sentran(sbert_model_name)

In [29]:
# Generate one sentence embedding per book description. Row i of `embeddings`
# corresponds to row position i of books_dataframe.

# convert descriptions to a list
descriptions = books_dataframe["description"].tolist()

# number of descriptions the model encodes per batch
batch_size = 32

# encode() batches the inputs itself when given a list, so no manual batching
# loop / np.vstack is needed; the result is a single 2-D numpy array with one
# row per description, in the same order as `descriptions`.
embeddings = sbert_model.encode(descriptions, batch_size=batch_size)

In [30]:
# Containers for the genre embeddings.
# genre_emb: row position -> genre weight vector (filled in by later cells).
# genre_map: genre name -> slot index inside a genre weight vector.
genre_emb = dict()
genre_names = [
    'art', 'biography', 'books', 'business', 'chick-lit', 'christian',
    'classics', 'comics', 'contemporary', 'cookbooks', 'crime', 'fantasy',
    'fiction', 'gay-and-lesbian', 'graphic-novels', 'historical-fiction',
    'history', 'horror', 'humor-and-comedy', 'manga', 'memoir', 'music',
    'mystery', 'nonfiction', 'paranormal', 'philosophy', 'poetry',
    'psychology', 'religion', 'romance', 'science', 'science-fiction',
    'self-help', 'spirituality', 'sports', 'suspense', 'thriller', 'travel',
    'young-adult',
]
genre_map = {name: slot for slot, name in enumerate(genre_names)}

In [31]:
# Give every book (keyed by its row position) a fresh all-zero vector with
# 39 slots, one per entry of genre_map.
for row_pos in range(len(books_dataframe["genres"])):
    genre_emb[row_pos] = [0.0] * 39

In [None]:
# Fill in the genre weights. A book's genres are weighted by their position
# in its list: the first genre gets the largest share, and the weights of one
# book add up to 100.
# `l` keeps its original name since it is a notebook-level variable that other
# cells may read.
for i, genres in enumerate(books_dataframe["genres"]):
    l = len(genres)
    weight_total = sum(range(l + 1))  # 1 + 2 + ... + l, constant per book
    for k, genre in enumerate(genres):
        genre_emb[i][genre_map[genre]] = (l - k) * 100 / weight_total

# Flatten the {row position: vector} dict into a list ordered by row position.
genre_list = [0] * len(genre_emb)
for key, values in genre_emb.items():
    genre_list[key] = values

# Show a small sample only instead of dumping every vector. The loop variable
# is deliberately not called `list`, which would shadow the builtin for the
# rest of the kernel session.
for genre_vector in genre_list[:5]:
    print(len(genre_vector))
    print(genre_vector)
    

In [None]:
# Cluster the per-book genre vectors into 9 groups (fixed seed for
# reproducible cluster assignments).
kmeans_settings = {"n_clusters": 9, "random_state": 2, "n_init": "auto"}
kmeans_genre = KMeans(**kmeans_settings)
kmeans_genre.fit(genre_list)

In [34]:
# Group book positions by their assigned cluster:
# cluster label -> list of positions (in genre_list order) in that cluster.
cluster_map = {}
for book_pos, label in enumerate(kmeans_genre.labels_):
    cluster_map.setdefault(label, []).append(book_pos)

In [35]:
# Positional index (into booksdf) of the book to generate recommendations for.
new_bk_idx = 0

In [None]:
# Predicting for a new entry: recommendations are chosen by genre cluster and
# then filtered through description similarity.
# Pull the query book's fields from a single row lookup and start its genre
# vector as all zeros (39 slots, one per entry of genre_map).
query_row = booksdf.iloc[new_bk_idx]
newdesc = query_row["description"]
newtitle = query_row["title"]
newgenre = query_row["genres"]
newgenre_emb = [0.0] * 39
print(newdesc, newtitle, newgenre)

In [37]:
# Getting the new genre embedding.
# The weights must be based on the new book's OWN number of genres; relying
# on a length variable left over from an earlier loop would use the genre
# count of whichever book that loop processed last.
num_genres = len(newgenre)
weight_total = sum(range(num_genres + 1))  # 1 + 2 + ... + num_genres
for k, genre in enumerate(newgenre):
    newgenre_emb[genre_map[genre]] = (num_genres - k) * 100 / weight_total

In [None]:
# Getting what cluster the new book belongs to, then listing that cluster.
cluster = kmeans_genre.predict([newgenre_emb])[0]

# cluster_map stores row *positions*, so index positionally with .iloc rather
# than by label with .loc (the two only coincide for a default RangeIndex).
for i in cluster_map[cluster]:
    book = books_dataframe.iloc[i]
    print(book["title"])
    print(book["isbn13"])

In [39]:
# Embed the new entry's description with the same SBERT model that produced
# `embeddings`, so the vectors can be compared directly.
newbook_emb = sbert_model.encode(newdesc)

In [40]:
# Cosine similarity between the new book and every catalogue embedding.
similarity_score = []
for catalogue_emb in embeddings:
    similarity_score.append(sim_score_(catalogue_emb, newbook_emb))

# Positions of the 100 most similar books, best match first.
ascending_order = np.argsort(np.array(similarity_score))
recsidx = ascending_order[::-1][:100]


In [None]:
# Filtering genre-cluster info through description cosine-similarity info:
# walk the most similar books (best first) and print up to 10 that fall in the
# same genre cluster as the new book.
cluster_members = set(cluster_map[cluster])  # set => O(1) membership checks

i = 0
for idx in recsidx:
    if i >= 10:
        break  # enough recommendations; no need to scan the rest
    # books_dataframe is the leading slice of booksdf, so position new_bk_idx
    # is the query book itself -- never recommend a book to itself.
    if idx == new_bk_idx or idx not in cluster_members:
        continue
    i += 1
    # recsidx holds row positions (argsort output), hence .iloc.
    title = books_dataframe.iloc[idx]["title"]
    print(title, similarity_score[idx], idx)




In [None]:
# Preview the first rows of the ratings data loaded from ratings.csv.
ratingsdf.head()

In [None]:
# Preview the first rows of the to-be-read data loaded from to_read.csv.
tbrdf.head()