# Find book similarities with cosine similarity

- - - 

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
books = pd.read_csv('./Data/Books_preprocessed.csv')
books_descriptions = books['description']

In [3]:
suma = 0
for i in books['genres']:
    if 'Fiction' in i:
        suma += 1
print(suma, len(books))

2741 13152


In [4]:
for i in range(len(books)):
    print(f'{books["title"][i]}: {books["genres"][i]}')

The Hunger Games: ['Fantasy', 'Science Fiction', 'Romance']
Harry Potter and the Order of the Phoenix: ['Fantasy']
Pride and Prejudice: ['Romance']
Twilight: ['Fantasy', 'Romance']
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings: ['Fantasy']
Gone with the Wind: ['Romance']
The Fault in Our Stars: ['Romance']
The Hitchhiker's Guide to the Galaxy: ['Science Fiction', 'Fantasy']
Wuthering Heights: ['Romance']
Memoirs of a Geisha: ['Romance']
Jane Eyre: ['Romance']
Les Misérables: ['Romance']
Divergent: ['Fantasy', 'Science Fiction', 'Romance']
The Perks of Being a Wallflower: ['Romance']
The Great Gatsby: ['Romance']
City of Bones: ['Fantasy', 'Romance']
Harry Potter and the Sorcerer's Stone: ['Fantasy']
The Time Traveler's Wife: ['Romance', 'Fantasy', 'Science Fiction']
Dracula: ['Horror', 'Fantasy']
One Hundred Years of Solitude: ['Fantasy']
The Princess Bride: ['Fantasy', 'Romance']
The Lightning Thief: ['Fantasy']
The Secret Garden: ['Fantasy']
A Wrinkle in Time:

- - - 

In [5]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


# Define functions for text preprocessing
def preprocess_text(text):
    
    # Lower and remove numbers and punctuation
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', '', text)
    text = ' '.join(text.split())
    
    # Remove stopwords
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    
    # Lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    return ' '.join(lemmatized_words)


# Apply preprocessing, stop word removal, and lemmatization to each description
preprocessed_descriptions = [preprocess_text(desc) for desc in books_descriptions]

In [6]:
# Vectorize preprocessed book descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_descriptions)

In [7]:
# Calculate cosine similarity between descriptions
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print the similarity matrix
print(cosine_similarities)

[[1.         0.         0.00828443 ... 0.00843651 0.0028751  0.02716902]
 [0.         1.         0.         ... 0.00228359 0.01374134 0.00395555]
 [0.00828443 0.         1.         ... 0.         0.         0.01831067]
 ...
 [0.00843651 0.00228359 0.         ... 1.         0.44265752 0.        ]
 [0.0028751  0.01374134 0.         ... 0.44265752 1.         0.        ]
 [0.02716902 0.00395555 0.01831067 ... 0.         0.         1.        ]]


In [8]:
similar_books_indices = cosine_similarities.argsort()[1][::-1]
print(similar_books_indices)

[   1   51   39 ... 6069 6067    0]


In [9]:
for idx in similar_books_indices:
    print(f"Similarity: {cosine_similarities[1][idx]}, Book: {books['title'][idx]}, Saga: {books['series'][idx]}")

Similarity: 0.9999999999999999, Book: Harry Potter and the Order of the Phoenix, Saga: Harry Potter #5
Similarity: 0.25375004205720875, Book: Harry Potter and the Chamber of Secrets, Saga: Harry Potter #2
Similarity: 0.23934086544813232, Book: Harry Potter and the Prisoner of Azkaban, Saga: Harry Potter #3
Similarity: 0.18971769595368115, Book: Harry Potter Schoolbooks Box Set: Two Classic Books from the Library of Hogwarts School of Witchcraft and Wizardry, Saga: Hogwarts Library
Similarity: 0.17918572289491363, Book: Harry Potter and the Goblet of Fire, Saga: Harry Potter #4
Similarity: 0.17556408560699652, Book: Harry Potter and the Sorcerer's Stone, Saga: Harry Potter #1
Similarity: 0.14977091491764954, Book: Hogwarts: An Incomplete and Unreliable Guide, Saga: Pottermore Presents #3
Similarity: 0.14432880745876467, Book: Fantastic Beasts and Where to Find Them, Saga: Hogwarts Library
Similarity: 0.14335834779468395, Book: Harry Potter and the Half-Blood Prince, Saga: Harry Potter #

- - - 