In [1]:
# Input the sample data as a pandas DataFrame and save it to books.csv
import pandas as pandas
# Raw catalogue: one dict per book. Kept under its own name so the list of
# records and the DataFrame built from it never share a variable.
books = [
    {"id": 1, "title": "To Kill a Mockingbird", "author": "Harper Lee", "genre": "Fiction", "themes": "Coming-of-age, Racism, Injustice", "era": "20th Century"},
    {"id": 2, "title": "1984", "author": "George Orwell", "genre": "Science Fiction", "themes": "Dystopia, Totalitarianism, Surveillance", "era": "20th Century"},
    {"id": 3, "title": "Pride and Prejudice", "author": "Jane Austen", "genre": "Romance", "themes": "Social class, Love, Marriage", "era": "19th Century"},
    {"id": 4, "title": "The Great Gatsby", "author": "F. Scott Fitzgerald", "genre": "Fiction", "themes": "American Dream, Wealth, Decadence", "era": "20th Century"},
    {"id": 5, "title": "Harry Potter and the Sorcerer's Stone", "author": "J.K. Rowling", "genre": "Fantasy", "themes": "Magic, Friendship, Coming-of-age", "era": "20th Century"},
    {"id": 6, "title": "The Catcher in the Rye", "author": "J.D. Salinger", "genre": "Fiction", "themes": "Alienation, Growing up, Identity", "era": "20th Century"},
    {"id": 7, "title": "The Hobbit", "author": "J.R.R. Tolkien", "genre": "Fantasy", "themes": "Adventure, Heroism, Greed", "era": "20th Century"},
    {"id": 8, "title": "Brave New World", "author": "Aldous Huxley", "genre": "Science Fiction", "themes": "Dystopia, Technology, Freedom", "era": "20th Century"},
    {"id": 9, "title": "Jane Eyre", "author": "Charlotte Bronte", "genre": "Romance", "themes": "Love, Independence, Social criticism", "era": "19th Century"},
    {"id": 10, "title": "The Da Vinci Code", "author": "Dan Brown", "genre": "Thriller", "themes": "Conspiracy, Religion, Art", "era": "21st Century"},
    {"id": 11, "title": "The Hunger Games", "author": "Suzanne Collins", "genre": "Young Adult", "themes": "Dystopia, Survival, Rebellion", "era": "21st Century"},
    {"id": 12, "title": "The Alchemist", "author": "Paulo Coelho", "genre": "Fiction", "themes": "Self-discovery, Destiny, Spirituality", "era": "20th Century"},
    {"id": 13, "title": "The Girl with the Dragon Tattoo", "author": "Stieg Larsson", "genre": "Mystery", "themes": "Crime, Corruption, Feminism", "era": "21st Century"},
    {"id": 14, "title": "The Kite Runner", "author": "Khaled Hosseini", "genre": "Historical Fiction", "themes": "Redemption, Friendship, Culture", "era": "21st Century"},
    {"id": 15, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "genre": "Science Fiction", "themes": "Humor, Absurdity, Space travel", "era": "20th Century"},
    {"id": 16, "title": "The Fault in Our Stars", "author": "John Green", "genre": "Young Adult", "themes": "Love, Illness, Coming-of-age", "era": "21st Century"},
    {"id": 17, "title": "The Picture of Dorian Gray", "author": "Oscar Wilde", "genre": "Gothic Fiction", "themes": "Beauty, Corruption, Hedonism", "era": "19th Century"},
    {"id": 18, "title": "The Martian", "author": "Andy Weir", "genre": "Science Fiction", "themes": "Survival, Science, Isolation", "era": "21st Century"},
    {"id": 19, "title": "One Hundred Years of Solitude", "author": "Gabriel García Márquez", "genre": "Magical Realism", "themes": "Family, Time, Solitude", "era": "20th Century"},
    {"id": 20, "title": "The Giver", "author": "Lois Lowry", "genre": "Young Adult", "themes": "Dystopia, Memory, Individuality", "era": "20th Century"}
]

# One row per record; columns follow the key order of the dicts.
df = pandas.DataFrame(books)

# Persist a copy of the catalogue next to the notebook (no index column).
df.to_csv("books.csv", index=False)

In [2]:
# Build one comma-separated text field per book from its genre, themes and era;
# this is the text that gets vectorized later.
df['combined'] = df['genre'].str.cat([df['themes'], df['era']], sep=', ')

In [3]:
# Apply the cosine similarity on the vector we created in the previous step.
!pip install scikit-learn==1.3.0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# fitting the vector and combining text data, then get the cosine similarity matrix

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['combined'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

Collecting scikit-learn==1.3.0
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.3.0


In [4]:
# Create a function that takes a book title as input and returns the top N most similar books.

def get_similar_books(title, n=5, data=None, sim_matrix=None):
  """Return the titles of the ``n`` books most similar to ``title``.

  Args:
    title: Exact title of the reference book (compared with ``==`` against
      the ``title`` column).
    n: Maximum number of recommendations. Values below 1 give an empty result.
    data: DataFrame with a ``title`` column. Defaults to the notebook-level
      ``df``.
    sim_matrix: Square pairwise-similarity matrix in which row/column ``i``
      corresponds to row *position* ``i`` of ``data``. Defaults to the
      notebook-level ``cosine_sim``.

  Returns:
    A pandas Series of titles ordered from most to least similar, carrying
    the index labels of ``data``. The reference book itself is never part
    of the result. Ties keep their original row order.

  Raises:
    IndexError: If no row of ``data`` has the given title.
  """
  if data is None:
    data = df
  if sim_matrix is None:
    sim_matrix = cosine_sim

  # Work with the row *position* rather than the index label: the similarity
  # matrix is positional, so a label is only correct for a default RangeIndex.
  positions = [pos for pos, candidate in enumerate(data['title']) if candidate == title]
  if not positions:
    raise IndexError(f"No book titled {title!r} was found")
  idx = positions[0]

  # Similarity of the reference book to every book (including itself)
  scores = sim_matrix[idx]

  # Exclude the reference book explicitly instead of assuming it sorts first:
  # a book with identical features also scores 1.0 and may precede it.
  others = [pos for pos in range(len(scores)) if pos != idx]

  # sorted() is stable, so equally similar books stay in row order
  ranked = sorted(others, key=lambda pos: scores[pos], reverse=True)

  # Clamp n so a negative value cannot slice from the end of the ranking
  top = ranked[:max(n, 0)]

  # Return the top N most similar books
  return data['title'].iloc[top]

In [5]:
# Smoke test: show the five closest matches for a fantasy title

print(get_similar_books(title='The Hobbit', n=5))


4     Harry Potter and the Sorcerer's Stone
1                                      1984
7                           Brave New World
19                                The Giver
0                     To Kill a Mockingbird
Name: title, dtype: object


In [6]:
# Smoke test: show the five closest matches for a science-fiction title

print(get_similar_books(title='1984', n=5))

7                          Brave New World
17                             The Martian
14    The Hitchhiker's Guide to the Galaxy
19                               The Giver
10                        The Hunger Games
Name: title, dtype: object
