In [2]:
!pip install pandas numpy matplotlib seaborn scikit-learn openpyxl streamlit



In [3]:
# Import libraries (combined)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import streamlit as st

In [6]:
# --- Section 1: Simple Recommender (Popularity/Weighted Score) ---
# Streamlit page title, followed by the heading for the popularity-based section.
st.title("Combined Book Recommender System")
st.header("Section 1: Simple Recommender (Popularity/Weighted Score)")



DeltaGenerator()

In [8]:
# Load the ratings and the book metadata (both files are read as cp1252), then
# join them on `book_id` so every rating row also carries its book's columns.
# merge() defaults to an inner join, so ratings without a matching book are dropped.
ratings = pd.read_csv('Ratings.csv', encoding='cp1252')
books = pd.read_csv('Books.csv', encoding='cp1252')
data = ratings.merge(books, on='book_id')

In [9]:
# Per-title rating statistics: the average rating ('mean') and the number of
# ratings ('count'), with 'title' restored as a regular column.
ratings_mean_count = (
    data.groupby('title')['rating']
    .agg(mean='mean', count='count')
    .reset_index()
)

In [10]:
# C: the prior rating used by the weighted score.
# Note this is the unweighted mean of the per-title mean ratings, so every title
# counts equally no matter how many ratings it received.
C = ratings_mean_count['mean'].mean()

In [11]:
# m: the minimum number of ratings a title needs to qualify, taken as the
# 90th percentile of the per-title rating counts.
m = ratings_mean_count['count'].quantile(0.90)

In [19]:
# Keep only the titles that have at least m ratings.
# .copy() makes `qualified` an independent frame: a column is added to it later,
# and writing into a filtered slice raises SettingWithCopyWarning and leaves it
# ambiguous whether ratings_mean_count is modified too.
qualified = ratings_mean_count[ratings_mean_count['count'] >= m].copy()

In [23]:
# Weighted rating, styled after the IMDB formula.
def weighted_rating(x, m=m, C=C):
    """Blend a title's own mean rating with the global prior rating.

    Evaluates ``v / (v + m) * R + m / (v + m) * C`` with ``v = x['count']``
    and ``R = x['mean']``.

    Args:
        x: Object indexable by 'count' and 'mean' (e.g. a row of the
            per-title statistics frame).
        m: Vote threshold acting as the weight of the prior. The default is
            the value the notebook-level ``m`` had when this ``def`` was run;
            re-run this cell after changing ``m``.
        C: Prior rating. The default is captured from the notebook-level
            ``C`` in the same way.

    Returns:
        The weighted score.
    """
    votes = x['count']
    own_mean = x['mean']
    total = votes + m
    own_weight = votes / total
    prior_weight = m / total
    return (own_weight * own_mean) + (prior_weight * C)

In [25]:
# Compute the weighted score for every qualified title.
# weighted_rating only does arithmetic on x['count'] and x['mean'], so it can be
# called once on the whole frame instead of row by row through apply().
# assign() returns a new frame, so nothing is written into what may be a slice
# of ratings_mean_count (no SettingWithCopyWarning), and the cell can be re-run.
qualified = qualified.assign(weighted_score=weighted_rating(qualified))

In [26]:
# Rank the qualified titles by weighted score, best first, and keep the first ten.
ranked_titles = qualified.sort_values(by='weighted_score', ascending=False)
top_books = ranked_titles.iloc[:10]

In [27]:
# Show the top-10 table in the Streamlit app: title, average rating, number of
# ratings and the weighted score used for the ranking.
st.write("Top 10 Books:")
st.table(top_books[['title', 'mean', 'count', 'weighted_score']])



DeltaGenerator()

In [28]:
# --- Section 2: Content-Based Recommender (Genre/Description Similarity) ---
st.header("Section 2: Content-Based Recommender (Genre/Description Similarity)")
# Build one text field per book from genre + description.
# Both columns are NaN-filled before concatenating: string + NaN yields NaN, and
# a NaN document makes TfidfVectorizer.fit_transform raise.
books['features'] = books['genre'].fillna('') + ' ' + books['description'].fillna('')



In [29]:
# Turn each book's 'features' text into a TF-IDF vector (English stop words
# removed) so that books can be compared numerically.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['features'])

In [30]:
# Pairwise similarity between all books: cosine_sim[i][j] is the dot product of
# the TF-IDF vectors of book i and book j.
# NOTE(review): TfidfVectorizer L2-normalises rows by default, in which case this
# dot product equals cosine similarity — re-check if the vectorizer's `norm` changes.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [31]:
# Map each title to its row label in `books`, so a typed title can be looked up.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct for a freshly read CSV), so it never removed repeated titles.
# De-duplicate on the index instead and keep the first occurrence, so that
# indices[title] is always a single label rather than a Series.
indices = pd.Series(books.index, index=books['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [40]:
# Content-based recommendation function
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the books most similar to `title`.

    Args:
        title: Exact book title to look up in `indices`.
        cosine_sim: Square similarity matrix whose rows/columns follow the row
            order of `books`. The default is the matrix that existed when this
            ``def`` was run.
        top_n: Number of recommendations to return (5, as before, by default).

    Returns:
        A pandas Series with the titles of the `top_n` most similar books,
        most similar first.

    Raises:
        KeyError: If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row labels;
    # use its first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending order: ties keep the lower row first, matching the
    # tie order of sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query book explicitly. Slicing off the first entry is wrong when
    # another book ties with it (e.g. identical features) and sorts ahead of it.
    nearest = ranked[ranked != idx][:top_n]
    return books['title'].iloc[nearest]

In [41]:
# Streamlit UI for the content-based recommender.
# The text box is pre-filled with a sample title; recommendations are computed
# only when the button is pressed.
book_input_cb = st.text_input("Content-Based Book Title", "Harry Potter and the Sorcerer's Stone")
if st.button("Get Content-Based Recommendations"):
    try:
        recs_cb = get_recommendations(book_input_cb)
        for rec in recs_cb:
            st.write(rec)
    except KeyError:
        # get_recommendations raises KeyError when the title is not in `indices`.
        st.write("Book not found, check spelling.")
# Always show the first few titles so the user has valid inputs to try.
st.write("Example titles:", books['title'].head().to_list())



In [42]:
# --- Section 3: Collaborative Filtering (People who liked X also liked…) ---
# Heading for the rating-pattern based recommender in the Streamlit app.
st.header("Section 3: Collaborative Filtering (People who liked X also liked…)")



DeltaGenerator()

In [43]:
# Collaborative filtering needs enough signal per user and per book, so keep
# only users with at least 3 ratings and books with at least 5 ratings.
MIN_RATINGS_PER_USER = 3
MIN_RATINGS_PER_BOOK = 5

# Number of ratings given by each user, and the users meeting the threshold.
num_ratings_per_user = data.groupby('user_id').size()
active_users = num_ratings_per_user[num_ratings_per_user >= MIN_RATINGS_PER_USER].index

# Number of ratings received by each book, and the books meeting the threshold.
num_ratings_per_book = data.groupby('book_id').size()
popular_books = num_ratings_per_book[num_ratings_per_book >= MIN_RATINGS_PER_BOOK].index

# Final dataset: rows whose user is active AND whose book is popular.
# Both thresholds are evaluated on the unfiltered `data`.
from_active_user = data['user_id'].isin(active_users)
on_popular_book = data['book_id'].isin(popular_books)
filtered_data = data[from_active_user & on_popular_book]

In [44]:
# Book x user rating matrix: one row per title, one column per user_id, each cell
# holding that user's rating of that title (pivot_table's default aggregation,
# the mean, is applied when a title/user pair occurs more than once).
# Pairs with no rating are filled with 0, so "not rated" is treated as a 0 rating
# by the similarity computation that follows.
bookmatrix = filtered_data.pivot_table(index='title', columns='user_id', values='rating').fillna(0)


In [45]:
# Cosine similarity between every pair of rows (titles) of bookmatrix:
# similarity_scores[i][j] compares the rating vectors of title i and title j,
# in the same order as bookmatrix.index.
similarity_scores = cosine_similarity(bookmatrix)

In [49]:
# Collaborative-filtering recommendation function: given a book title, find the
# titles whose rating patterns are most similar and return them with their authors.
def recommend(book_name, top_n=4):
    """Return the books rated most similarly to `book_name`.

    Args:
        book_name: Exact title to look up in `bookmatrix.index`.
        top_n: Number of recommendations to return (4, as before, by default).

    Returns:
        A list of ``[title, author]`` lists, most similar first, or the string
        "Book not found or no recommendations available." when `book_name` is
        not a row of `bookmatrix`.

    Only the "title not found" case is turned into the message string; other
    errors (for example a missing 'AUTHOR' column) now propagate instead of
    being reported as a missing book.
    """
    positions = np.where(bookmatrix.index == book_name)[0]
    if len(positions) == 0:
        return "Book not found or no recommendations available."
    index = positions[0]

    scores = np.asarray(similarity_scores[index])
    # Stable descending order: ties keep the lower row first, matching the
    # tie order of sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query book explicitly; it is not guaranteed to sort first
    # (another title can tie with it).
    neighbours = ranked[ranked != index][:top_n]

    recommendations = []
    for pos in neighbours:
        match = books[books['title'] == bookmatrix.index[pos]].drop_duplicates('title')
        if match.empty:
            # No metadata row for this title: skip it rather than emit an
            # empty entry the caller cannot index into.
            continue
        recommendations.append(list(match['title'].values) + list(match['AUTHOR'].values))
    return recommendations

In [50]:
# Streamlit UI for the collaborative-filtering recommender.
# The text box is pre-filled with a sample title; recommendations are computed
# only when the button is pressed.
book_input_cf = st.text_input("Collaborative Filtering Book Title", "Harry Potter and the Sorcerer's Stone")
if st.button("Get Collaborative Recommendations"):
    recs_cf = recommend(book_input_cf)
    if not isinstance(recs_cf, list):
        # Anything that is not a list is shown as-is (recommend() returns its
        # "not found" message this way).
        st.write(recs_cf)
    else:
        # Each entry is indexed as [title, author].
        for rec in recs_cf:
            st.write(f"Recommended Book: {rec[0]}, Author: {rec[1]}")
st.write("Note: More ratings improve recommendations.")

