In [1]:
!pip install streamlit



In [2]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ---------------------------
# Load Data
# ---------------------------
@st.cache_data
def load_data():
    df = pd.read_csv("books_clean_data.csv")
    return df

df = load_data()
st.title("📚 Book Recommendation System")

# ---------------------------
# Preprocess
# ---------------------------
# Only use Author, Categories, Publisher
features = ['Author', 'Categories', 'Publisher']

# Make sure columns exist
features = [f for f in features if f in df.columns]

# Fill missing values
for feature in features:
    df[feature] = df[feature].fillna('')

# Combine features into a single text column
df["combined_features"] = df.apply(lambda x: ' '.join([x[f] for f in features]), axis=1)


# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["combined_features"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

book_indices = pd.Series(df.index, index=df['Title']).drop_duplicates()

# ---------------------------
# Recommendation Function
# ---------------------------
def get_recommendations(title, cosine_sim=cosine_sim, df=df, indices=book_indices, top_n=5):
    if title not in indices:
        return []
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # skip self
    book_indices_list = [i[0] for i in sim_scores]
    return df.iloc[book_indices_list][["Title", "Author", "Categories", "Publisher"]]

# ---------------------------
# Metrics Functions
# ---------------------------
def precision_at_k(recommended, relevant, k=10):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(relevant))
    return hits / k if k else 0

def recall_at_k(recommended, relevant, k=10):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(relevant))
    return hits / len(relevant) if relevant else 0

def f1_at_k(recommended, relevant, k=10):
    p = precision_at_k(recommended, relevant, k)
    r = recall_at_k(recommended, relevant, k)
    return 2*p*r / (p+r) if (p+r) else 0

def average_precision(recommended, relevant, k=10):
    score, hits = 0.0, 0
    for i, item in enumerate(recommended[:k], start=1):
        if item in relevant:
            hits += 1
            score += hits / i
    return score / min(len(relevant), k) if relevant else 0

def dcg_at_k(recommended, relevant, k=10):
    dcg = 0.0
    for i, item in enumerate(recommended[:k], start=1):
        if item in relevant:
            dcg += 1 / np.log2(i+1)
    return dcg

def ndcg_at_k(recommended, relevant, k=10):
    dcg = dcg_at_k(recommended, relevant, k)
    idcg = sum(1 / np.log2(i+1) for i in range(1, min(len(relevant), k)+1))
    return dcg / idcg if idcg > 0 else 0

# ---------------------------
# Streamlit Tabs
# ---------------------------
tab1, tab2 = st.tabs(["📖 Recommendations", "📊 Data Insights"])

# ---------------------------
# Tab 1: Recommendations
# ---------------------------
with tab1:
    book_list = df["Title"].dropna().unique()
    selected_book = st.selectbox("📖 Select a book:", book_list)
    top_n = st.slider("How many recommendations?", 1, 10, 5)

    if selected_book:
        st.subheader(f"🔍 Recommendations for: *{selected_book}*")
        recommendations = get_recommendations(selected_book, top_n=top_n)
        st.table(recommendations)

        # Fake relevant items: top 3 from same category
        relevant_books = df[df["Categories"] == df.loc[book_indices[selected_book], "Categories"]]["Title"].head(3).tolist()
        recommended_books = recommendations["Title"].tolist()

        st.subheader("📊 Evaluation Metrics")
        st.write(f"**Precision@{top_n}:** {precision_at_k(recommended_books, relevant_books, top_n):.2f}")
        st.write(f"**Recall@{top_n}:** {recall_at_k(recommended_books, relevant_books, top_n):.2f}")
        st.write(f"**F1@{top_n}:** {f1_at_k(recommended_books, relevant_books, top_n):.2f}")
        st.write(f"**MAP@{top_n}:** {average_precision(recommended_books, relevant_books, top_n):.2f}")
        st.write(f"**NDCG@{top_n}:** {ndcg_at_k(recommended_books, relevant_books, top_n):.2f}")

# ---------------------------
# Tab 2: Data Insights
# ---------------------------
with tab2:
    st.subheader("📊 Top Authors, Publishers, Categories")

    col1, col2 = st.columns(2)

    with col1:
        st.write("**Top 10 Authors**")
        top_authors = df["Author"].value_counts().head(10)
        fig, ax = plt.subplots(figsize=(6,4))
        sns.barplot(y=top_authors.index, x=top_authors.values, palette="viridis", ax=ax)
        st.pyplot(fig)

    with col2:
        st.write("**Top 10 Publishers**")
        top_publishers = df["Publisher"].value_counts().head(10)
        fig, ax = plt.subplots(figsize=(6,4))
        sns.barplot(y=top_publishers.index, x=top_publishers.values, palette="plasma", ax=ax)
        st.pyplot(fig)

    st.write("**Top 10 Categories**")
    top_categories = df["Categories"].value_counts().head(10)
    fig, ax = plt.subplots(figsize=(8,5))
    sns.barplot(y=top_categories.index, x=top_categories.values, palette="cubehelix", ax=ax)
    st.pyplot(fig)

    if "Rating" in df.columns:
        st.write("**Ratings Distribution**")
        fig, ax = plt.subplots(figsize=(8,5))
        sns.histplot(df["Rating"].dropna(), bins=20, kde=True, ax=ax, color="skyblue")
        ax.set_title("Book Ratings Distribution")
        st.pyplot(fig)


2025-08-22 11:45:01.653 
  command:

    streamlit run /Users/abc/miniconda3/lib/python3.13/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-08-22 11:45:01.909 Session state does not function when running a script without `streamlit run`

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_authors.index, x=top_authors.values, palette="viridis", ax=ax)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_publishers.index, x=top_publishers.values, palette="plasma", ax=ax)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_categories.index, x=top_categories.values, palette="cubehelix", ax

In [None]:
!streamlit run Streamlit_book_app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.178.107:8501[0m
[0m

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_authors.index, x=top_authors.values, palette="viridis", ax=ax)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_publishers.index, x=top_publishers.values, palette="plasma", ax=ax)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_categories.index, x=top_categories.values, palette="cubehelix", ax=ax)
