In [None]:
# 📚 Basic Libraries
import numpy as np 
import pandas as pd
import warnings

# 📊 Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# 🤖 Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.sparse import hstack
from sklearn.metrics import silhouette_score

In [None]:
# Load the combined book dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="combined_data.csv")

In [None]:
# Preview only the first rows of the raw data instead of rendering the whole frame
df.head()

### Clean and preprocess the data

In [None]:
# Discard every row whose title or description is missing.
# .copy() yields an independent frame, so the column assignments in later
# cells operate on df_cleaned itself rather than on a view of df.
df_cleaned = df.dropna(axis=0, how="any", subset=["title", "description"]).copy()

In [None]:
# Fill missing values in 'authors', 'category' with 'Unknown'
for text_col in ("authors", "category"):
    df_cleaned[text_col] = df_cleaned[text_col].fillna("Unknown")

# Rating columns: coerce to numeric FIRST, then fill.
# errors="coerce" turns every unparseable entry into NaN; filling afterwards
# guarantees that both originally-missing and freshly-coerced values end up
# as 0 (treated as "unrated"), so no NaN reaches the scaler / clustering steps.
for rating_col in ("averageRating", "ratingsCount"):
    df_cleaned[rating_col] = pd.to_numeric(df_cleaned[rating_col], errors="coerce").fillna(0)

In [None]:
# Show how many entries remain and some basic stats.
# info() prints its report and returns None, so it is called as its own
# statement instead of being packed into a tuple with the summary table
# (a tuple would be shown as a plain-text "(None, ...)" repr).
df_cleaned.info()

# Last expression of the cell -> rendered as a table.
df_cleaned.describe(include="all").T

### Feature Engineering for Clustering

In [None]:
# Turn every book description into a TF-IDF vector so that books with
# similar themes can be grouped together. English stop words are ignored
# and the vocabulary is capped at 200 terms to keep the computation fast.
tfidf = TfidfVectorizer(
    max_features=200,
    stop_words="english",
)
tfidf_matrix = tfidf.fit_transform(df_cleaned["description"])

In [None]:
# Standardise the numeric columns (rating and ratingsCount):
# fit the scaler on them, then apply the fitted transformation.
numeric_features = df_cleaned.loc[:, ["averageRating", "ratingsCount"]]
scaler = StandardScaler().fit(numeric_features)
numeric_scaled = scaler.transform(numeric_features)

In [None]:
# Combine text features with numeric ones (columns placed side by side).
# format="csr" asks for a CSR result explicitly instead of leaving the output
# format to hstack's default: CSR supports row indexing and is the sparse
# format scikit-learn's KMeans works on, which avoids a conversion copy on
# every fit performed in the cells below.
combined_features = hstack([tfidf_matrix, numeric_scaled], format="csr")

In [None]:
# Apply KMeans clustering (first pass with 5 clusters).
# n_init is given explicitly: scikit-learn changed its default (10 -> "auto"),
# so omitting it makes the result depend on the installed version and emits
# a FutureWarning on some releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df_cleaned["cluster"] = kmeans.fit_predict(combined_features)

In [None]:
# Number of books assigned to each cluster, listed in cluster-label order
df_cleaned.loc[:, "cluster"].value_counts().sort_index(ascending=True)

In [None]:
# Peek at the first five rows, now including the "cluster" column
df_cleaned.head(n=5)

### Elbow method for optimal k

In [None]:
# Elbow method: fit one KMeans model per candidate k and record its inertia.
inertias = []
range_of_clusters = range(1, 11)

for k in range_of_clusters:
    # n_init is given explicitly so the curve does not depend on the
    # scikit-learn version (the default changed from 10 to "auto").
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(combined_features)
    inertias.append(model.inertia_)

# One inertia value per k, in the same order as range_of_clusters
inertias

In [None]:
# Interactive elbow curve: inertia as a function of the number of clusters
fig = go.Figure(
    data=[
        go.Scatter(
            x=list(range_of_clusters),
            y=inertias,
            mode='lines+markers',
            name='Inertia',
        )
    ]
)
fig.update_layout(
    title='Elbow Method For Optimal k',
    xaxis_title='Number of clusters, k',
    yaxis_title='Inertia',
    # One tick per evaluated k
    xaxis=dict(tickmode='array', tickvals=list(range_of_clusters)),
)
fig.show()

### Silhouette Score

In [None]:
# Silhouette analysis: one score per candidate k.
# The list stays index-aligned with range_of_clusters so it can be plotted
# against it directly.
silhouette_scores = []
for k in range_of_clusters:
    # Use a throwaway name for the per-k estimator so the fitted `kmeans`
    # model from the clustering cell is not overwritten by this loop.
    # n_init is explicit so scores do not depend on the scikit-learn version.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = model.fit_predict(combined_features)

    # The silhouette coefficient needs at least two distinct labels
    # (this is never the case for k = 1).
    if len(set(labels)) > 1:
        score = silhouette_score(combined_features, labels)
    else:
        # Undefined, not 0: 0 is a legitimate silhouette value and would be
        # drawn as a real data point; NaN is left out of the line plot.
        score = np.nan

    silhouette_scores.append(score)

In [None]:
# Display the collected scores; entries are in the same order as range_of_clusters
silhouette_scores

In [None]:
# Plot the silhouette score for every candidate k on its own axes.
# plt.subplot(1, 2, 2) selected the right-hand slot of a 1x2 grid whose
# left slot was never drawn, leaving half of the figure blank; an explicit
# figure/axes pair uses the whole canvas.
sil_fig, sil_ax = plt.subplots()
sil_ax.plot(list(range_of_clusters), silhouette_scores, marker='o', color='green')
sil_ax.set_title('Silhouette Score vs. k')
sil_ax.set_xlabel('Number of clusters (k)')
sil_ax.set_ylabel('Silhouette Score')
sil_ax.set_xticks(list(range_of_clusters))  # k is an integer: one tick per value
plt.show()  # render the figure without echoing the last call's return value

### Set k = 4

In [None]:
# Refit with the chosen k = 4 and overwrite the earlier cluster assignment.
# n_init is given explicitly: scikit-learn changed its default (10 -> "auto"),
# so omitting it makes the final clustering depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df_cleaned["cluster"] = kmeans.fit_predict(combined_features)

In [None]:
# Preview the first rows with the updated "cluster" column instead of
# rendering the whole frame
df_cleaned.head()

In [None]:
# Cluster sizes after the refit, listed in cluster-label order
df_cleaned.loc[:, "cluster"].value_counts().sort_index(ascending=True)

### Book recommender

In [None]:
def recommend_similar_books_kmeans(book_index, df, cluster_col="cluster", n=10, random_state=42):
    """Recommend books that share a cluster with the given book.

    Parameters
    ----------
    book_index : hashable
        Index *label* of the reference book. It is looked up with ``df.loc``,
        so it is a label of ``df.index``, not a row position.
    df : pandas.DataFrame
        Frame with one row per book and a column of cluster labels.
    cluster_col : str, default "cluster"
        Name of the column holding the cluster label.
    n : int, default 10
        Maximum number of recommendations to return.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. The default reproduces the
        previous fixed-seed behaviour; pass ``None`` for a fresh draw.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` randomly sampled rows of ``df`` from the same cluster as
        the reference book, never including the reference book itself.
        Empty when no other book is in that cluster.

    Raises
    ------
    KeyError
        If ``cluster_col`` is not a column of ``df`` or ``book_index`` is not
        a label in ``df.index``.
    """
    if cluster_col not in df.columns:
        raise KeyError(f"Column {cluster_col!r} not found in df")
    if book_index not in df.index:
        # The label may be absent if rows were removed from the frame earlier.
        raise KeyError(f"book_index {book_index!r} is not a label in df.index")

    cluster_label = df.loc[book_index, cluster_col]
    in_same_cluster = df[cluster_col] == cluster_label
    is_other_book = df.index != book_index
    similar_books = df[in_same_cluster & is_other_book]

    # Never request more rows than are available, otherwise sample() raises.
    sample_size = min(n, len(similar_books))
    return similar_books.sample(sample_size, random_state=random_state)

In [None]:
# Reference book, identified by its index label in df_cleaned
book_index = 10
recommended_books_kmeans = recommend_similar_books_kmeans(
    book_index=book_index,
    df=df_cleaned,
)

# Show only the columns that describe each recommended book
recommended_books_kmeans[
    [
        "title",
        "authors",
        "category",
        "description",
        "averageRating",
        "ratingsCount",
    ]
]