In [3]:
# Step 1: Data Understanding & Cleaning

import pandas as pd
import numpy as np

# -------------------------------
# Load Dataset
# -------------------------------
# Replace with your dataset path
df = pd.read_csv(r"C:\Users\urjam\Goodreads\GoodReads_100k_books.csv")

# Preview first few rows
df.head()


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [4]:
# Shape of dataset
print("Shape:", df.shape)

# Columns and data types
print("\nInfo:")
print(df.info())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Quick stats
print("\nStats:")
print(df.describe(include="all"))


Shape: (100000, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB
None

Missing Values:
author              0
bookformat       3228
desc             6772
genre           10467
img              3045
isbn     

In [26]:
# Drop columns not needed for recommendations
drop_cols = ["link"]
df = df.drop(columns=drop_cols, errors="ignore")

print("Remaining Columns:", df.columns.tolist())


Remaining Columns: ['author', 'bookformat', 'desc', 'genre', 'isbn', 'isbn13', 'pages', 'rating', 'reviews', 'title', 'totalratings', 'rating_norm', 'reviews_norm', 'totalratings_norm', 'pages_norm']


In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize ratings, reviews, totalratings, pages
for col in ["rating", "reviews", "totalratings", "pages"]:
    if col in df.columns:
        df[col] = df[col].fillna(0)  # handle missing numeric values
        df[col + "_norm"] = scaler.fit_transform(df[[col]])

df.head()


Unnamed: 0,author,bookformat,desc,genre,isbn,isbn13,pages,rating,reviews,title,totalratings,rating_norm,reviews_norm,totalratings_norm,pages_norm
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",002914180X,9780000000000.0,0,3.52,5,Between Two Fires: American Indians in the Civ...,33,0.704,0.002764,0.001738,0.0
1,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",Unknown,2761920813,,177,4.0,1,Les oiseaux gourmands,1,0.8,0.000553,5.3e-05,0.148615
2,Umberto Eco,Hardcover,In the course of the long debate on the nature...,History,9027232938,9790000000000.0,0,4.1,0,On The Medieval Theory Of Signs,10,0.82,0.0,0.000527,0.0
3,John C. Maxwell,Hardcover,"In the spirit of his best-selling ,The 21 Irre...","Leadership,Business,Self Help,Nonfiction,Chris...",785274324,20049074329.0,369,4.05,51,The 21 Most Powerful Minutes in a Leader's Day...,612,0.81,0.028192,0.032229,0.309824
4,Nicene creed,Nook,Unknown,Christian,,2940000000000.0,0,3.29,0,The creeds of Nicea; Constantinople; Athanasiu...,7,0.658,0.0,0.000369,0.0


In [28]:
# Fill missing text fields with placeholder
text_cols = ["author", "desc", "genre", "title"]
for col in text_cols:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")

# Check again
df.isnull().sum()


author                 0
bookformat            15
desc                   0
genre                  0
isbn                 100
isbn13                 1
pages                  0
rating                 0
reviews                0
title                  0
totalratings           0
rating_norm            0
reviews_norm           0
totalratings_norm      0
pages_norm             0
dtype: int64

In [29]:
# Prefer isbn13 over isbn if both exist
if "isbn13" in df.columns:
    df = df.drop_duplicates(subset="isbn13", keep="first")
elif "isbn" in df.columns:
    df = df.drop_duplicates(subset="isbn", keep="first")

print("After Deduplication:", df.shape)


After Deduplication: (213, 15)


In [30]:
df.to_csv("books_dataset_cleaned.csv", index=False)
print("✅ Cleaned dataset saved as books_dataset_cleaned.csv")

✅ Cleaned dataset saved as books_dataset_cleaned.csv


In [31]:
# Step 2: Feature Engineering

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy.sparse import hstack

# Load the cleaned dataset
df = pd.read_csv("books_dataset_cleaned.csv")

# -------------------------------
# TF-IDF on Book Descriptions
# -------------------------------
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)  # limit features for speed
desc_matrix = tfidf.fit_transform(df["desc"])

print("Description Matrix Shape:", desc_matrix.shape)

# -------------------------------
# Genre Vectorization
# -------------------------------
count_vect = CountVectorizer(token_pattern=r"[^,]+")  # handle genres separated by commas
genre_matrix = count_vect.fit_transform(df["genre"].astype(str))

print("Genre Matrix Shape:", genre_matrix.shape)

# -------------------------------
# One-Hot Encode Author
# -------------------------------
ohe = OneHotEncoder(handle_unknown="ignore")
author_matrix = ohe.fit_transform(df[["author"]])

print("Author Matrix Shape:", author_matrix.shape)

# -------------------------------
# Combine All Features
# -------------------------------
# We also add normalized numeric features if available
num_features = ["rating_norm", "reviews_norm", "totalratings_norm", "pages_norm"]
num_matrix = df[num_features].values

# Final feature matrix
from scipy.sparse import csr_matrix
import numpy as np

final_matrix = hstack([desc_matrix, genre_matrix, author_matrix, csr_matrix(num_matrix)])

print("Final Matrix Shape:", final_matrix.shape)

Description Matrix Shape: (213, 5000)
Genre Matrix Shape: (213, 314)
Author Matrix Shape: (213, 205)
Final Matrix Shape: (213, 5523)


In [32]:
import joblib

# Save vectorizers and encoders
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(count_vect, "genre_vectorizer.pkl")
joblib.dump(ohe, "author_encoder.pkl")

print("✅ Vectorizers saved for reuse")


✅ Vectorizers saved for reuse


In [33]:
import joblib

# Save final_matrix and dataframe
joblib.dump(final_matrix, "final_matrix.pkl")
df.to_csv("books_dataset_cleaned.csv", index=False)

print("✅ Saved final_matrix.pkl and cleaned dataset")


✅ Saved final_matrix.pkl and cleaned dataset


In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from scipy.sparse import hstack, csr_matrix
import joblib

# Load cleaned dataset
df = pd.read_csv("books_dataset_cleaned.csv")

# -------------------------------
# TF-IDF on Book Descriptions
# -------------------------------
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
desc_matrix = tfidf.fit_transform(df["desc"].astype(str))

# -------------------------------
# Genre Vectorization
# -------------------------------
count_vect = CountVectorizer(token_pattern=r"[^,]+")
genre_matrix = count_vect.fit_transform(df["genre"].astype(str))

# -------------------------------
# One-Hot Encode Author
# -------------------------------
ohe = OneHotEncoder(handle_unknown="ignore")
author_matrix = ohe.fit_transform(df[["author"]])

# -------------------------------
# Numeric Features
# -------------------------------
scaler = MinMaxScaler()
num_features = ["rating", "reviews", "totalratings", "pages"]
for col in num_features:
    df[col+"_norm"] = scaler.fit_transform(df[[col]].fillna(0))

num_matrix = df[[c+"_norm" for c in num_features]].values

# -------------------------------
# Combine features
# -------------------------------
final_matrix = hstack([desc_matrix, genre_matrix, author_matrix, csr_matrix(num_matrix)])

# -------------------------------
# Save matrix and vectorizers
# -------------------------------
joblib.dump(final_matrix, "final_matrix.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(count_vect, "genre_vectorizer.pkl")
joblib.dump(ohe, "author_encoder.pkl")

print("✅ final_matrix saved. Shape:", final_matrix.shape)


✅ final_matrix saved. Shape: (213, 5523)


In [35]:
import pandas as pd
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset and feature matrix
df = pd.read_csv("books_dataset_cleaned.csv")
final_matrix = joblib.load("final_matrix.pkl")

print("Feature matrix loaded:", final_matrix.shape)

# Compute cosine similarity
similarity_matrix = cosine_similarity(final_matrix, dense_output=True)
print("✅ Cosine similarity matrix computed:", similarity_matrix.shape)


Feature matrix loaded: (213, 5523)
✅ Cosine similarity matrix computed: (213, 213)


In [36]:
def recommend_books(title, n=5):
    """
    Recommend top n books similar to the given title using cosine similarity.
    Uses fuzzy matching.
    """
    best_match = process.extractOne(title, df['title'], score_cutoff=60)
    if not best_match:
        return pd.DataFrame(), f"No book found matching '{title}'"
    
    matched_title = best_match[0]
    idx = df[df['title'] == matched_title].index[0]

    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n+1]
    book_indices = [i[0] for i in sim_scores]

    # Remove 'img' since it's not in the dataset
    recommendations = df.iloc[book_indices][['title','author','genre','rating']].reset_index(drop=True)
    return recommendations, matched_title


In [37]:
recommend_books("Hobbit", n=5)

(                       title           author  \
 0               La Å£igÄƒnci    Mircea Eliade   
 1              The Red Horse    Eugenio Corti   
 2           DEX-ul ÅŸi sexul  Radu Pavel Gheo   
 3             Eonul dogmatic     Lucian Blaga   
 4  Ð Ð°Ð·ÐºÐ°Ð·Ð¸ - Ñ‚Ð¾Ð¼ 2    Yordan Yovkov   
 
                                                genre  rating  
 0  European Literature,Romanian Literature,Classi...    3.94  
 1  Historical,Historical Fiction,Fiction,War,Worl...    4.22  
 2  European Literature,Romanian Literature,Nonfic...    4.15  
 3  Philosophy,European Literature,Romanian Litera...    4.24  
 4  Classics,European Literature,Bulgarian Literature    4.49  ,
 'Hotarul Nestatornic')

In [38]:
pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
def recommend_books(title, n=5):
    """
    Recommend top n books similar to the given title using cosine similarity.
    Uses fuzzy matching.
    """
    best_match = process.extractOne(title, df['title'], score_cutoff=60)
    if not best_match:
        return pd.DataFrame(), f"No book found matching '{title}'"
    
    matched_title = best_match[0]
    idx = df[df['title'] == matched_title].index[0]

    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n+1]
    book_indices = [i[0] for i in sim_scores]

    # Remove 'img' since it's not in the dataset
    recommendations = df.iloc[book_indices][['title','author','genre','rating']].reset_index(drop=True)
    return recommendations, matched_title


In [42]:
recommend_books("Hobbit", n=5)

(                       title           author  \
 0               La Å£igÄƒnci    Mircea Eliade   
 1              The Red Horse    Eugenio Corti   
 2           DEX-ul ÅŸi sexul  Radu Pavel Gheo   
 3             Eonul dogmatic     Lucian Blaga   
 4  Ð Ð°Ð·ÐºÐ°Ð·Ð¸ - Ñ‚Ð¾Ð¼ 2    Yordan Yovkov   
 
                                                genre  rating  
 0  European Literature,Romanian Literature,Classi...    3.94  
 1  Historical,Historical Fiction,Fiction,War,Worl...    4.22  
 2  European Literature,Romanian Literature,Nonfic...    4.15  
 3  Philosophy,European Literature,Romanian Litera...    4.24  
 4  Classics,European Literature,Bulgarian Literature    4.49  ,
 'Hotarul Nestatornic')