*Data source: Amazon Books Reviews dataset on Kaggle — https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews*

In [34]:
import pandas as pd

In [35]:
import sys
project_root = "/Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent"
if project_root not in sys.path:
    sys.path.append(project_root)

%run /Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/common/constants.py
%run /Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/common/helpers.py
%run /Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/common/utils.py

# stage 1 - data preprocessing

In [39]:
%run /Users/sayalimoghe/Documents/Career/GitHub/conversational-book-recommendation-agent/ml_pipeline/stages/stage_1_data_preprocessing.py

## books_df

In [40]:
# Read the books CSV restricted to the configured input columns, keeping the
# untouched frame so it can be compared with the cleaned one.
original_books_df = safe_read_csv(PATHS["books"], DATA_PREPROCESSING["input_cols_books"])
# clean_books_data comes from a %run script; per the column listing printed in
# the next cell, its result carries an additional `book_id` column.
books_df = clean_books_data(original_books_df)

In [41]:
# Compare the column sets before and after cleaning, then preview the result.
print(original_books_df.columns)
print(books_df.columns)
books_df.head()

Index(['title', 'description', 'authors', 'infolink', 'categories', 'genres'], dtype='object')
Index(['title', 'description', 'authors', 'infolink', 'categories', 'genres',
       'book_id'],
      dtype='object')


Unnamed: 0,title,description,authors,infolink,categories,genres,book_id
0,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,[Philip Nel],http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],[biography and autobiography],1
1,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,[David R. Ray],http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],[religion],2
2,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,[Veronica Haddon],http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],[fiction],3
3,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,[Everett Ferguson],http://books.google.nl/books?id=kVqRaiPlx88C&d...,['Religion'],[religion],4
4,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,[Mary Fabyan Windeatt],http://books.google.nl/books?id=lmLqAAAACAAJ&d...,['Biography & Autobiography'],[biography and autobiography],5


In [43]:
from difflib import SequenceMatcher

def view_fuzzy_title_candidates(
    df: pd.DataFrame,
    similarity_threshold: float = 0.95
) -> pd.DataFrame:
    """
    List pairs of near-identical titles that share exactly the same authors.

    Inspection only: nothing is merged and ``df`` is never modified.

    Titles are normalised (lower-cased, characters other than ``a-z``, ``0-9``
    and whitespace removed, whitespace collapsed and stripped) and compared
    pairwise with ``difflib.SequenceMatcher`` inside each group of rows whose
    ``authors`` value is identical.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``title`` column (strings) and an ``authors`` column
        (an iterable of author names per row).
    similarity_threshold : float, default 0.95
        Minimum ``SequenceMatcher.ratio()`` (inclusive) for a pair to be kept.

    Returns
    -------
    pd.DataFrame
        Columns ``title_1``, ``title_2``, ``authors`` (tuple) and
        ``similarity``, sorted by ``similarity`` descending. When no pair
        qualifies the frame is empty but still has these four columns.

    Notes
    -----
    Rows whose title is missing or not a string are skipped. A row whose
    ``authors`` value is missing (``None``/``NaN``) is grouped under the empty
    tuple, the same key an empty author list produces. Comparison is quadratic
    in the size of each author group.
    """
    result_columns = ["title_1", "title_2", "authors", "similarity"]

    def _authors_key(value) -> tuple:
        # Hashable grouping key for one row's authors.
        if isinstance(value, str):
            # tuple("abc") would split a bare string into characters.
            return (value,)
        try:
            return tuple(value)
        except TypeError:
            # None / NaN / other scalars are not iterable: no known authors.
            return ()

    # Copy only the two columns that are needed rather than the whole frame.
    work = df[["title", "authors"]].copy()
    work["_base_title"] = (
        work["title"]
        .str.lower()
        .str.replace(r"[^a-z0-9\s]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    work["_authors_key"] = work["authors"].apply(_authors_key)
    # The .str accessor yields NaN for missing / non-string titles, and
    # SequenceMatcher cannot compare NaN, so drop those rows up front.
    work = work[work["_base_title"].notna()]

    candidates = []

    for authors_key, group in work.groupby("_authors_key"):
        # Plain lists avoid label lookups, which return a Series instead of a
        # scalar when the frame's index contains duplicate labels.
        raw_titles = group["title"].tolist()
        base_titles = group["_base_title"].tolist()

        for i in range(len(base_titles)):
            for j in range(i + 1, len(base_titles)):
                matcher = SequenceMatcher(None, base_titles[i], base_titles[j])
                # Both quick ratios are upper bounds on ratio(), so a pair
                # failing either one can never reach the threshold.
                if (
                    matcher.real_quick_ratio() < similarity_threshold
                    or matcher.quick_ratio() < similarity_threshold
                ):
                    continue
                score = matcher.ratio()
                if score >= similarity_threshold:
                    candidates.append({
                        "title_1": raw_titles[i],
                        "title_2": raw_titles[j],
                        "authors": authors_key,
                        "similarity": score
                    })

    # Passing explicit columns keeps sort_values from raising KeyError when
    # no candidate pair was found.
    return pd.DataFrame(candidates, columns=result_columns).sort_values(
        by="similarity", ascending=False
    )

# Scan the cleaned books with the default 0.95 threshold. Pairwise comparison
# is quadratic per author group, so this can be slow on the full dataset.
similar_books = view_fuzzy_title_candidates(books_df)

In [33]:
# Display the fuzzy-title candidate pairs.
# NOTE(review): this cell's execution count predates the cell that defines
# similar_books, so the saved output may be stale — re-run after Restart & Run All.
similar_books

Unnamed: 0,title,description,authors,infolink,categories,genres,book_id,_base_title,_authors_key


Unnamed: 0,title,description,authors,infolink,categories,genres,book_id,_title_key,_authors_key


Unnamed: 0,title,description,authors,infolink,categories,genres,book_id,title_author,authors_clean
22,Hunting The Hard Way,"Thrilling stories about hunting wildcat, buffa...",[Howard Hill],https://play.google.com/store/books/details?id...,['Sports & Recreation'],[sports and recreation],19,hunting the hard way howard hill,howard hill
121,Lincoln reconsidered;: Essays on the Civil War...,A “brilliant” look at America’s sixteenth pres...,[David Herbert Donald],https://play.google.com/store/books/details?id...,['History'],[history],87,lincoln reconsidered essays on the civil war e...,david herbert donald
122,Economics in one lesson,"With over a million copies sold, Economics in ...",[Henry Hazlitt],https://play.google.com/store/books/details?id...,['Business & Economics'],[business and economics],88,economics in one lesson henry hazlitt,henry hazlitt
150,Hypatia or New Foes with an Old Face,This book is part of the TREDITION CLASSICS se...,[Charles Kingsley],http://books.google.nl/books?id=wX9EtQAACAAJ&d...,,[nan],107,hypatia or new foes with an old face charles k...,charles kingsley
308,The Handsome road,A saga of Louisiana by an author who “belongs ...,[Gwen Bristow],https://play.google.com/store/books/details?id...,['Fiction'],[fiction],217,the handsome road gwen bristow,gwen bristow
...,...,...,...,...,...,...,...,...,...
211977,Basic Statistics: a Primer for the Biomedical ...,New Edition of a Classic Guide to Statistical ...,"[Olive Jean Dunn, Virginia A. Clark]",http://books.google.com/books?id=q_k2pKmjWq8C&...,['Mathematics'],[mathematics],143611,basic statistics a primer for the biomedical s...,olive jean dunn virginia a. clark
212056,The Little Mermaid,The beloved tale of The Little Mermaid is fait...,[Hans Christian Anderson],http://books.google.com/books?id=FOedDwAAQBAJ&...,['Juvenile Fiction'],[juvenile fiction],143648,the little mermaid hans christian anderson,hans christian anderson
212153,"Penguin island,","Mael, a scion of a royal family of Cambria, wa...",[Anatole France],https://play.google.com/store/books/details?id...,,[nan],143691,penguin island anatole france,anatole france
212315,Hailstones and Halibut Bones (Adventures in Co...,A book of verse about the colors of the spectrum,[Mary O'Neill],http://books.google.com/books?id=4wezE7jHZlsC&...,['Juvenile Nonfiction'],[juvenile nonfiction],143770,hailstones and halibut bones adventures in col...,mary o'neill


In [8]:
# `title_author` is not among the books_df columns listed after cleaning, so on
# a fresh kernel indexing it raises KeyError. Use it when present and fall back
# to the raw title otherwise.
search_col = "title_author" if "title_author" in books_df.columns else "title"
# regex=False: the needle is a literal phrase, not a pattern.
books_df[books_df[search_col].str.lower().str.contains("catch 22", na=False, regex=False)]

Unnamed: 0,title,description,authors,infolink,categories,genres,book_id,title_author,authors_clean
11775,Catch 22,Presents the contemporary classic depicting th...,[Joseph Heller],http://books.google.nl/books?id=Xfze51E7TEoC&d...,['Fiction'],[fiction],8770,catch 22 joseph heller,joseph heller
64897,Catch 22 (catch-22),Presents the contemporary classic depicting th...,[Joseph Heller],http://books.google.com/books?id=Xfze51E7TEoC&...,['Fiction'],[fiction],47477,catch 22 catch22 joseph heller,joseph heller


## ratings_df

In [48]:
# Load the ratings file with every column; unlike the books load, plain
# pd.read_csv is used here and no column subset is applied.
ratings_df = pd.read_csv(PATHS["ratings"])

In [52]:
# Preview the first rows of the raw ratings data.
ratings_df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [54]:
# Derive a users table from the reviewer columns of the ratings data.
# NOTE(review): duplicates are dropped on the (id, name) pair, so a user_id seen
# with more than one profile name yields several rows — confirm this is intended.
users_df = (
    ratings_df
    .loc[:, ["User_id", "profileName"]]
    .rename(columns={"User_id": "user_id", "profileName": "profile_name"})
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

In [58]:
# Report how many distinct (user_id, profile_name) rows were extracted and
# preview the first ten.
print(users_df.shape)
users_df.head(10)

(1009377, 2)


Unnamed: 0,user_id,profile_name
0,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz"""
1,A30TK6U7DNS82R,Kevin Killian
2,A3UH4UZ4RSVO82,John Granger
3,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher"""
4,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore"""
5,A2F6NONFUDB6UK,Malvin
6,A14OJS0VWMOSWO,Midwest Book Review
7,A2RSSXTDZDUSH4,J. Squire
8,A25MD5I2GUIW6W,"J. P. HIGBED ""big fellow"""
9,A3VA4XFS5WNJO3,Donald Burnside
