In [1]:
from datasets import load_dataset

dataset = load_dataset(
    "cogsci13/Amazon-Reviews-2023-Books-Meta",
    split="full[:100000]"
)

print(dataset)
print(dataset[0])


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
    num_rows: 100000
})
{'main_category': 'Books', 'title': 'Chaucer', 'average_rating': 4.5, 'rating_number': 29, 'features': [], 'description': [], 'price': '8.23', 'images': {'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/41X61VPJYKL._SX334_BO1,204,203,200_.jpg'], 'thumb': [None], 'variant': ['MAIN']}, 'videos': {'title': [], 'url': [], 'user_id': []}, 'store': 'Peter Ackroyd (Author)', 'categories': ['Books', 'Literature & Fiction', 'History & Criticism'], 'details': '{"Publisher": "Chatto & Windus; First Edition (January 1, 2004)", "Language": "English", "Hardcover": "196 pages", "ISBN 10": "0701169850", "ISBN 13": "978-0701169855", "Item Weight": "10.1 ounces", "Dimensions": "5.39 x 0.71 x 7.48 inches"}', 'parent_asin': '0701169850',

In [2]:
import pandas as pd
df = dataset.to_pandas()
df.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Books,Chaucer,4.5,29,[],[],8.23,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Peter Ackroyd (Author),"[Books, Literature & Fiction, History & Critic...","{""Publisher"": ""Chatto & Windus; First Edition ...",701169850,,"Hardcover – Import, January 1, 2004",{'avatar': 'https://m.media-amazon.com/images/...


In [3]:
import numpy as np

def clean_features(x):
    if isinstance(x, np.ndarray):
        return " ".join([str(i) for i in x if i])
    return ""   

In [4]:
df["cleaned_features"] = df["features"].apply(clean_features)


In [5]:

def clean_description(x):
    if isinstance(x, np.ndarray):
        return " ".join([str(i) for i in x if i])
    return ""  


In [6]:
df["cleaned_description"] = df["description"].apply(clean_description)


In [7]:
import numpy as np
import pandas as pd

def parse_price(x):
    if pd.isna(x):
        return np.nan
    x = str(x).strip().lower()
    
    if x in ["0", "0.0", "0.00", "free"]:
        return 0.0
    
    x = x.replace("$", "").replace("£", "").replace("€", "").replace(",", "")
    try:
        return float(x)
    except:
        return np.nan

df["price_temp_raw"] = df["price"].apply(parse_price)

df["is_free"] = (df["price_temp_raw"] == 0).astype(int)
df["is_missing"] = df["price_temp_raw"].isna().astype(int)

median_books = df.loc[
    (df["main_category"] == "Books") &
    (df["price_temp_raw"].between(1, 120, inclusive="both")),
    "price_temp_raw"
].median()

df["price_temp_clean"] = df["price_temp_raw"]

mask_kindle_audi = df["price_temp_clean"].isna() & df["main_category"].isin(["Buy a Kindle", "Audible Audiobooks"])
df.loc[mask_kindle_audi, "price_temp_clean"] = 0

mask_books_nan = df["price_temp_clean"].isna() & (df["main_category"] == "Books")
df.loc[mask_books_nan, "price_temp_clean"] = median_books

mask_books_outlier = (df["main_category"] == "Books") & (df["price_temp_raw"] > 120)
df.loc[mask_books_outlier, "price_temp_clean"] = 120

df["is_outlier"] = 0
df.loc[mask_books_outlier, "is_outlier"] = 1

print("Tổng số dòng:", len(df))
print("Số free:", df["is_free"].sum())
print("Số missing:", df["is_missing"].sum())
print("Median Books (1–120):", median_books)
print("Số outlier Books (>120):", df["is_outlier"].sum())

cols = ["title","main_category","price",
        "price_temp_raw","price_temp_clean",
        "is_free","is_missing","is_outlier"]
pd.set_option("display.max_colwidth", None)
print(df[cols].head(20))


Tổng số dòng: 100000
Số free: 2691
Số missing: 6756
Median Books (1–120): 14.18
Số outlier Books (>120): 1416
                                                                                                                                                                                                     title  \
0                                                                                                                                                                                                  Chaucer   
1                                                                                                                                                                                  Notes from a Kidwatcher   
2                                                                                                                                                                              Service: A Navy SEAL at War   
3                                                                 

  return op(a, b)
  return op(a, b)
  return op(a, b)


In [8]:
import pandas as pd

def extract_main_image(df, src_col="images", dst_col="main_images"):
    """
    Trích xuất ảnh chính (main_image) từ cột images.
    - Nếu là dict và có key "large" -> lấy phần tử đầu tiên.
    - Ngược lại -> None.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame chứa dữ liệu
    src_col : str
        Tên cột chứa images (mặc định "images")
    dst_col : str
        Tên cột mới để lưu link ảnh chính (mặc định "main_images")
    
    Returns
    -------
    df : pd.DataFrame
        DataFrame có thêm cột dst_col
    """
    df[dst_col] = df[src_col].apply(
        lambda x: x["large"][0] if isinstance(x, dict) and "large" in x and len(x["large"]) > 0 else None
    )
    return df

df = extract_main_image(df, src_col="images", dst_col="main_images")

pd.set_option("display.max_colwidth", None)
print(df[["main_images"]].head(20))


                                                                                main_images
0               https://m.media-amazon.com/images/I/41X61VPJYKL._SX334_BO1,204,203,200_.jpg
1   https://m.media-amazon.com/images/I/41bfTRxpMML._SX218_BO1,204,203,200_QL40_FMwebp_.jpg
2               https://m.media-amazon.com/images/I/41YQHDWRyGL._SX321_BO1,204,203,200_.jpg
3               https://m.media-amazon.com/images/I/614Mx0QCe7L._SX339_BO1,204,203,200_.jpg
4                                       https://m.media-amazon.com/images/I/41j6GpAqFBL.jpg
5               https://m.media-amazon.com/images/I/417CItk7HML._SX387_BO1,204,203,200_.jpg
6               https://m.media-amazon.com/images/I/41b2UvkHaAL._SX331_BO1,204,203,200_.jpg
7   https://m.media-amazon.com/images/I/51j48HH1P9L._SX218_BO1,204,203,200_QL40_FMwebp_.jpg
8               https://m.media-amazon.com/images/I/313xN7wqDQL._SX331_BO1,204,203,200_.jpg
9   https://m.media-amazon.com/images/I/415r5RJ7alL._SY291_BO1,204,203,200_QL40_

In [9]:
import numpy as np
import pandas as pd

def normalize_categories(df, src_col="categories", dst_col="categories"):
    
    df[dst_col] = df[src_col].apply(
        lambda x: " > ".join(x.tolist() if isinstance(x, np.ndarray) else x)
        if isinstance(x, (list, np.ndarray)) else str(x)
    )

    return df


df = normalize_categories(df)

print(df[["categories"]].head(10).to_string(index=False))
print("Type của phần tử đầu tiên:", type(df["categories"].iloc[0]))


                                                 categories
         Books > Literature & Fiction > History & Criticism
              Books > Reference > Words, Language & Grammar
   Books > Biographies & Memoirs > Leaders & Notable People
       Books > Children's Books > Science Fiction & Fantasy
Books > Mystery, Thriller & Suspense > Thrillers & Suspense
           Books > Arts & Photography > History & Criticism
              Books > Parenting & Relationships > Parenting
         Books > Engineering & Transportation > Engineering
               Books > Literature & Fiction > Genre Fiction
          Books > Education & Teaching > Schools & Teaching
Type của phần tử đầu tiên: <class 'str'>


In [10]:
import ast
import pandas as pd

def normalize_details(df, src_col="details"):
    """
    Chuẩn hóa cột details:
    - Giữ nguyên string ở cột details_text (dùng cho embed)
    - Parse sang dict ở cột details_dict
    - Tách một số field cơ bản thành cột riêng
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame chứa dữ liệu
    src_col : str
        Tên cột gốc (mặc định: "details")
    
    Returns
    -------
    df : pd.DataFrame
        DataFrame có thêm các cột:
        - details_text
        - details_dict
        - publisher, language, isbn_10, isbn_13, page_count
    """
    
    df["details_text"] = df[src_col]
    
    # Parse sang dict an toàn
    def safe_parse(x):
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except:
                return None
        return None
    
    df["details_dict"] = df[src_col].apply(safe_parse)
    
    def extract_field(d, key):
        if isinstance(d, dict):
            return d.get(key)
        return None
    
    df["publisher"]  = df["details_dict"].apply(lambda d: extract_field(d, "Publisher"))
    df["language"]   = df["details_dict"].apply(lambda d: extract_field(d, "Language"))
    df["isbn_10"]    = df["details_dict"].apply(lambda d: extract_field(d, "ISBN 10"))
    df["isbn_13"]    = df["details_dict"].apply(lambda d: extract_field(d, "ISBN 13"))
    df["page_count"] = df["details_dict"].apply(
        lambda d: d.get("Hardcover") or d.get("Paperback") if isinstance(d, dict) else None
    )
    
    return df


df = normalize_details(df)

cols = ["publisher", "language", "isbn_10", "isbn_13", "page_count"]
pd.set_option("display.max_colwidth", None)
print(df[cols].head(20).to_string(index=False))


                                                    publisher language       isbn_10        isbn_13 page_count
             Chatto & Windus; First Edition (January 1, 2004)  English    0701169850 978-0701169855  196 pages
                      Heinemann; First Edition (May 20, 1996)  English    0435088688 978-0435088682  316 pages
         Little, Brown and Company; 1st edition (May 8, 2012)  English 9780316185363 978-0316185363  384 pages
    Scholastic Paperbacks; Reprint edition (October 29, 2013)  English    0545425573 978-0545425575   64 pages
                                                         None  English          None           None       None
                  Independently published (December 30, 2021)  English          None 979-8528537702   24 pages
Giulford ,2004. 2nd Edition; 8082nd edition (January 1, 1994)     None          None           None       None
        Make Community, LLC; 2nd edition (September 22, 2015)  English 9781680450262 978-1680450262  352 pages
 

In [11]:
df = df.drop(columns=["bought_together", "video", "store"], errors="ignore")
print("Các cột còn lại:", df.columns.tolist())


Các cột còn lại: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'categories', 'details', 'parent_asin', 'subtitle', 'author', 'cleaned_features', 'cleaned_description', 'price_temp_raw', 'is_free', 'is_missing', 'price_temp_clean', 'is_outlier', 'main_images', 'details_text', 'details_dict', 'publisher', 'language', 'isbn_10', 'isbn_13', 'page_count']


In [12]:
import ast

def extract_author_info(df, author_col="author"):
    # giữ nguyên dict, chỉ parse khi là string
    def safe_parse(x):
        if isinstance(x, dict):
            return x
        if isinstance(x, str) and x.strip():
            try:
                return ast.literal_eval(x)
            except Exception:
                return {}
        return {}

    df = df.copy()
    df["author_dict"] = df[author_col].apply(safe_parse)

    df["author_name"] = df["author_dict"].apply(
        lambda x: x.get("name") if isinstance(x, dict) else None
    )
    df["author_about"] = df["author_dict"].apply(
        lambda x: " ".join(x.get("about", [])) if isinstance(x, dict) else ""
    )
    df["author_avatar"] = df["author_dict"].apply(
        lambda x: x.get("avatar") if isinstance(x, dict) else None
    )

    return df

df = extract_author_info(df, author_col="author")

print(df[["author", "author_name", "author_about", "author_avatar"]].head(10))


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [13]:
def build_unique_id(df):
    df = df.copy()

    
    df["unique_id"] = df["isbn_13"].fillna(df["parent_asin"])

   
    df["isbn_10"] = df["isbn_10"].fillna("Unknown")
    df["isbn_13"] = df["isbn_13"].fillna("Unknown")

   
    num_dupes = df["unique_id"].duplicated().sum()
    if num_dupes > 0:
        print(f" Cảnh báo: Có {num_dupes} dòng bị trùng unique_id")
        dupes = df[df["unique_id"].duplicated(keep=False)].sort_values("unique_id")
        print(dupes[["unique_id", "title", "author_name"]].head(20))
    else:
        print(" Không có unique_id nào bị trùng")

    return df

df = build_unique_id(df)
print(df[["isbn_10", "isbn_13", "parent_asin", "unique_id"]].head(10))


 Cảnh báo: Có 9 dòng bị trùng unique_id
            unique_id  \
79164  978-0061092626   
35662  978-0061092626   
7006   978-0061130359   
69361  978-0061130359   
58529  978-0061253843   
96261  978-0061253843   
48108  978-0061456572   
71992  978-0061456572   
29266  978-0132843478   
57112  978-0132843478   
58875  978-0253008527   
45529  978-0253008527   
56233  978-0312305062   
34247  978-0312305062   
22187  978-0802145710   
15545  978-0802145710   
34747  978-1563113314   
24547  978-1563113314   

                                                                                         title  \
79164                                                                      The Mysterious West   
35662                                                                      The Mysterious West   
7006                                                                              Sacred Games   
69361                                                                      Sacred Games (P.S

In [14]:
import pandas as pd

def clean_page_count(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "page_count" in df.columns:
        df["page_count"] = (
            df["page_count"]
            .astype(str)                 
            .str.extract(r"(\d+)")      
            .astype(float)               
            .fillna(-1)                  
            .astype(int)                
        )
    return df

df = clean_page_count(df)
print(df["page_count"].head(10))
print(df["page_count"].dtype)


0    196
1    316
2    384
3     64
4     -1
5     24
6     -1
7    352
8    367
9     51
Name: page_count, dtype: int64
int64


In [15]:
import pandas as pd

def clean_categorical(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in ["main_category", "publisher", "language"]:
        if col in df.columns:
            df[col] = (
                df[col]
                .fillna("Unknown")   
                .astype(str)        
                .str.strip()        
            )
    return df

df = clean_categorical(df)
print(df[["main_category", "publisher", "language"]].head(10))


  main_category  \
0         Books   
1         Books   
2         Books   
3         Books   
4  Buy a Kindle   
5         Books   
6         Books   
7         Books   
8         Books   
9         Books   

                                                       publisher language  
0               Chatto & Windus; First Edition (January 1, 2004)  English  
1                        Heinemann; First Edition (May 20, 1996)  English  
2           Little, Brown and Company; 1st edition (May 8, 2012)  English  
3      Scholastic Paperbacks; Reprint edition (October 29, 2013)  English  
4                                                        Unknown  English  
5                    Independently published (December 30, 2021)  English  
6  Giulford ,2004. 2nd Edition; 8082nd edition (January 1, 1994)  Unknown  
7          Make Community, LLC; 2nd edition (September 22, 2015)  English  
8                   Independently published (September 25, 2019)  English  
9            WallBuilder Press

In [16]:
df["main_images"] = df["main_images"].fillna("Unknown").astype(str)


In [17]:
df["subtitle"] = df["subtitle"].fillna("Unknown").astype(str)


In [18]:
df["author_name"] = df["author_name"].fillna("Unknown").astype(str).str.strip().str.lower()
df["author_avatar"] = df["author_avatar"].fillna("Unknown")

In [19]:
print(df.head(2))

  main_category                    title  average_rating  rating_number  \
0         Books                  Chaucer             4.5             29   
1         Books  Notes from a Kidwatcher             5.0              1   

                                                                                                                                                                                                                                                                                                                                                                                                                features  \
0                                                                                                                                                                                                                                                                                                                                                                         

In [20]:
def clean_categorical(df):
    for col in ["main_category", "publisher", "language"]:
        if col in df.columns:
            df[col] = (
                df[col]
                .replace("None", np.nan)   # chuẩn hoá
                .fillna("Unknown")         
                .astype(str)
                .str.strip()
            )
    return df


In [21]:
df = clean_categorical(df)

print(df[["main_category", "publisher", "language"]].head(20))

print(df[["main_category", "publisher", "language"]].isin(["Unknown"]).sum())


   main_category  \
0          Books   
1          Books   
2          Books   
3          Books   
4   Buy a Kindle   
5          Books   
6          Books   
7          Books   
8          Books   
9          Books   
10         Books   
11         Books   
12         Books   
13         Books   
14         Books   
15         Books   
16         Books   
17         Books   
18         Books   
19  Buy a Kindle   

                                                        publisher language  
0                Chatto & Windus; First Edition (January 1, 2004)  English  
1                         Heinemann; First Edition (May 20, 1996)  English  
2            Little, Brown and Company; 1st edition (May 8, 2012)  English  
3       Scholastic Paperbacks; Reprint edition (October 29, 2013)  English  
4                                                         Unknown  English  
5                     Independently published (December 30, 2021)  English  
6   Giulford ,2004. 2nd Edition; 8082nd 

In [22]:
print(df.isna().sum())

main_category              0
title                      0
average_rating             0
rating_number              0
features                   0
description                0
price                      0
images                     0
videos                     0
categories                 0
details                    0
parent_asin                0
subtitle                   0
author                 28584
cleaned_features           0
cleaned_description        0
price_temp_raw          6756
is_free                    0
is_missing                 0
price_temp_clean         194
is_outlier                 0
main_images                0
details_text               0
details_dict               0
publisher                  0
language                   0
isbn_10                    0
isbn_13                    0
page_count                 0
author_dict                0
author_name                0
author_about               0
author_avatar              0
unique_id                  0
dtype: int64


In [23]:
df_na_price = df[df["price_temp_clean"].isna()]

print("Số dòng còn NaN:", len(df_na_price))
pd.set_option("display.max_colwidth", None)
print(df_na_price[["title", "main_category", "price", "price_temp_raw", "price_temp_clean"]].head(20).to_string(index=False))


Số dòng còn NaN: 194
                                                                                                                    title       main_category price  price_temp_raw  price_temp_clean
                                     DC Collectibles Portfolio 2013 (DC Collectibles Portfolio Summer 2013 / 2014 Book 1)                      None             NaN               NaN
                                                                       Robin Hood (Graphic Revolve: Common Core Editions)                      None             NaN               NaN
                                                                                                      Batgirl (2016-) #13                      None             NaN               NaN
                                                                    Israel World Beat Songbook Music & Vocal Arrangements Musical Instruments  None             NaN               NaN
                                                                  Inc

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [24]:
df["price_temp_clean"] = df["price_temp_clean"].fillna(-1)


In [25]:
print(df.isna().sum())

main_category              0
title                      0
average_rating             0
rating_number              0
features                   0
description                0
price                      0
images                     0
videos                     0
categories                 0
details                    0
parent_asin                0
subtitle                   0
author                 28584
cleaned_features           0
cleaned_description        0
price_temp_raw          6756
is_free                    0
is_missing                 0
price_temp_clean           0
is_outlier                 0
main_images                0
details_text               0
details_dict               0
publisher                  0
language                   0
isbn_10                    0
isbn_13                    0
page_count                 0
author_dict                0
author_name                0
author_about               0
author_avatar              0
unique_id                  0
dtype: int64


In [26]:
# Chọn cột cần thiết cho RAG
cols_keep = [
    "main_category",
    "rating_number",
    "unique_id",
    "title", "subtitle",
    "cleaned_description",
    "cleaned_features",
    "is_free",
    "is_missing",
    "is_outlier",
    "details_text",
    "author_name", "author_about", "author_avatar",
    "categories", "publisher", "language",
    "average_rating", "isbn_10",
    "price_temp_clean", "page_count",
    "isbn_13", "main_images"
]

df_rag = df[cols_keep].copy()

# Tạo cột document_text để embed
text_cols = [
    "title", "subtitle", "cleaned_description",
    "cleaned_features", "author_about", "details_text",
    "categories", "publisher", "language"
]

df_rag["document_text"] = (
    df_rag[text_cols]
    .fillna("Unknown")
    .agg(" ".join, axis=1)
)


print(df_rag.head(3))
print(df_rag.columns)


  main_category  rating_number       unique_id                        title  \
0         Books             29  978-0701169855                      Chaucer   
1         Books              1  978-0435088682      Notes from a Kidwatcher   
2         Books           3421  978-0316185363  Service: A Navy SEAL at War   

                              subtitle  \
0  Hardcover – Import, January 1, 2004   
1                        First Edition   
2              Hardcover – May 8, 2012   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [28]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"\s+", " ", text)          
    text = re.sub(r"[{}\"]", "", text)       
    return text.strip()

df_rag["document_text"] = df_rag["document_text"].apply(clean_text)


In [33]:
print(df_rag.head(3))

  main_category  rating_number       unique_id                        title  \
0         Books             29  978-0701169855                      Chaucer   
1         Books              1  978-0435088682      Notes from a Kidwatcher   
2         Books           3421  978-0316185363  Service: A Navy SEAL at War   

                              subtitle  \
0  Hardcover – Import, January 1, 2004   
1                        First Edition   
2              Hardcover – May 8, 2012   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  