In [1]:
from datasets import load_dataset
import pandas as pd

# Load the "full" split of the Amazon Reviews 2023 Books metadata dataset.
dataset = load_dataset("cogsci13/Amazon-Reviews-2023-Books-Meta", split="full")

# Print the dataset summary (feature names and row count) as a sanity check.
print(dataset)


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

data/full-00000-of-00028.parquet:   0%|          | 0.00/322M [00:00<?, ?B/s]

data/full-00001-of-00028.parquet:   0%|          | 0.00/306M [00:00<?, ?B/s]

data/full-00002-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00003-of-00028.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

data/full-00004-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00005-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00006-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00007-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00008-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00009-of-00028.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/full-00010-of-00028.parquet:   0%|          | 0.00/302M [00:00<?, ?B/s]

data/full-00011-of-00028.parquet:   0%|          | 0.00/302M [00:00<?, ?B/s]

data/full-00012-of-00028.parquet:   0%|          | 0.00/295M [00:00<?, ?B/s]

data/full-00013-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

data/full-00014-of-00028.parquet:   0%|          | 0.00/297M [00:00<?, ?B/s]

data/full-00015-of-00028.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

data/full-00016-of-00028.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

data/full-00017-of-00028.parquet:   0%|          | 0.00/294M [00:00<?, ?B/s]

data/full-00018-of-00028.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

data/full-00019-of-00028.parquet:   0%|          | 0.00/295M [00:00<?, ?B/s]

data/full-00020-of-00028.parquet:   0%|          | 0.00/294M [00:00<?, ?B/s]

data/full-00021-of-00028.parquet:   0%|          | 0.00/293M [00:00<?, ?B/s]

data/full-00022-of-00028.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

data/full-00023-of-00028.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

data/full-00024-of-00028.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

data/full-00025-of-00028.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

data/full-00026-of-00028.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

data/full-00027-of-00028.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/4448181 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
    num_rows: 4448181
})


In [2]:
import ast
import json
import re

import numpy as np
import pandas as pd

def _is_missing_scalar(value) -> bool:
    """Return True when ``value`` is a scalar that pandas treats as missing."""
    # is_scalar() guards pd.isna(), which returns an array for list-like input.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _join_text_items(value) -> str:
    """Join the truthy items of a list-like cell into one space-separated string.

    Anything that is not an ndarray, list or tuple yields an empty string.
    """
    if isinstance(value, (np.ndarray, list, tuple)):
        return " ".join(str(item) for item in value if item)
    return ""


def _parse_price(value) -> float:
    """Convert a raw price cell to a float; unparseable or missing values give NaN."""
    if _is_missing_scalar(value):
        return np.nan
    text = str(value).strip().lower()
    if text == "free":
        return 0.0
    for symbol in ("$", "£", "€", ","):
        text = text.replace(symbol, "")
    try:
        return float(text)
    except ValueError:
        return np.nan


def _parse_mapping(value):
    """Return ``value`` as a dict, or None when it cannot be read as one.

    Dicts pass through unchanged. Non-empty strings are tried as JSON first
    (which accepts ``null``/``true``/``false``) and then as a Python literal.
    """
    if isinstance(value, dict):
        return value
    if not isinstance(value, str) or not value.strip():
        return None
    try:
        parsed = json.loads(value)
    except (ValueError, RecursionError):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    return parsed if isinstance(parsed, dict) else None


def _first_large_image(images) -> str:
    """Return the first entry under ``images["large"]``, or "Unknown" if absent."""
    if isinstance(images, dict):
        large = images.get("large")
        if isinstance(large, str):
            return large or "Unknown"
        if isinstance(large, (list, tuple, np.ndarray)) and len(large) > 0:
            first = large[0]
            if first is not None and not _is_missing_scalar(first) and str(first):
                return str(first)
    return "Unknown"


def _format_categories(value) -> str:
    """Render a category path as "a > b > c"; missing scalars become "Unknown"."""
    if isinstance(value, (list, tuple, np.ndarray)):
        return " > ".join(str(item) for item in value if item is not None)
    if _is_missing_scalar(value):
        return "Unknown"
    return str(value)


def _author_about(author: dict) -> str:
    """Return the author's "about" entry as a single string ("" when absent)."""
    about = author.get("about")
    if isinstance(about, str):
        return about
    if isinstance(about, (list, tuple, np.ndarray)):
        return " ".join(str(part) for part in about if part is not None)
    return ""


def _normalize_text(text) -> str:
    """Lower-case ``text``, drop braces and double quotes, collapse whitespace."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[{}\"]", "", text.lower())
    # Collapse whitespace after removing characters so no double spaces remain.
    return re.sub(r"\s+", " ", text).strip()


def process_dataframe(
    df: pd.DataFrame,
    book_price_floor: float = 1,
    book_price_cap: float = 120,
) -> pd.DataFrame:
    """Turn a raw metadata frame into a flat, null-free frame with a text document.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame. The columns read are ``main_category``, ``title``,
        ``average_rating``, ``rating_number``, ``features``, ``description``,
        ``price``, ``images``, ``categories``, ``details``, ``parent_asin``,
        ``subtitle`` and ``author``.
    book_price_floor, book_price_cap : float
        Inclusive price range used to compute the median price of "Books" rows.
        "Books" prices above ``book_price_cap`` are clipped to the cap and
        flagged in ``is_outlier``.

    Returns
    -------
    pd.DataFrame
        A new frame holding the selected/derived columns plus ``document_text``,
        the normalised concatenation of the text columns.

    Notes
    -----
    * ``is_free`` and ``is_missing`` describe the parsed price *before* imputation.
    * Placeholders are the string "Unknown", ``-1`` for ``page_count`` and
      ``price_temp_clean``; ``author_name`` is lower-cased, so its placeholder
      is "unknown".
    """
    df = df.copy()

    # Free-text list columns -> single strings.
    df["cleaned_features"] = df["features"].apply(_join_text_items)
    df["cleaned_description"] = df["description"].apply(_join_text_items)

    # Price parsing and flags derived from the raw parsed value.
    df["price_temp_raw"] = df["price"].apply(_parse_price).astype(float)
    df["is_free"] = (df["price_temp_raw"] == 0).astype(int)
    df["is_missing"] = df["price_temp_raw"].isna().astype(int)

    # Category masks are computed once and reused below.
    is_book = df["main_category"] == "Books"
    is_digital = df["main_category"].isin(["Buy a Kindle", "Audible Audiobooks"])

    in_range = df["price_temp_raw"].between(
        book_price_floor, book_price_cap, inclusive="both"
    )
    median_books = df.loc[is_book & in_range, "price_temp_raw"].median()

    # Imputation: missing Kindle/Audible prices -> 0, missing Books prices ->
    # in-range Books median, Books prices above the cap -> the cap.
    df["price_temp_clean"] = df["price_temp_raw"].copy()
    df.loc[df["price_temp_clean"].isna() & is_digital, "price_temp_clean"] = 0
    df.loc[df["price_temp_clean"].isna() & is_book, "price_temp_clean"] = median_books
    is_outlier = is_book & (df["price_temp_raw"] > book_price_cap)
    df.loc[is_outlier, "price_temp_clean"] = book_price_cap
    df["is_outlier"] = is_outlier.astype(int)

    df["main_images"] = df["images"].apply(_first_large_image).astype(str)
    df["categories"] = df["categories"].apply(_format_categories)

    # The raw details value is kept for the document; the parsed dict feeds the
    # structured fields.
    df["details_text"] = df["details"]
    df["details_dict"] = df["details"].apply(_parse_mapping)
    detail_fields = {
        "publisher": "Publisher",
        "language": "Language",
        "isbn_10": "ISBN 10",
        "isbn_13": "ISBN 13",
    }
    for column, key in detail_fields.items():
        df[column] = df["details_dict"].apply(
            lambda d, key=key: d.get(key) if isinstance(d, dict) else None
        )
    df["page_count"] = df["details_dict"].apply(
        lambda d: (d.get("Hardcover") or d.get("Paperback")) if isinstance(d, dict) else None
    )

    df = df.drop(columns=["bought_together", "videos", "store"], errors="ignore")

    # Author information (dict, or a string holding a dict; otherwise empty).
    authors = df["author"].apply(lambda a: _parse_mapping(a) or {})
    df["author_name"] = authors.apply(lambda a: a.get("name"))
    df["author_about"] = authors.apply(_author_about)
    df["author_avatar"] = authors.apply(lambda a: a.get("avatar"))

    # unique_id must be taken before isbn_13 receives its placeholder.
    df["unique_id"] = df["isbn_13"].fillna(df["parent_asin"])
    df["isbn_10"] = df["isbn_10"].fillna("Unknown")
    df["isbn_13"] = df["isbn_13"].fillna("Unknown")

    # First run of digits (thousands separators removed); -1 when none is found.
    df["page_count"] = (
        df["page_count"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.extract(r"(\d+)", expand=False)
        .astype(float)
        .fillna(-1)
        .astype(int)
    )

    for col in ["main_category", "publisher", "language"]:
        df[col] = (
            df[col]
            .replace("None", np.nan)
            .fillna("Unknown")
            .astype(str)
            .str.strip()
        )

    df["subtitle"] = df["subtitle"].fillna("Unknown").astype(str)
    df["author_name"] = (
        df["author_name"].fillna("Unknown").astype(str).str.strip().str.lower()
    )

    cols_keep = [
        "main_category", "rating_number", "unique_id",
        "title", "subtitle", "cleaned_description", "cleaned_features",
        "is_free", "is_missing", "is_outlier", "details_text",
        "author_name", "author_about", "author_avatar",
        "categories", "publisher", "language",
        "average_rating", "isbn_10", "price_temp_clean", "page_count",
        "isbn_13", "main_images"
    ]
    df_rag = df[cols_keep].copy()

    text_cols = [
        "title", "subtitle", "cleaned_description", "cleaned_features",
        "author_about", "details_text", "categories", "publisher", "language"
    ]
    # Vectorised join of the text columns, separated by single spaces.
    text_parts = df_rag[text_cols].fillna("Unknown").astype(str)
    combined = text_parts[text_cols[0]].str.cat(
        [text_parts[col] for col in text_cols[1:]], sep=" "
    )
    df_rag["document_text"] = combined.apply(_normalize_text)

    df_rag["author_avatar"] = df_rag["author_avatar"].fillna("Unknown")
    df_rag["price_temp_clean"] = df_rag["price_temp_clean"].fillna(-1)

    return df_rag


In [3]:
# Trial run: process only shard 0 of 50 before running the full export.
num_shards = 50
shard = dataset.shard(index=0, num_shards=num_shards)
df = shard.to_pandas()
df_rag = process_dataframe(df)

# Write the processed shard to disk, then print a preview and its shape.
df_rag.to_parquet("amazon_books_shard_test2.parquet", index=False)
print(" Đã xử lý xong shard test")
print(df_rag.head(3))
print("Shape sau xử lý:", df_rag.shape)

  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý xong shard test
  main_category  rating_number       unique_id                        title  \
0         Books             29  978-0701169855                      Chaucer   
1         Books              1  978-0435088682      Notes from a Kidwatcher   
2         Books           3421  978-0316185363  Service: A Navy SEAL at War   

                              subtitle  \
0  Hardcover – Import, January 1, 2004   
1                        First Edition   
2              Hardcover – May 8, 2012   

                                 cleaned_description  \
0                                                      
1  About the Author SANDRA WILDE, Ph.D., is widel...   
2  Review Praise for SERVICE"An action-packed...r...   

                                    cleaned_features  is_free  is_missing  \
0                                                           0           0   
1  Contains 23 selected articles by this influent...        0           0   
2  Marcus Luttrell, author of th

In [4]:
# Per-column count of missing values left in the processed shard.
df_rag.isnull().sum()

main_category          0
rating_number          0
unique_id              0
title                  0
subtitle               0
cleaned_description    0
cleaned_features       0
is_free                0
is_missing             0
is_outlier             0
details_text           0
author_name            0
author_about           0
author_avatar          0
categories             0
publisher              0
language               0
average_rating         0
isbn_10                0
price_temp_clean       0
page_count             0
isbn_13                0
main_images            0
document_text          0
dtype: int64

In [5]:
def sanitize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Make every object-dtype column UTF-8 encodable, in place.

    Each non-missing cell of an object column is converted with ``str()`` and
    round-tripped through UTF-8 with ``errors="replace"``, so characters that
    cannot be encoded (such as lone surrogates) become ``"?"``. Missing cells
    (``None``, NaN, ``pd.NA``, ``pd.NaT``) are left missing instead of being
    turned into the literal strings "None" or "nan".

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Its object columns are overwritten.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for convenience.
    """
    def _to_utf8_safe(value):
        # Identity/float checks avoid pd.isna(), which returns an array for
        # list-like cells.
        if value is None or value is pd.NA or value is pd.NaT:
            return value
        if isinstance(value, float) and value != value:
            return value
        return str(value).encode("utf-8", "replace").decode("utf-8")

    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].map(_to_utf8_safe)
    return df

num_shards = 50

# Process the dataset shard by shard: convert each shard to pandas, run the
# pipeline, sanitize the strings, and write one parquet file per shard.
for i in range(num_shards):
    shard = dataset.shard(index=i, num_shards=num_shards)
    df = shard.to_pandas()
    df_rag = sanitize_strings(process_dataframe(df))

    out_path = f"/kaggle/working/amazon_books_shard_{i}.parquet"
    df_rag.to_parquet(out_path, index=False)
    print(f" Đã xử lý shard {i+1}/{num_shards} -> {out_path}")


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 1/50 -> /kaggle/working/amazon_books_shard_0.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 2/50 -> /kaggle/working/amazon_books_shard_1.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 3/50 -> /kaggle/working/amazon_books_shard_2.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 4/50 -> /kaggle/working/amazon_books_shard_3.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 5/50 -> /kaggle/working/amazon_books_shard_4.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 6/50 -> /kaggle/working/amazon_books_shard_5.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 7/50 -> /kaggle/working/amazon_books_shard_6.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 8/50 -> /kaggle/working/amazon_books_shard_7.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 9/50 -> /kaggle/working/amazon_books_shard_8.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 10/50 -> /kaggle/working/amazon_books_shard_9.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 11/50 -> /kaggle/working/amazon_books_shard_10.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 12/50 -> /kaggle/working/amazon_books_shard_11.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 13/50 -> /kaggle/working/amazon_books_shard_12.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 14/50 -> /kaggle/working/amazon_books_shard_13.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 15/50 -> /kaggle/working/amazon_books_shard_14.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 16/50 -> /kaggle/working/amazon_books_shard_15.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 17/50 -> /kaggle/working/amazon_books_shard_16.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 18/50 -> /kaggle/working/amazon_books_shard_17.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 19/50 -> /kaggle/working/amazon_books_shard_18.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 20/50 -> /kaggle/working/amazon_books_shard_19.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 21/50 -> /kaggle/working/amazon_books_shard_20.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 22/50 -> /kaggle/working/amazon_books_shard_21.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 23/50 -> /kaggle/working/amazon_books_shard_22.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 24/50 -> /kaggle/working/amazon_books_shard_23.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 25/50 -> /kaggle/working/amazon_books_shard_24.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 26/50 -> /kaggle/working/amazon_books_shard_25.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 27/50 -> /kaggle/working/amazon_books_shard_26.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 28/50 -> /kaggle/working/amazon_books_shard_27.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 29/50 -> /kaggle/working/amazon_books_shard_28.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 30/50 -> /kaggle/working/amazon_books_shard_29.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 31/50 -> /kaggle/working/amazon_books_shard_30.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 32/50 -> /kaggle/working/amazon_books_shard_31.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 33/50 -> /kaggle/working/amazon_books_shard_32.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 34/50 -> /kaggle/working/amazon_books_shard_33.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 35/50 -> /kaggle/working/amazon_books_shard_34.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 36/50 -> /kaggle/working/amazon_books_shard_35.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 37/50 -> /kaggle/working/amazon_books_shard_36.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 38/50 -> /kaggle/working/amazon_books_shard_37.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 39/50 -> /kaggle/working/amazon_books_shard_38.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 40/50 -> /kaggle/working/amazon_books_shard_39.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 41/50 -> /kaggle/working/amazon_books_shard_40.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 42/50 -> /kaggle/working/amazon_books_shard_41.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 43/50 -> /kaggle/working/amazon_books_shard_42.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 44/50 -> /kaggle/working/amazon_books_shard_43.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 45/50 -> /kaggle/working/amazon_books_shard_44.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 46/50 -> /kaggle/working/amazon_books_shard_45.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 47/50 -> /kaggle/working/amazon_books_shard_46.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 48/50 -> /kaggle/working/amazon_books_shard_47.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 49/50 -> /kaggle/working/amazon_books_shard_48.parquet


  return op(a, b)
  return op(a, b)
  return op(a, b)


 Đã xử lý shard 50/50 -> /kaggle/working/amazon_books_shard_49.parquet
