In [None]:
import pandas as pd
from copy import deepcopy
import json
import random

In [None]:
# Read the datasets
# Book metadata is loaded into a DataFrame from a CSV in the working directory.

amazon_books = pd.read_csv('amazon_books_data.csv')
# The reviews load is disabled. Cells further down that reference
# `amazon_reviews` need this line re-enabled before they can run.
#amazon_reviews = pd.read_csv('amazon_books_rating.csv')

In [None]:
# Inspect the column labels of the books frame.
amazon_books.columns

In [None]:
def drop_columns(df, columns_list):
    """Return a new frame equal to ``df`` minus the columns in ``columns_list``.

    ``df`` itself is not modified. A label that is not a column of ``df``
    makes pandas raise ``KeyError``.
    """
    trimmed = df.drop(labels=columns_list, axis="columns")
    return trimmed

In [None]:
# Preview only: the returned frame is not assigned, so `amazon_books` keeps all of its columns.
drop_columns(amazon_books,["ratingsCount", "infoLink"])

In [None]:
def create_dict_from_df(df, group_by_column):
    """Group ``df`` by one column and return ``{group key: [row dicts]}``.

    Each value is the list produced by ``DataFrame.to_dict(orient="records")``
    for the rows of that group, and every record keeps the grouping column
    itself. Rows whose key is missing (NaN) are skipped, following the
    ``groupby`` default. An empty frame yields an empty dict.

    Args:
        df: Source DataFrame.
        group_by_column: Label of the single column to group on.

    Returns:
        dict mapping each distinct key to a list of record dicts.
    """
    # Iterate over the groups instead of using ``groupby(...).apply``: applying
    # over the grouping column is deprecated and newer pandas drops that column
    # from each group, which would remove the key from the records.
    return {
        key: group.to_dict(orient="records")
        for key, group in df.groupby(group_by_column)
    }


In [None]:
# Show only the first few title groups; displaying the whole mapping floods the cell output.
list(create_dict_from_df(amazon_books, "Title").items())[:3]

In [None]:
def remove_square_brackets(str):
    """Return the text inside the first ``[...]`` pair, without its wrapping quotes.

    For a stringified list such as ``"['Fiction']"`` this returns ``Fiction``.
    Only the outermost quote pair is removed, so ``"['A', 'B']"`` gives
    ``A', 'B``.

    Args:
        str: Text to process. The parameter name shadows the builtin and is
            kept only so existing callers keep working.

    Returns:
        The bracketed content, or the input unchanged when it has no
        ``[`` followed by a ``]``.
    """
    start = str.find("[")
    # Look for the closing bracket only after the opening one.
    end = str.find("]", start + 1)
    if start == -1 or end == -1:
        # Without this guard the -1 from find() would be used as a slice index.
        return str

    inner = str[start + 1:end]
    # Strip one wrapping quote pair only when it is really there.
    if len(inner) >= 2 and inner[0] in "'\"" and inner[-1] in "'\"":
        inner = inner[1:-1]
    return inner

def convert_string_to_list(str_list):
    """Turn a stringified list such as ``"['A', 'B']"`` into ``["A", "B"]``.

    Args:
        str_list: The cell value. Anything that is not a string (for example
            a NaN float from pandas) or is blank counts as "no data".

    Returns:
        A list of strings; empty when there is no data or the list is empty.
    """
    import ast

    if not isinstance(str_list, str) or str_list.strip() == "":
        return []

    # Parse the text as a Python literal first. This handles several elements,
    # either quote style, and commas inside an element.
    try:
        parsed = ast.literal_eval(str_list)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Fallback for text that is not a valid list literal: strip the outer
    # brackets, split on commas and clean each piece.
    inner = str_list.strip().lstrip("[").rstrip("]")
    pieces = (piece.strip().strip("'\"") for piece in inner.split(","))
    return [piece for piece in pieces if piece]

def convert_authors_and_categs_to_list(dataset):
    """Replace the "categories" and "authors" values of every record with lists.

    ``dataset`` maps a key to a list of record dicts. Each record is updated
    in place through ``convert_string_to_list`` and the same ``dataset``
    object is returned.
    """
    for records in dataset.values():
        for record in records:
            # Both conversions run before either key is overwritten.
            record["categories"], record["authors"] = (
                convert_string_to_list(record["categories"]),
                convert_string_to_list(record["authors"]),
            )
    return dataset

def rename_column(dict, old_name, new_name):
    """Rename the key ``old_name`` to ``new_name`` in every record, in place.

    Args:
        dict: Mapping of key -> list of record dicts. The parameter name
            shadows the builtin and is kept for existing callers.
        old_name: Key to rename. A record that lacks it raises ``KeyError``.
        new_name: Replacement key. If a record already has it, the value is
            overwritten.

    Returns:
        The same mapping object that was passed in.
    """
    for entries in dict.values():
        for entry in entries:
            # Pop before assigning so the value survives when both names are
            # equal; assign-then-delete would drop it.
            entry[new_name] = entry.pop(old_name)
    return dict


In [None]:
def remove_non_useful_books(dict):
    """Return a new mapping holding only books with categories and a description.

    A book is dropped when its "categories" value equals ``[]`` or when its
    "description" is a float, which is how a missing cell (NaN) shows up
    here. A title left with no books is omitted from the result. The input
    mapping is not modified.
    """
    filtered = {}
    for title, candidates in dict.items():
        kept = []
        for book in candidates:
            description = book["description"]
            if book["categories"] == [] or isinstance(description, float):
                continue
            kept.append(book)

        if kept:
            filtered[title] = kept
    return filtered


In [None]:
def _replace_non_finite(value):
    """Return ``value`` with NaN/Infinity floats replaced by None, recursing into dicts, lists and tuples."""
    if isinstance(value, float):
        # ``value != value`` is true only for NaN.
        if value != value or value in (float("inf"), float("-inf")):
            return None
        return value
    if isinstance(value, dict):
        return {key: _replace_non_finite(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_replace_non_finite(item) for item in value]
    return value


def create_json(filename, dict):
    """Write ``dict`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Missing values coming from pandas are NaN floats, which ``json.dumps``
    would emit as the bare token ``NaN``. That is not valid JSON and strict
    parsers reject it, so non-finite floats are written as ``null``.

    Args:
        filename: Path of the file to create or overwrite.
        dict: JSON-serialisable data. The parameter name shadows the builtin
            and is kept for existing callers.
    """
    # Serialise before opening the file so a serialisation error cannot
    # leave a truncated file behind.
    json_str = json.dumps(_replace_non_finite(dict), indent=4, allow_nan=False)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(json_str)

In [None]:
def clean_up_amazon_books(df):
    """Run the whole book clean-up and return ``{title: [book records]}``.

    Steps, in order: drop the "ratingsCount" and "infoLink" columns, group
    rows by "Title", convert the authors and categories strings to lists,
    rename the "Title" key to "title", and discard books that lack
    categories or a description.
    """
    trimmed = drop_columns(df, ["ratingsCount", "infoLink"])
    grouped = create_dict_from_df(trimmed, "Title")
    with_lists = convert_authors_and_categs_to_list(grouped)
    renamed = rename_column(with_lists, "Title", "title")
    return remove_non_useful_books(renamed)

In [None]:
# Run the clean-up pipeline on the books frame; the result maps each title to its list of book records.
books = clean_up_amazon_books(amazon_books)

In [None]:
# Export a random sample of titles to books.json.
max_books = 70000
sample_seed = 42  # fixed so that re-running the notebook writes the same sample

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of titles available.
num_books = min(max_books, len(books))
books_sample = dict(random.Random(sample_seed).sample(list(books.items()), num_books))
create_json("books.json", books_sample)

In [None]:
# Total number of kept book records, and the number of titles shared by more
# than one record. (The second counter used to stay at 0 because the
# increment was applied to the first counter.)
num_books_with_all_cols = sum(len(book_list) for book_list in books.values())
num_books_with_same_title = sum(1 for book_list in books.values() if len(book_list) > 1)
num_books_with_all_cols, num_books_with_same_title


In [None]:

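# NOTE: The cells below this point are for data exploration only and can be ignored.
# They use `amazon_reviews`, whose load is commented out in the data-loading cell above.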


In [None]:
# NOTE(review): in the cells visible here `amazon_reviews` is only created by
# the read_csv line that is commented out in the loading cell, so this raises
# NameError on a fresh kernel unless that line is re-enabled.
amazon_reviews.columns

In [None]:
# Drop the review columns that the exploration below does not use.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and drop() would otherwise raise KeyError.
amazon_reviews = amazon_reviews.drop(
    columns=["Id", "User_id", "profileName", "review/helpfulness", "review/time", "review/summary"],
    errors="ignore",
)
amazon_reviews.columns

In [None]:
# Group the raw book rows by title with the shared helper instead of repeating its body.
amazon_books_dict = create_dict_from_df(amazon_books, 'Title')

In [None]:
# Group the review rows by title with the shared helper instead of repeating its body.
amazon_reviews_dict = create_dict_from_df(amazon_reviews, 'Title')

In [None]:
count = 5


def _print_sample(heading, grouped, limit):
    """Print ``heading`` and then the first ``limit`` (title, entries) pairs of ``grouped``."""
    print("\n")
    print(heading)
    for title, entries in list(grouped.items())[:limit]:
        print(f"Title: {title}")
        print(entries)
        print("\n")


_print_sample("Sample Amazon Books:", amazon_books_dict, count)
_print_sample("Sample Amazon Reviews:", amazon_reviews_dict, count)

In [None]:
# Spot-check the raw review records grouped under one title.
amazon_reviews_dict["Dr. Seuss: American Icon"]

In [None]:

# Module-level NaN constant, kept for anything else that refers to `nan`.
# It is not used for missing-value checks: NaN never equals itself, and an
# identity test against this object does not match NaNs produced elsewhere.
nan = float('nan')


def _is_missing(value):
    """Return True for ``None`` and for a float NaN."""
    return value is None or (isinstance(value, float) and value != value)


def clean_up_amazon_reviews_dict(reviews):
    """Collapse the review records of each title into one summary record.

    Args:
        reviews: Mapping of title -> list of dicts with the keys "Price",
            "review/score" and "review/text". It is only read, never modified.

    Returns:
        Mapping of title -> dict with the keys:
            "title": the title;
            "price": the last non-missing "Price" seen, or "" when none is
                present or it is 0;
            "avg_rating": mean of the non-missing scores rounded to one
                decimal, or "" when there are none or the mean is 0;
            "reviews": every "review/text" value in input order.
    """
    summaries = {}
    for title, entries in reviews.items():
        price = 0
        valid_scores = []
        review_texts = []
        for entry in entries:
            entry_price = entry["Price"]
            score = entry["review/score"]

            if not _is_missing(entry_price):
                price = entry_price
            if not _is_missing(score):
                valid_scores.append(score)
            review_texts.append(entry["review/text"])

        # Average over the entries that actually carry a score, so unrated
        # entries do not pull the mean down.
        if valid_scores:
            avg_rating = round(sum(valid_scores) / len(valid_scores), 1)
        else:
            avg_rating = 0

        summaries[title] = {
            "title": title,
            "price": price if price != 0 else "",
            "avg_rating": avg_rating if avg_rating != 0 else "",
            "reviews": review_texts,
        }
    return summaries
# The clean-up only reads its input, so no defensive deepcopy of the whole mapping is needed.
new_amazon_reviews_dict = clean_up_amazon_reviews_dict(amazon_reviews_dict)
# Preview a few summaries rather than displaying the entire mapping.
list(new_amazon_reviews_dict.items())[:3]



In [None]:
# Summary record built for a single title by the reviews clean-up.
new_amazon_reviews_dict["Dr. Seuss: American Icon"]

In [None]:
# Show the raw "categories" value of the first record for two sample titles.
for sample_title in ("Death Dream", "Dr. Seuss: American Icon"):
    print(amazon_books_dict[sample_title][0]["categories"])

In [None]:
amazon_books_titles = set(amazon_books_dict.keys())
# NOTE(review): `similar` was not defined in any visible cell, so this raised
# NameError. Going by the printed label it is reconstructed here as the titles
# present in both the books and the reviews data. Confirm this is the intent.
amazon_reviews_titles = set(amazon_reviews_dict.keys())
similar = amazon_books_titles & amazon_reviews_titles
print(f"Number of amazon books: {len(amazon_books_titles)} \n Number of books in both: {len(similar)}")

In [None]:
# Split each record into a book part (everything except "reviews") and a
# reviews part, both keyed by title.
# NOTE(review): `complete_dataset` is not defined in any cell visible here, so
# this raises NameError on a fresh kernel. Presumably it came from a removed
# merge cell; confirm and restore it.
complete_books_dataset = {}
complete_reviews_dataset = {}

for title, record in complete_dataset.items():
    complete_reviews_dataset[title] = record["reviews"]

    # Work on a copy so the source record keeps its "reviews" entry.
    new_record = deepcopy(record)
    del new_record["reviews"]
    complete_books_dataset[title] = new_record


In [None]:
# Write through the shared helper instead of repeating its body.
# CAUTION: this targets the same "books.json" path as the sampled export
# earlier in the notebook and will overwrite that file.
create_json("books.json", complete_books_dataset)

# create_json("reviews.json", complete_reviews_dataset)

In [None]:
# Bare list of field names, shown as the cell output and not assigned to anything.
# NOTE(review): looks like the intended schema of a merged book record. Confirm.
[
    "description",
    "authors",
    "image",
    "previewLink",
    "publisher",
    "publishedDate",
    "categories",
    "title",
    "price",
    "avg_rating",
]