In [3]:
import pandas as pd
from copy import deepcopy
import json
import random
import re

In [8]:
# Read the datasets

# amazon_books = pd.read_csv('./datasets/amazon_books_data.csv')
amazon_reviews = pd.read_csv('./datasets/amazon_books_rating.csv')

In [None]:
amazon_books.columns

In [3]:
def drop_columns(df, columns_list):
    return df.drop(columns=columns_list)

In [None]:
drop_columns(amazon_books,["ratingsCount", "infoLink"])

In [4]:
def create_dict_from_df(df, group_by_column):
    return df.groupby(group_by_column).apply(lambda x: x.to_dict(orient='records')).to_dict()


In [None]:
create_dict_from_df(amazon_books, "Title")

In [5]:
def remove_square_brackets(str):
    open_bracs = str.find("[")
    close_bracs = str.find("]")
    return str[open_bracs + 2: close_bracs - 1]

def convert_string_to_list(str_list):
    if not isinstance(str_list, str) or str_list == "" :
        return []
    str_list = remove_square_brackets(str_list).strip()
    if not str_list:
        return []
    
    return str_list.split(",")

def convert_authors_and_categs_to_list(dataset):
    for _, records in dataset.items():
        for record in records:
            categs = convert_string_to_list(record["categories"])
            authors = convert_string_to_list(record["authors"])
            record["categories"] = categs
            record["authors"] = authors
    return dataset

def rename_column(dict, old_name, new_name):
    for _, entries in dict.items():
        for entry in entries:
            value = entry[old_name]
            entry[new_name] = value
            del entry[old_name]
    return dict


In [5]:
def remove_non_useful_books(dict):
    """Removes a books if it is missing categories or description data"""
    new_dict = {}
    for title, books in dict.items():
        new_books = []
        for book in books:
            desc = book["description"]
            if book["categories"] != [] and not (isinstance(desc, float)):
                 new_books.append(book)

        if new_books:
            new_dict[title] = new_books
    return new_dict


In [6]:
def create_json(filename, dict): 
    json_str = json.dumps(dict, indent=4)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(json_str)

In [14]:
def clean_up_amazon_books(df):
    df = drop_columns(df, ["ratingsCount", "infoLink"])
    books_dict = create_dict_from_df(df, "Title")
    books_dict = convert_authors_and_categs_to_list(books_dict)
    books_dict = rename_column(books_dict, "Title", "title")
    books_dict = remove_non_useful_books(books_dict)
    return books_dict

In [None]:
books = clean_up_amazon_books(amazon_books)

In [None]:
num_books = 70000
books_sample = dict(random.sample(list(books.items()), num_books))
create_json("books.json", books_sample)

In [None]:
num_books_with_all_cols = 0
num_books_with_same_title = 0
for _, book_list in books.items():
    length = len(book_list)
    num_books_with_all_cols += length
    num_books_with_all_cols += 1 if length > 1 else 0
num_books_with_all_cols, num_books_with_same_title


In [63]:
popular_books = pd.read_csv('datasets/popular2.csv')

In [64]:
popular_books.columns

Index(['title', 'series', 'author', 'rating', 'description', 'language',
       'isbn', 'genres', 'characters', 'bookFormat', 'edition', 'pages',
       'publisher', 'publishDate', 'firstPublishDate', 'awards', 'numRatings',
       'ratingsByStars', 'likedPercent', 'setting', 'bbeScore', 'bbeVotes',
       'price'],
      dtype='object')

In [65]:
popular_books[:5]

Unnamed: 0,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,...,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,bbeScore,bbeVotes,price
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",Hardcover,...,09/14/08,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",2993816,30516,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",Paperback,...,09/28/04,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,2632233,26923,7.38
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",Paperback,...,05/23/06,07-11-1960,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",2269402,23328,
3,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...","['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",Paperback,...,10-10-2000,01/28/13,[],2998241,"['1617567', '816659', '373311', '113934', '767...",94.0,"['United Kingdom', 'Derbyshire, England (Unite...",1983116,20452,
4,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",Paperback,...,09-06-2006,10-05-2005,"['Georgia Peach Book Award (2007)', 'Buxtehude...",4964519,"['1751460', '1113682', '1008686', '542017', '5...",78.0,"['Forks, Washington (United States)', 'Phoenix...",1459448,14874,2.1


In [66]:
drop_columns(popular_books,['series', 'rating', 'language',
       'isbn','characters', 'bookFormat', 'edition', 'pages',
       'publisher', 'firstPublishDate', 'awards', 'numRatings',
       'ratingsByStars', 'likedPercent', 'setting', 'bbeScore', 'bbeVotes',
       'price'])

Unnamed: 0,title,author,description,genres,publishDate
0,The Hunger Games,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",09/14/08
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",09/28/04
2,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",05/23/06
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,"['Classics', 'Fiction', 'Romance', 'Historical...",10-10-2000
4,Twilight,Stephenie Meyer,About three things I was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",09-06-2006
...,...,...,...,...,...
52473,Fractured,Cheri Schmidt (Goodreads Author),The Fateful Trilogy continues with Fractured. ...,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",May 28th 2011
52474,Anasazi,Emma Michaels,"'Anasazi', sequel to 'The Thirteenth Chime' by...","['Mystery', 'Young Adult']",August 5th 2011
52475,Marked,Kim Richardson (Goodreads Author),--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",March 18th 2011
52476,Wayward Son,"Tom Pollack (Goodreads Author), John Loftus (G...",A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,"['Fiction', 'Mystery', 'Historical Fiction', '...",September 1st 2011


In [30]:
def remove_square_brackets(str):
    open_bracs = str.find("[")
    close_bracs = str.find("]")
    return str[open_bracs + 2: close_bracs - 1]

def convert_string_to_list(str_list):
    if not isinstance(str_list, str) or str_list == "" :
        return []
    str_list = remove_square_brackets(str_list).strip()
    if not str_list:
        return []
    return [item.strip(" '\"") for item in str_list.split(",")]

def categs_to_list(dataset):
    for _, records in dataset.items():
        for record in records:
            categs = convert_string_to_list(record["categories"])
            if len(categs) == 1 and categs[0] == "":
                categs = []
            record["categories"] = categs
    return dataset

def remove_parentheses(text: str) -> str:
    return re.sub(r"\s*\([^)]*\)", "", text).strip()

def authors_to_list(dataset):
    for _, records in dataset.items():
        for record in records:
            author_field = record.get("authors", "")
            if isinstance(author_field, str) and author_field.strip():
                cleaned = remove_parentheses(author_field)
                record["authors"] = [cleaned]
            elif not author_field:
                record["authors"] = []
    return dataset

def remove_nan_values(dict):
    NOT_AVAILABLE = "NOT AVAILABLE"
    for title, entries in dict.items():
        for entry in entries:
            for key, value in entry.items():
                if not isinstance(value, list) and pd.isna(value):  # works for both float NaN and None
                    entry[key] = NOT_AVAILABLE
    return dict




In [114]:
def clean_popular_books(df):
   df = drop_columns(df, [])
   df = df[df['language'].str.lower() == 'english']
   df = drop_columns(df,['series', 'rating', 'language',
      'isbn','characters', 'bookFormat', 'edition', 'pages',
      'publisher', 'firstPublishDate', 'awards', 'numRatings',
      'ratingsByStars', 'likedPercent', 'setting', 'bbeScore', 'bbeVotes',
      'price'])
   
   popular_dict = create_dict_from_df(df, "title")

   popular_dict = rename_column(popular_dict, 'genres', 'categories')
   popular_dict = categs_to_list(popular_dict)

   popular_dict = rename_column(popular_dict, 'author', 'authors')
   popular_dict = authors_to_list(popular_dict)

   popular_dict = remove_nan_values(popular_dict)

   return popular_dict

In [115]:
popular = clean_popular_books(popular_books)
popular

  return df.groupby(group_by_column).apply(lambda x: x.to_dict(orient='records')).to_dict()


{'"A Problem from Hell": America and the Age of Genocide': [{'title': '"A Problem from Hell": America and the Age of Genocide',
   'publishDate': 'May 6th 2003',
   'categories': ['Nonfiction',
    'History',
    'Politics',
    'War',
    'International Relations',
    'Africa',
    'American History',
    'Political Science',
    'Social Justice',
    'School'],
   'authors': ['Samantha Power']}],
 '"Dead Rock Stars"': [{'title': '"Dead Rock Stars"',
   'description': "Emma Imrie was a Plath-obsessed, self-taught teenage musician dreaming of fame, from a remote village on the Isle of Wight. She found it too, briefly becoming a star of the nineties Camden music scene. But then she died in mysterious circumstances.In the aftermath of Emma's death, her younger brother, Jeff, is forced by their parents to stay at the opulent home of childhood friends on the island.During a wild summer of beach parties and music, Jeff faces up to the challenges that come with young love, youthful ambition

In [116]:
popular_json = json.dumps(popular, indent=4)
with open("popular.json", "w", encoding="utf-8") as file:
    file.write(popular_json)

In [None]:

# NOTE:Any cell below this is for data exploration, and can be ignored


In [9]:
amazon_reviews.columns

Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')

In [10]:
amazon_reviews = amazon_reviews.drop(columns=["Id", "User_id", "profileName", "review/helpfulness","review/time","review/summary"])
amazon_reviews.columns

Index(['Title', 'Price', 'review/score', 'review/text'], dtype='object')

In [None]:
amazon_books_dict = amazon_books.groupby('Title').apply(lambda x: x.to_dict(orient='records')).to_dict()

In [11]:
amazon_reviews_dict = amazon_reviews.groupby('Title').apply(lambda x: x.to_dict(orient='records')).to_dict()

  amazon_reviews_dict = amazon_reviews.groupby('Title').apply(lambda x: x.to_dict(orient='records')).to_dict()


In [None]:
count = 5
print("\n")
print("Sample Amazon Books:")
for title, entries in list(amazon_books_dict.items())[:count]:
    print(f"Title: {title}")
    print(entries)
    print("\n")

print("\n")
print("Sample Amazon Reviews:")
for title, entries in list(amazon_reviews_dict.items())[:count]:
    print(f"Title: {title}")
    print(entries)
    print("\n")

In [12]:
amazon_reviews_dict["Dr. Seuss: American Icon"]

[{'Title': 'Dr. Seuss: American Icon',
  'Price': nan,
  'review/score': 5.0,
  'review/text': "I don't care much for Dr. Seuss but after reading Philip Nel's book I changed my mind--that's a good testimonial to the power of Rel's writing and thinking. Rel plays Dr. Seuss the ultimate compliment of treating him as a serious poet as well as one of the 20th century's most interesting visual artists, and after reading his book I decided that a trip to the Mandeville Collections of the library at University of California in San Diego was in order, so I could visit some of the incredible Seuss/Geisel holdings they have there.There's almost too much to take in, for, like William Butler Yeats, Seuss led a career that constantly shifted and metamoprhized itself to meet new historical and political cirsumstances, so he seems to have been both a leftist and a conservative at different junctures of his career, both in politics and in art. As Nel shows us, he was once a cartoonist for the fabled P

In [13]:

nan = float('nan')
def clean_up_amazon_reviews_dict(reviews):
    new_amazon_reviews_dict = {}
    for book_title, entries in reviews.items():
        title = book_title
        price = 0
        avg_rating = 0
        reviews = []
        for entry in entries:
            p = entry["Price"]
            rat = entry["review/score"]
            rev = entry["review/text"]

            rat, rev
            if p is not nan:
                price = p
            if rat is not nan:
                avg_rating += rat 
            reviews.append(rev)

        avg_rating = round(avg_rating / len(entries), 1)
        if price == 0:
            price = ""

        if avg_rating == 0:
            avg_rating = ""
        
        new_amazon_reviews_dict[title] = {
            "title": title,
            "price": price,
            "avg_rating": avg_rating,
            "reviews": reviews
        }
    return new_amazon_reviews_dict
new_amazon_reviews_dict = clean_up_amazon_reviews_dict(deepcopy(amazon_reviews_dict))
new_amazon_reviews_dict



{'" Film technique, " and, " Film acting "': {'title': '" Film technique, " and, " Film acting "',
  'price': nan,
  'avg_rating': 4.5,
  'reviews': ["This volume is actually two books. I won't comment on Film Acting as I'm not familiar with it but I do have a copy of Film Technique. A good little tome on the art of editing that is much easier to get into than Eisenstein's indigestable texts. It has been unfairly neglected by film schools over the year and I am surprised that no one has done a re-print.BTW - Stanley Kubrick reccomends this book to anyone who has any interest in the cinema.",
   'A must read for film lovers and students. Gives you an important outline of film basics and standards. This is a book within the bible of Film.']},
 '" We\'ll Always Have Paris": The Definitive Guide to Great Lines from the Movies': {'title': '" We\'ll Always Have Paris": The Definitive Guide to Great Lines from the Movies',
  'price': nan,
  'avg_rating': 5.0,
  'reviews': ["You would only be 

In [14]:
new_amazon_reviews_dict["Dr. Seuss: American Icon"]

{'title': 'Dr. Seuss: American Icon',
 'price': nan,
 'avg_rating': 4.6,
 'reviews': ["I don't care much for Dr. Seuss but after reading Philip Nel's book I changed my mind--that's a good testimonial to the power of Rel's writing and thinking. Rel plays Dr. Seuss the ultimate compliment of treating him as a serious poet as well as one of the 20th century's most interesting visual artists, and after reading his book I decided that a trip to the Mandeville Collections of the library at University of California in San Diego was in order, so I could visit some of the incredible Seuss/Geisel holdings they have there.There's almost too much to take in, for, like William Butler Yeats, Seuss led a career that constantly shifted and metamoprhized itself to meet new historical and political cirsumstances, so he seems to have been both a leftist and a conservative at different junctures of his career, both in politics and in art. As Nel shows us, he was once a cartoonist for the fabled PM magazin

In [12]:
print(amazon_books_dict["Death Dream"][0]["categories"])
print(amazon_books_dict["Dr. Seuss: American Icon"][0]["categories"])

NameError: name 'amazon_books_dict' is not defined

In [None]:
amazon_books_titles = set(amazon_books_dict.keys())
print(f"Number of amazon books: {len(amazon_books_titles)} \n Number of books in both: {len(similar)}")

In [None]:
complete_books_dataset = {}
complete_reviews_dataset = {}

for title, record in complete_dataset.items():
    complete_reviews_dataset[title] = record["reviews"]

    new_record = deepcopy(record)
    del new_record["reviews"]
    complete_books_dataset[title] = new_record


In [16]:
# books_json = json.dumps(complete_books_dataset, indent=4)
# with open("books.json", "w", encoding="utf-8") as file:
#     file.write(books_json)

reviews_json = json.dumps(new_amazon_reviews_dict, indent=4)


In [None]:
# with open("reviews.json", "w", encoding="utf-8") as file:
#     file.write(reviews_json)

OSError: [Errno 28] No space left on device

In [22]:
with open('../data/books.json', 'r') as file:
    books = json.load(file)

In [2]:
reviews = new_amazon_reviews_dict

NameError: name 'new_amazon_reviews_dict' is not defined

In [1]:
all_titles = list(books.keys())

# Step 2: Randomly select 10,000 unique titles
sample_titles = random.sample(all_titles, 10000)

# Step 3: Create new dictionaries for the sampled books and their reviews
sampled_books = {title: books[title] for title in sample_titles}
sampled_reviews = {title: reviews.get(title, []) for title in sample_titles}

NameError: name 'books' is not defined

In [32]:
for books in sampled_books.values():
    for book in books:
        reviews = book.get("reviews", [])
        book["reviews"] = random.sample(reviews, min(2, len(reviews)))


In [33]:
with open("sampled_books.json", "w") as f:
    json.dump(sampled_books, f, indent=2)

with open("sampled_reviews.json", "w") as f:
    json.dump(sampled_reviews, f, indent=2)


In [None]:
# with open('../data/sampled_books.json', 'r') as file:
#     sampled_books = json.load(file)

# with open('../data/sampled_reviews.json', 'r') as file:
#     sampled_reviews = json.load(file)

In [8]:
all_titles = list(sampled_books.keys())

# Step 2: Randomly select 10,000 unique titles
sample_titles = random.sample(all_titles, 5000)

# Step 3: Create new dictionaries for the sampled books and their reviews
sampled_books = {title: books[title] for title in sample_titles}
sampled_reviews = {title: reviews.get(title, []) for title in sample_titles}

In [None]:
# with open("sampled_books.json", "w") as f:
#     json.dump(sampled_books, f, indent=2)

# with open("sampled_reviews.json", "w") as f:
#     json.dump(sampled_reviews, f, indent=2)

In [4]:

with open('../data/sampled_reviews.json', 'r') as file:
    sampled_reviews = json.load(file)

  
# with open('../data/sampled_books.json', 'r') as file:
#     sampled_books = json.load(file)  


In [5]:

def remove_nan(dict):
    NOT_AVAILABLE = "NOT AVAILABLE"
    for title, entry in dict.items():
        for key, value in entry.items():
            if not isinstance(value, list) and pd.isna(value):  # works for both float NaN and None
                entry[key] = NOT_AVAILABLE
    return dict


sampled_reviews = remove_nan(sampled_reviews)


In [6]:
with open("sampled_reviews.json", "w") as f:
    json.dump(sampled_reviews, f, indent=2)

# with open("sampled_books.json", "w") as f:
#     json.dump(sampled_books, f, indent=2)

In [2]:

def remove_nan(dict):
    NOT_AVAILABLE = "NOT AVAILABLE"
    for title, entry in dict.items():
        for key, value in entry.items():
            if not isinstance(value, list) and pd.isna(value):  # works for both float NaN and None
                entry[key] = NOT_AVAILABLE
    return dict

with open('../data/sampled_reviews.json', 'r') as file:
    sampled_reviews = json.load(file)

sampled_reviews = remove_nan(sampled_reviews)

with open("sampled_reviews.json", "w") as f:
    json.dump(sampled_reviews, f, indent=2)