# Retreat JSON

Ratings

In [1]:
import rapidjson as json

with open("data/raw_ratings.json", "r") as f:
    ratings = [json.loads(line.strip()) for line in f.readlines()]

len(ratings), ratings[0]

(6739590,
 {'overall': 5.0,
  'vote': '67',
  'verified': True,
  'reviewTime': '09 18, 1999',
  'reviewerID': 'AAP7PPBU72QFM',
  'asin': '0151004714',
  'style': {'Format:': ' Hardcover'},
  'reviewerName': 'D. C. Carrad',
  'reviewText': 'This is the best novel I have read in 2 or 3 years.  It is everything that fiction should be -- beautifully written, engaging, well-plotted and structured.  It has several layers of meanings -- historical, family,  philosophical and more -- and blends them all skillfully and interestingly.  It makes the American grad student/writers\' workshop "my parents were  mean to me and then my professors were mean to me" trivia look  childish and silly by comparison, as they are.\nAnyone who says this is an  adolescent girl\'s coming of age story is trivializing it.  Ignore them.  Read this book if you love literature.\nI was particularly impressed with  this young author\'s grasp of the meaning and texture of the lost world of  French Algeria in the 1950\'s 

- Keep only verified content
- Change reviewTime into %Y%m%d format
- Removing 'style', 'reviewerName', 'unixReviewTime', 'verified' information
- Changing 'overall' to 'mark' and 'asin' to 'product', ...
- Adjust reviewText

In [2]:
from datetime import datetime
from tqdm import tqdm
import re
from bs4 import BeautifulSoup

def transform_and_filter(r):
    if r.get('verified') != True:
        return None

    new_r = r.copy()

    new_r['reviewTime'] = datetime.strptime(r['reviewTime'], '%m %d, %Y').strftime('%Y%m%d')

    for key in ['style', 'reviewerName', 'unixReviewTime', 'verified']:
        new_r.pop(key, None)

    if not 'reviewText' in new_r:
        new_r['reviewText'] = ''

    new_r['mark'] = new_r.pop('overall', None)
    new_r['product'] = new_r.pop('asin', None)
    new_r['date'] = new_r.pop('reviewTime', None)
    new_r['user'] = new_r.pop('reviewerID', None)
    new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
    new_r['title'] = new_r.pop('summary', None)

    return new_r

filtered_ratings = [transform_and_filter(r) for r in tqdm(ratings)]
filtered_ratings = [r for r in filtered_ratings if r is not None]
len(filtered_ratings), filtered_ratings[0]

  new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
  new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
100%|██████████| 6739590/6739590 [07:48<00:00, 14380.52it/s]


(6038416,
 {'vote': '67',
  'mark': 5.0,
  'product': '0151004714',
  'date': '19990918',
  'user': 'AAP7PPBU72QFM',
  'comment': 'This is the best novel I have read in 2 or 3 years. It is everything that fiction should be -- beautifully written, engaging, well-plotted and structured. It has several layers of meanings -- historical, family, philosophical and more -- and blends them all skillfully and interestingly. It makes the American grad student/writers\' workshop "my parents were mean to me and then my professors were mean to me" trivia look childish and silly by comparison, as they are.Anyone who says this is an adolescent girl\'s coming of age story is trivializing it. Ignore them. Read this book if you love literature.I was particularly impressed with this young author\'s grasp of the meaning and texture of the lost world of French Algeria in the 1950\'s and \'60\'s...particularly poignant when read in 1999 from another ruined and abandoned French colony, amid the decaying buil

In [3]:
with open("data/ratings.json", "w") as f:
    json.dump(filtered_ratings, f)

In [4]:
import rapidjson as json

with open("data/ratings.json", "r") as f:
    filtered_ratings = json.load(f)

Products

In [5]:
import concurrent.futures
import rapidjson as json

def load_json_line(line):
    return json.loads(line.strip())

products = []

with open("data/raw_products.json", "r") as f:
    lines = f.readlines()

with concurrent.futures.ThreadPoolExecutor() as executor:
    products = list(executor.map(load_json_line, lines))

len(products), products[0]

(786445,
 {'category': ['Electronics',
   'Camera &amp; Photo',
   'Video Surveillance',
   'Surveillance Systems',
   'Surveillance DVR Kits'],
  'tech1': '',
  'description': ['The following camera brands and models have been tested for compatibility with GV-Software.\nGeoVision \tACTi \tArecont Vision \tAXIS \tBosch \tCanon\nCNB \tD-Link \tEtroVision \tHikVision \tHUNT \tIQEye\nJVC \tLG \tMOBOTIX \tPanasonic \tPelco \tSamsung\nSanyo \tSony \tUDP \tVerint \tVIVOTEK \t \n \nCompatible Standard and Protocol\nGV-System also allows for integration with all other IP video devices compatible with ONVIF(V2.0), PSIA (V1.1) standards, or RTSP protocol.\nONVIF \tPSIA \tRTSP \t  \t  \t \nNote: Specifications are subject to change without notice. Every effort has been made to ensure that the information on this Web site is accurate. No liability is assumed for incidental or consequential damages arising from the use of the information or products contained herein.'],
  'fit': '',
  'title': 'Gen

In [6]:
from tqdm import tqdm
import re
from datetime import datetime

products_to_keep = set(r["product"] for r in filtered_ratings)
pattern = re.compile(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?')
date_format = '%B %d, %Y'
new_date_format = '%Y%m%d'

def extract_price(s):
    match = pattern.fullmatch(s)
    
    if match:
        return float(s.replace('$', '').replace(',', ''))
    return None

def transform_and_filter(p):
    asin = p["asin"]
    if asin not in products_to_keep:
        return None, asin

    new_p = {
        "id": asin,
        "categories": p["category"],
        "brand": p["brand"],
        "features": p["feature"],
        "price": extract_price(p["price"])
    }
    
    for i in range(len(new_p["categories"])):
        new_p["categories"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["categories"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
    
    for i in range(len(new_p["features"])):
        new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
    
    if new_p["price"] is None:
        return None, asin
    
    try:
        new_p["date"] = datetime.strptime(p["date"].strip(), '%B %d, %Y').strftime('%Y%m%d')
    except:
        return None, asin

    return new_p, 0

filtered_products = []
removed_products = []

for p in tqdm(products):
    new_p, removed = transform_and_filter(p)
    if new_p is None:
        removed_products.append(removed)
    else:
        filtered_products.append(new_p)

len(filtered_products), len(removed_products), filtered_products[0]

  new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
  new_p["categories"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["categories"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
  new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
100%|██████████| 786445/786445 [01:17<00:00, 10205.14it/s]


(69379,
 717066,
 {'id': '0594459451',
  'categories': ['Electronics', 'eBook Readers & Accessories', 'Power Cables'],
  'brand': 'Barnes &amp; Noble',
  'features': ["BUY MORE AND SAVE! Purchase 2 of this Item and SAVE 25% Buy 3 SAVE 30% Buy 4 SAVE 32% Here's how (restrictions apply)"],
  'price': 6.04,
  'date': '20140801'})

In [7]:
unique_categories = {}
index = 0

for d in filtered_products:
    for category in d['categories']:
        if category not in unique_categories:
            unique_categories[category] = index
            index += 1

for d in filtered_products:
    d['categories'] = [unique_categories[cat] for cat in d['categories']]

with open("data/categories.json", "w") as f:
    json.dump(unique_categories, f)

filtered_products[0]

{'id': '0594459451',
 'categories': [0, 1, 2],
 'brand': 'Barnes &amp; Noble',
 'features': ["BUY MORE AND SAVE! Purchase 2 of this Item and SAVE 25% Buy 3 SAVE 30% Buy 4 SAVE 32% Here's how (restrictions apply)"],
 'price': 6.04,
 'date': '20140801'}

In [8]:
unique_features = {}
index = 0

for d in filtered_products:
    for feature in d['features']:
        if feature not in unique_features:
            unique_features[feature] = index
            index += 1

for d in filtered_products:
    d['features'] = [unique_features[feat] for feat in d['features']]

with open("data/features.json", "w") as f:
    json.dump(unique_features, f)

filtered_products[0]

{'id': '0594459451',
 'categories': [0, 1, 2],
 'brand': 'Barnes &amp; Noble',
 'features': [0],
 'price': 6.04,
 'date': '20140801'}

In [9]:
products_to_keep = set(p["id"] for p in filtered_products)
filtered_ratings = [f for f in filtered_ratings if f["product"] in products_to_keep]

len(filtered_ratings), len(filtered_products)

(3461917, 69379)

In [10]:
with open("data/ratings.json", "w") as f:
    json.dump(filtered_ratings, f)