In [1]:
from datetime import datetime
from tqdm import tqdm
import re
from bs4 import BeautifulSoup

def transform_and_filter(r):
    if r.get('verified') != True:
        return None

    new_r = r.copy()

    new_r['reviewTime'] = datetime.strptime(r['reviewTime'], '%m %d, %Y').strftime('%Y%m%d')

    for key in ['style', 'reviewerName', 'unixReviewTime', 'verified']:
        new_r.pop(key, None)

    if not 'reviewText' in new_r:
        new_r['reviewText'] = ''

    new_r['mark'] = new_r.pop('overall', None)
    new_r['product'] = new_r.pop('asin', None)
    new_r['date'] = new_r.pop('reviewTime', None)
    new_r['user'] = new_r.pop('reviewerID', None)
    new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
    new_r['title'] = new_r.pop('summary', None)

    if "vote" in new_r:
        new_r.pop("vote")
    
    if "image" in new_r:
        new_r.pop("image")

    return new_r

In [2]:
import rapidjson as json
import pandas as pd
import os

def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

for file_name in os.listdir("./data/"):
    if file_name.split(".")[-1] != "json":
        continue
    
    file = f"./data/{file_name}"

    lines = buf_count_newlines_gen(file)
    data = []

    with open(file) as f:
        for i in tqdm(range(lines)):
            line = f.readline()
            
            converted_line = transform_and_filter(json.loads(line))
            if converted_line is None:
                continue

            data.append(converted_line)

    pd.DataFrame(data).drop_duplicates().to_csv(f"./data/{file_name}"[:-4] + "csv", index=False)

  new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
  new_r['comment'] = re.sub(' +', ' ', BeautifulSoup(new_r.pop('reviewText', None), 'html.parser').get_text().strip().replace('\n', '').strip())
100%|██████████| 51311621/51311621 [1:22:25<00:00, 10376.42it/s] 
100%|██████████| 20994353/20994353 [26:44<00:00, 13083.38it/s]
100%|██████████| 12980837/12980837 [16:56<00:00, 12774.31it/s]


StopIteration: 

In [8]:
import csv
import os
from tqdm import tqdm

output_file_path = "./data/all_ratings.csv"

with open(output_file_path, 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    
    first_file = True
    
    for file_name in tqdm(os.listdir("./data/")):
        if file_name.split(".")[-1] != "csv" or file_name == output_file_path.split("/")[-1]:
            continue

        file_path = f"./data/{file_name}"

        with open(file_path, 'r') as input_file:
            reader = csv.reader(input_file)
            
            try:
                header = next(reader)
            except StopIteration:
                print(f"{file_name} is empty. Skipping.")
                continue
            
            if first_file:
                writer.writerow(header)
                first_file = False
            
            for row in reader:
                writer.writerow(row)

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [12:35<00:00, 107.89s/it]


In [1]:
from datetime import datetime
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
import pandas as pd

products_to_keep = set(pd.read_csv("./data/all_ratings.csv", usecols=["product"])["product"].values)
pattern = re.compile(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?')
date_format = '%B %d, %Y'
new_date_format = '%Y%m%d'

def extract_price(s):
    match = pattern.fullmatch(s)
    
    if match:
        return float(s.replace('$', '').replace(',', ''))
    return 0

def transform_products(p):
    asin = p["asin"]
    if not asin in products_to_keep:
        return None

    new_p = {
        "id": asin,
        "categories": p["category"],
        "brand": p["brand"],
        "features": p["feature"],
        "price": extract_price(p["price"])
    }
    
    for i in range(len(new_p["categories"])):
        new_p["categories"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["categories"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
    
    for i in range(len(new_p["features"])):
        new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())

    try:
        new_p["date"] = datetime.strptime(p["date"].strip(), '%B %d, %Y').strftime('%Y%m%d')
    except:
        new_p["date"] = datetime(1970, 1, 1).strftime('%Y%m%d')

    return new_p

In [2]:
len(products_to_keep)

4071142

In [3]:
import rapidjson as json
import pandas as pd
import os

def process_file(file):
    data = []
    ko_lines = 0
    with open(file) as f:
        for line in tqdm(f):
            transformed_data = transform_products(json.loads(line))
            if transformed_data is not None:
                data.append(transformed_data)
            else:
                ko_lines += 1

    if data:
        print(len(data), ko_lines)
        pd.DataFrame(data).drop_duplicates(subset=["id"]).to_csv(f"./data/{os.path.basename(file).split('.')[0]}.csv", index=False)

for file in [f"./data/{file_name}" for file_name in os.listdir("./data/") if file_name.endswith(".json") and "meta" in file_name]:
    print(f"Processing {file}")
    process_file(file)

Processing ./data/meta_Books.json


  new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
  new_p["categories"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["categories"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
2934949it [05:55, 8261.17it/s] 


2438259 496690
Processing ./data/meta_Electronics.json


  new_p["features"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["features"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
786445it [05:10, 2529.70it/s]


738928 47517
Processing ./data/meta_Sports_and_Outdoors.json


  new_p["categories"][i] = re.sub(' +', ' ', BeautifulSoup(new_p["categories"][i], 'html.parser').get_text().strip().replace('\n', '').strip())
962300it [08:01, 2000.23it/s]


924272 38028


In [4]:
import csv
import os
from tqdm import tqdm

output_file_path = "./data/all_products.csv"

with open(output_file_path, 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    
    first_file = True
    
    for file_name in tqdm(os.listdir("./data/")):
        if file_name.split(".")[-1] != "csv" or not "meta" in file_name:
            continue

        file_path = f"./data/{file_name}"

        with open(file_path, 'r') as input_file:
            reader = csv.reader(input_file)
            
            try:
                header = next(reader)
            except StopIteration:
                print(f"{file_name} is empty. Skipping.")
                continue
            
            if first_file:
                writer.writerow(header)
                first_file = False
            
            for row in reader:
                writer.writerow(row)

100%|██████████| 14/14 [00:26<00:00,  1.87s/it]


In [1]:
import pandas as pd

products = pd.read_csv("./data/all_products.csv")
products.head()

Unnamed: 0,id,categories,brand,features,price,date
0,0000092878,[],Keith Graham,[],39.94,19700101
1,000047715X,"['Books', 'New, Used & Rental Textbooks', 'Med...",Acp,[],0.0,19700101
2,0000004545,"['Books', 'Arts & Photography', 'Music']",Burkhard Jarisch,[],199.99,19700101
3,0000013765,"['Books', 'Arts & Photography', 'Music']",Stamps/Baxter,[],0.0,19700101
4,0000477141,"['Books', 'Medical Books', 'Medicine']",ACP,[],0.0,19700101


In [2]:
import json
import ast

products['categories'] = products['categories'].apply(ast.literal_eval)
products['features'] = products['features'].apply(ast.literal_eval)

all_categories = [cat for sublist in products['categories'].tolist() for cat in sublist]
all_features = [feature for sublist in products['features'].tolist() for feature in sublist]
category_dict = {k: v for v, k in enumerate(set(all_categories))}
feature_dict = {k: v for v, k in enumerate(set(all_features))}

products['categories'] = products['categories'].apply(lambda x: [category_dict[cat] for cat in x])
products['features'] = products['features'].apply(lambda x: [feature_dict[feature] for feature in x])


with open("./data/categories.json", "w") as f:
    json.dump(category_dict, f)

with open("./data/features.json", "w") as f:
    json.dump(feature_dict, f)

products.to_csv("./data/all_products_simplified.csv", index=False)
products.head()

Unnamed: 0,id,categories,brand,features,price,date
0,0000092878,[],Keith Graham,[],39.94,19700101
1,000047715X,"[89792, 105868, 122041]",Acp,[],0.0,19700101
2,0000004545,"[89792, 77019, 12065]",Burkhard Jarisch,[],199.99,19700101
3,0000013765,"[89792, 77019, 12065]",Stamps/Baxter,[],0.0,19700101
4,0000477141,"[89792, 25301, 6735]",ACP,[],0.0,19700101


In [6]:
import pandas as pd
import json

ratings = pd.read_csv("./data/all_ratings.csv", usecols=["mark", "product", "date", "user"])

users_dict = {k: v for v, k in enumerate(set(ratings['user']))}

ratings['user'] = ratings['user'].apply(lambda x: users_dict[x])

with open("./data/users.json", "w") as f:
    json.dump(users_dict, f)

ratings.to_csv("./data/all_ratings_simplified.csv", index=False)
ratings.head()

Unnamed: 0,mark,product,date,user
0,5.0,1713353,20161003,16201208
1,5.0,1713353,20160729,225610
2,5.0,1713353,20160620,18037271
3,5.0,1713353,20160424,8215727
4,5.0,1713353,20160214,8084801
