## Dataset preparation - Stage 1 - Extracting samples from Amazon Review Dataset 2018 (Cell Phones and Accessories)

In [1]:
import os
import json
import gzip
import pandas as pd
import re

In [2]:
# Source file: the "Cell Phones and Accessories" metadata dump (590,269 products)
# of the Amazon dataset 2018. Download it from:
# https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/
path_meta = "meta_Cell_Phones_and_Accessories.json.gz"

# The archive holds one JSON object per line; parse each line into a dict.
data = []
with gzip.open(path_meta) as meta_file:
    data.extend(json.loads(raw_line.strip()) for raw_line in meta_file)

# Number of parsed records, i.e. the number of products that were loaded.
print(len(data))

590071


In [3]:
# Peek at a single raw metadata record to see which fields it carries.
data[100]

{'category': ['Cell Phones & Accessories',
  'Accessories',
  'Chargers & Power Adapters',
  'Car Chargers'],
 'tech1': '',
 'description': ['Heavy Duty Plug-In Car / Vehicle Charger for Motorola W376g Phone!'],
 'fit': '',
 'title': 'Heavy Duty Plug-In Car / Vehicle Charger for Motorola W376g Phone!',
 'also_buy': [],
 'tech2': '',
 'brand': 'Wireless Solutions',
 'feature': ['Complete your phone accessories with this quality charger at a fraction of the price!',
  'Has a two stage digital chip built in for accurate charging at fast and slow speeds.',
  'High quality ABS grade plastic means this charger will outlast the competition.',
  'Powerfull Red LED lets you know when this is charging correctly.',
  'Cord is double insulated and shielded and measures up to 6ft long.'],
 'rank': ['>#4,911,957 in Cell Phones & Accessories (See Top 100 in Cell Phones & Accessories)',
  '>#47,820 in Cell Phones & Accessories > Cell Phone Accessories > Car Accessories > Car Chargers',
  '>#128,060 in

In [4]:
# Work on the leading slice of the dataset and keep only the fields used later.
# Note: the slice bound 100001 admits up to 100,001 records (indices 0..100000).
# A field that is absent from a record is simply left out of its trimmed copy.
wanted_keys = ['category', 'description', 'title', 'brand', 'feature', 'asin']
new_data = [
    {key: record[key] for key in wanted_keys if key in record}
    for record in data[:100001]
]

In [5]:
# Keep only the products that carry every required field with a usable value.
# A record is dropped when:
#   * a required key is absent (the projection step omits missing keys, and the
#     description-building cell indexes these keys directly, so a missing key
#     would otherwise surface there as a KeyError), or
#   * any of its string/list values is empty ('' or []).
# `data` is rebound here from the raw records to the filtered records.
required_keys = ('category', 'description', 'title', 'brand', 'feature', 'asin')
data = [
    record
    for record in new_data
    if all(key in record for key in required_keys)
    and not any(isinstance(value, (str, list)) and not value for value in record.values())
]

In [6]:
# Count the records that survived the empty-value filter in the previous cell.
len(data)  # products whose kept fields (category, description, feature, title, brand, asin) have no empty string/list value

63233

In [7]:
# Inspect one of the filtered records (trimmed to the selected fields).
data[50000]

{'category': ['Cell Phones & Accessories', 'Accessories', 'Cables'],
 'description': ['The Micro-USB connector is rated for 10,000 connect-disconnect cycles which is significantly more than the mini plug design, and now being widely adopted by cellphone and PDA manufacturers. These include various types of battery chargers, allowing Micro-USB to be the single external cable link.Length 15ft1 x Type A USB1 x Type Micro USB-MaleLifetime Warranty'],
 'title': '6 Ft Feet Sync &amp; charging Micro USB Data Cable Sanyo Taho Phone (Sprint)',
 'brand': 'Importer520',
 'feature': ['Brand New Generic Bulk Package',
  '2 in 1 Sync+Charge Micro USB Cable',
  'Compatible with: Garmin GPS Device dezl 560LMT 560LT nuLink 1695 nuvi 2300 2300LM 2350 2350LMT 2350LT 2360LMT 2370LT 2450 2450LM 2460LMT 3450 3450LM 3490LMT 3750 3760LMT 3760T 3790LMT 3790T',
  'Comaptible with Kodak EasyShare Digital Camera Models: M5370 M5350 M590 M583 M577 M552 M532 M550 M530 C195 C183 M522 C1550',
  'Compatible with HP To

In [8]:
# Build the final samples: one {'id', 'title', 'description'} dict per product.
# The description is a single text made of a brand sentence, the product's
# description and feature entries, and a closing sentence listing its categories.
non_text = re.compile(r"[^a-zA-Z0-9.!?]+")  # anything except letters, digits and . ! ?
max_words = 400  # products with a longer description are left out

main_data = []
for product in data:
    sentences = [f"It is a {product['brand']} product."] + product['description'] + product['feature']
    sentences = sentences + [f"Categories of product are {' '.join(product['category'])}"]

    # Collapse every run of disallowed characters into a single space.
    text = non_text.sub(r" ", ' '.join(sentences))

    if len(text.split()) > max_words:
        continue

    main_data.append({
        'id': product['asin'],
        'title': product['title'],
        'description': text,
    })

In [9]:
# Number of products whose cleaned description has at most 400 words.
len(main_data)

61117

In [10]:
# Inspect one prepared sample (id, title and cleaned description).
main_data[50000]

{'id': 'B006YW34BA',
 'title': 'EMPIRE LG DoublePlay Rubberized Hard Case Cover (Hot Pink) [EMPIRE Packaging]',
 'description': 'It is a EMPIRE product. EMPIRE s two piece snap on cases are made of highly durable plastic coated with a rubberized texture to give your device maximum protection and added grip without making it bulkier in size. Precisely engineered to fit your device the snap on case attaches easily and securely to the front and back side of it and also offers direct access to all of its features. Other colors are also available to choose from. Rubberized coated texture provides added protection against drops bumps and scratches. Engineered to fit device perfectly with cut outs to allow for full device functionality. Snap on system ensures case stays on device. Openings for full phone functionality. Constructed for strong durability and scratch resistance. EMPIRE is a registered trademark with the USPTO. Categories of product are Cell Phones Accessories Cases Holsters Slee

In [11]:
# Persist the prepared samples as a pretty-printed JSON array (4-space indent).
output_path = 'Cell_Phones_and_Accessories_prep.json'
with open(output_path, 'w') as out_file:
    json.dump(main_data, out_file, indent=4)