## Parser for Amazon Review Dataset (for LibRec)

#### Read in the review & metadata files and convert each line into a strict JSON object

In [1]:
import json

def _write_strict_json(src_path, dst_path):
    """Rewrite a file of Python-literal dicts (one per line) as strict JSON lines.

    Each line of `src_path` is evaluated to a Python object and written to
    `dst_path` as one JSON document per line. Both files are closed when the
    function returns, even if a line fails to parse.
    """
    with open(src_path) as src, open(dst_path, 'w') as dst:
        for line in src:
            # NOTE(review): eval() executes whatever the line contains, so only
            # run this on dataset dumps you trust. ast.literal_eval is a safer
            # parser for plain dict literals -- confirm it handles the whole
            # file before switching.
            dst.write(json.dumps(eval(line)) + '\n')


# Reviews -> output.strict, item metadata -> metas
_write_strict_json("Home_and_Kitchen_5.json", "output.strict")
_write_strict_json("meta_Home_and_Kitchen.json", "metas")

In [2]:
# Re-open the two strict-JSON files for reading (read mode is the default).
fi = open('output.strict')
me = open('metas')

In [3]:
# Load every line into memory (one JSON document per line), then release the
# file handles -- nothing below reads from `fi` or `me` again.
try:
    all_reviews = fi.readlines()
    all_metas = me.readlines()
finally:
    fi.close()
    me.close()

# Number of review records and metadata records.
print(len(all_reviews))
print(len(all_metas))

551682
436988


#### Sample data

In [4]:
# Peek at the first review record to see which fields a review carries.
json.loads(all_reviews[0])

{u'asin': u'0615391206',
 u'helpful': [0, 0],
 u'overall': 5.0,
 u'reviewText': u'My daughter wanted this book and the price on Amazon was the best.  She has already tried one recipe a day after receiving the book.  She seems happy with it.',
 u'reviewTime': u'10 19, 2013',
 u'reviewerID': u'APYOBQE6M18AA',
 u'reviewerName': u'Martin Schwartz',
 u'summary': u'Best Price',
 u'unixReviewTime': 1382140800}

In [5]:
# Peek at the first metadata record. Not every record carries every field;
# this one, for example, has no 'price' or 'related' entry.
json.loads(all_metas[0])

{u'asin': u'0076144011',
 u'categories': [[u'Home & Kitchen']],
 u'imUrl': u'http://g-ecx.images-amazon.com/images/G/01/x-site/icons/no-img-sm._CB192198896_.gif',
 u'salesRank': {u'Books': 6285595},
 u'title': u'Ninjas, Piranhas, and Galileo'}

#### Metadata is a little complex since it contains missing values

##### price

In [6]:
# Gather the price of every metadata record that has one; records without a
# 'price' field are skipped.
all_prices = []
for raw_meta in all_metas:
    record = json.loads(raw_meta)
    if 'price' in record:
        all_prices.append(record['price'])

In [7]:
# Report the observed price range and the width of each of four equal bins.
lowest_price = min(all_prices)
highest_price = max(all_prices)
print('minimum ' + str(lowest_price))
print('maximum ' + str(highest_price))
print('bin size ' + str(round((highest_price - lowest_price) / 4)))

minimum 0.01
maximum 999.99
bin size 250.0


Bin into four bins [0.01, 250] [250.01, 500] [500.01, 750] [750.01, 1000]

##### related (also bought, also viewed, bought together, buy after viewing)

An item is given the `bundle_product` feature when its `related` entry has a non-empty `bought_together` list (len(bought_together) > 0).

##### Categories

If an item has categories other than 'Home & Kitchen', each of those categories becomes one of its features.

##### Sales Rank

In [8]:
# SR maps each sales-rank category name to the list of ranks observed in it
# across all metadata records.
SR = {}
for meta in all_metas:
    meta = json.loads(meta)
    sales_rank = meta.get('salesRank')
    if not isinstance(sales_rank, dict):
        # Record has no usable sales-rank information; skip it explicitly
        # instead of hiding every possible error behind a bare except.
        continue
    for key, rank in sales_rank.items():
        SR.setdefault(key, []).append(rank)

In [9]:
# Rank 1 is the top seller, so the numerically largest rank is the "lowest"
# rank and the numerically smallest one is the "highest".
for category in SR:
    ranks = SR[category]
    worst_rank = max(ranks)
    best_rank = min(ranks)
    print('For category {}, lowest rank is {}, highest rank is {}'.format(category, worst_rank, best_rank))

For category Arts, Crafts & Sewing, lowest rank is 755215, highest rank is 25
For category Jewelry, lowest rank is 1401180, highest rank is 294
For category Cell Phones & Accessories, lowest rank is 1565200, highest rank is 42627
For category Electronics, lowest rank is 845662, highest rank is 486
For category Health & Personal Care, lowest rank is 1406388, highest rank is 228
For category Home Improvement, lowest rank is 1638096, highest rank is 16
For category Beauty, lowest rank is 1134605, highest rank is 779
For category Automotive, lowest rank is 1106761, highest rank is 40
For category Video Games, lowest rank is 125061, highest rank is 6233
For category Computers & Accessories, lowest rank is 26709, highest rank is 1441
For category Grocery & Gourmet Food, lowest rank is 168482, highest rank is 572
For category Baby, lowest rank is 348512, highest rank is 140
For category Sports &amp; Outdoors, lowest rank is 2970741, highest rank is 70
For category Patio, Lawn & Garden, lowest

features from sales rank: ['top5sales', 'top100sales', 'top1000sales', 'rankedSales']

### generate features

In [10]:
# Feature name -> feature ID. The four price bins and the bundle flag take the
# first five IDs, in this order.
base_feature_names = ['low_price', 'med_price', 'moderate_price', 'high_price', 'bundle_product']
features_dict = {}
for feature_id, feature_name in enumerate(base_feature_names):
    features_dict[feature_name] = feature_id

# Next free feature ID.
print(max(features_dict.values()) + 1)

5


In [11]:
# Register one feature per (rank bucket, sales-rank category) pair, named
# '<bucket> in <category>', continuing from the next free feature ID.
counter = max(features_dict.values())+1
rankcats = ['top5sales', 'top100sales', 'top1000sales', 'rankedSales']
for category in SR:
    for bucket in rankcats:
        to_add = bucket+' in '+category
        # Only assign an ID the first time a name is seen, so re-running this
        # cell does not renumber existing features and leave gaps in the IDs.
        if to_add not in features_dict:
            features_dict[to_add] = counter
            counter += 1

# Next free feature ID.
print(max(features_dict.values())+1)

117


In [12]:
counter = max(features_dict.values())+1
for meta in all_metas:
    meta = json.loads(meta)
    try:
        categories = meta['categories']
        for i in categories[0]:
            if i != 'Home & Kitchen' and i not in features_dict:
                features_dict[i] = counter
                counter += 1
    except:
        continue

In [13]:
# Total number of features: the 117 price/bundle/sales-rank features registered
# above plus the category features other than 'Home & Kitchen'
# (1088 in the recorded run).
len(features_dict)

1205

In [14]:
# Invert features_dict (name -> ID) into all_features (ID -> name).
all_features = {}
for feature_name, feature_id in features_dict.items():
    all_features[feature_id] = feature_name

# Write the lookup table, one 'ID: name' row per feature in ascending ID order.
# The with-block closes the file even if a write fails.
with open('features.txt', 'w') as featureList:
    featureList.write('FeatureID: FeatureName\n')
    for feature_id in sorted(all_features):
        featureList.write(str(feature_id)+': '+all_features[feature_id]+'\n')

##### simplify the userID and itemID

In [15]:
# Lookup tables filled in below: original reviewer ID -> integer user ID and
# original asin -> integer item ID.
all_users, all_items = dict(), dict()

In [16]:
# Walk the reviews once and hand out consecutive integer IDs, in order of
# first appearance, to every reviewer and every item.
user_counter = 0
item_counter = 0
for raw_review in all_reviews:
    parsed = json.loads(raw_review)
    reviewer, asin = parsed['reviewerID'], parsed['asin']

    if reviewer not in all_users:
        all_users[reviewer] = user_counter
        user_counter += 1

    if asin not in all_items:
        all_items[asin] = item_counter
        item_counter += 1

In [17]:
# Invert all_users (original reviewer ID -> integer ID) into newUserDict
# (integer ID -> original reviewer ID).
newUserDict = {}
for reviewer, user_id in all_users.items():
    newUserDict[user_id] = reviewer

# Write the lookup table, one 'integer ID: original reviewer ID' row per user
# in ascending ID order. The with-block closes the file even if a write fails.
with open('users.txt', 'w') as userLookup:
    userLookup.write('ReviewerID: ReviewerName\n')
    for user_id in sorted(newUserDict):
        userLookup.write(str(user_id)+': '+newUserDict[user_id]+'\n')

In [18]:
# Invert all_items (original asin -> integer ID) into newItemDict
# (integer ID -> original asin).
newItemDict = {}
for asin, item_id in all_items.items():
    newItemDict[item_id] = asin

# Write the lookup table, one 'integer ID: original asin' row per item in
# ascending ID order. The with-block closes the file even if a write fails.
with open('items.txt', 'w') as itemLookup:
    itemLookup.write('itemID: itemName\n')
    for item_id in sorted(newItemDict):
        itemLookup.write(str(item_id)+': '+newItemDict[item_id]+'\n')

- all_users: key is the original reviewerID, value is the new integer user ID
- all_items: key is the original asin, value is the new integer item ID
- features_dict: key is feature name, value is feature ID

## Generate reviews.csv (user, item, rating)

In [19]:
# review_dict[userID][itemID] = rating. If the same user reviewed the same
# item more than once, the first rating encountered is the one that is kept.
review_dict = {}
for raw_review in all_reviews:
    parsed = json.loads(raw_review)
    userID = all_users[parsed['reviewerID']]
    itemID = all_items[parsed['asin']]
    rating = int(parsed['overall'])
    # setdefault never overwrites, which preserves the first-rating-wins rule.
    review_dict.setdefault(userID, {}).setdefault(itemID, rating)

In [20]:
# Write one 'user,item,rating' row per stored rating. The with-block closes
# the file even if a write fails.
with open('reviews.csv', 'w') as reviews:
    for user_id, ratings_by_item in review_dict.items():
        for item_id, rating in ratings_by_item.items():
            reviews.write(str(user_id)+','+str(item_id)+','+str(rating)+'\n')

## Generate itemFeatures.csv (item, featureID(s) separated by comma)

find featureID in features_dict
- price: low_price (<250), med_price (<500), moderate_price (<750), high_price (<1000)
- related: len(bought_together) > 0, bundle_product = 1
- categories: anything other than 'Home & Kitchen'
- salesRank: one feature per sales-rank category, named '<rank bucket> in <category name>', where the rank bucket is 'top5sales', 'top100sales', 'top1000sales' or 'rankedSales'

In [21]:
def _price_bucket(price):
    """Return the price feature name for `price`, or None if it lies outside (0, 1000]."""
    if 0.0 < price <= 250.0:
        return 'low_price'
    if 250.0 < price <= 500.0:
        return 'med_price'
    if 500.0 < price <= 750.0:
        return 'moderate_price'
    if 750.0 < price <= 1000.0:
        return 'high_price'
    return None


def _rank_bucket(ranking):
    """Return the sales-rank bucket name for `ranking` (rank 1 is the top seller)."""
    ranking = int(ranking)
    if 0 < ranking <= 5:
        return 'top5sales'
    if 5 < ranking <= 100:
        return 'top100sales'
    if 100 < ranking <= 1000:
        return 'top1000sales'
    return 'rankedSales'


# itemFeatureDict maps each integer item ID to the list of feature IDs of that item.
itemFeatureDict = {}
for meta in all_metas:
    meta = json.loads(meta)

    # Only items that occur in the reviews have an integer ID; skip the rest
    # (and records without an 'asin').
    itemName = meta.get('asin')
    if itemName not in all_items:
        continue
    itemID = all_items[itemName]
    item_features = itemFeatureDict.setdefault(itemID, [])

    # Price feature. Records without a price, or with a price outside the
    # four bins, get no price feature (instead of reusing the bin left over
    # from a previous record).
    price = meta.get('price')
    if price is not None:
        pricefeature = _price_bucket(price)
        if pricefeature is not None:
            item_features.append(features_dict[pricefeature])

    # Related feature: only a non-empty bought_together list marks a bundle.
    related = meta.get('related') or {}
    if related.get('bought_together'):
        item_features.append(features_dict['bundle_product'])

    # Category features, taken from the first category path. 'Home & Kitchen'
    # is deliberately not a feature; it must be skipped rather than looked up,
    # otherwise the failed lookup would drop every other category of the item.
    category_paths = meta.get('categories') or [[]]
    for name in category_paths[0]:
        if name == 'Home & Kitchen':
            continue
        feature_id = features_dict.get(name)
        if feature_id is not None:
            item_features.append(feature_id)

    # Sales-rank features: one '<bucket> in <category>' feature per ranked category.
    sales_rank = meta.get('salesRank')
    if isinstance(sales_rank, dict):
        for category, ranking in sales_rank.items():
            sr_ID = features_dict.get(_rank_bucket(ranking)+' in '+category)
            if sr_ID is not None:
                item_features.append(sr_ID)

In [22]:
# Write one row per item that has at least one feature: the item ID followed
# by its feature IDs. The with-block closes the file even if a write fails.
with open('itemFeatures.csv', 'w') as features:
    for item_id, feature_ids in itemFeatureDict.items():
        if not feature_ids:
            # Items without any feature are left out of the file.
            continue
        # NOTE(review): every value, including the last one, is followed by a
        # comma, so each row ends in ','. Kept as-is because the consumer of
        # this file is not visible here -- confirm whether it tolerates or
        # expects the trailing comma.
        row = str(item_id)+','+''.join(str(feature_id)+',' for feature_id in feature_ids)
        features.write(row+'\n')