## Preprocess the Amazon Products Dataset to Include Only Book-Related Data

The processed dataset is written to a CSV data file.

In [21]:
!pip install porter2stemmer
!pip install nltk



In [22]:
import string
import re
import nltk
import csv
import pandas as pd

nltk.download('stopwords')

from nltk.corpus import stopwords

# from porter2stemmer import Porter2Stemmer
# stemmer = Porter2Stemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The dataset is taken from Stanford's SNAP collection: http://snap.stanford.edu/data/amazon-meta.html. Please refer to that page for a description of the original data format.

In [3]:
# Relative path to the raw product-metadata text dump parsed below.
dataset = "../data/amazon/products/products-review.txt"

In [4]:
%%time
# Open the raw dump as UTF-8 text; errors='ignore' drops undecodable bytes
# instead of raising. preprocess() reads from this module-level handle.
fhr = open(dataset, 'r', encoding='utf-8', errors='ignore')

def preprocess(lines=None, stop_words=None):
    """Parse the product-metadata text dump into a dict of records keyed by ASIN.

    The input is read line by line. A blank line ends the current record; the
    record is stored only when an ASIN was seen for it. A record still pending
    when the input ends (no trailing blank line) is stored as well.

    Parameters
    ----------
    lines : iterable of str, optional
        Source of raw text lines. When omitted, the module-level ``fhr`` file
        handle is read and then closed once parsing finishes.
    stop_words : iterable of str, optional
        Words dropped from the category text. When omitted, NLTK's English
        stop-word list is used.

    Returns
    -------
    dict
        Maps ASIN -> dict with keys ``ASIN``, ``Id``, ``Title``,
        ``Categories``, ``Group``, ``Copurchased`` (strings), ``SalesRank``
        and ``TotalReviews`` (int), ``AvgRating`` (float), plus the
        placeholders ``DegreeCentrality`` (0) and ``ClusteringCoeff`` (0.0).

    Raises
    ------
    ValueError
        If a sales rank, review count or average rating is not numeric.
    """
    use_global_handle = lines is None
    source = fhr if use_global_handle else lines
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the stop-word set once instead of rebuilding it for every product.
    stop_words = frozenset(stop_words)
    # Digits and punctuation only separate category words; string.punctuation
    # already covers every character the earlier hand-written class listed.
    separators = re.compile('[%s]' % re.escape(string.digits + string.punctuation))

    # Field defaults, in the key order used for every stored record.
    defaults = {
        'ASIN': "", 'Id': "", 'Title': "", 'Categories': "", 'Group': "",
        'Copurchased': "", 'SalesRank': 0, 'TotalReviews': 0, 'AvgRating': 0.0,
        'DegreeCentrality': 0, 'ClusteringCoeff': 0.0,
    }
    amazonProducts = {}

    def commit(record):
        # Convert the numeric fields first so a half-built record is never stored.
        if record['ASIN'] == "":
            return
        record['SalesRank'] = int(record['SalesRank'])
        record['TotalReviews'] = int(record['TotalReviews'])
        record['AvgRating'] = float(record['AvgRating'])
        amazonProducts[record['ASIN']] = record

    record = dict(defaults)
    rows = iter(source)
    try:
        for line in rows:
            line = line.strip()

            if line.startswith("Id"):
                record['Id'] = line[3:].strip()
            elif line.startswith("ASIN"):
                record['ASIN'] = line[5:].strip()
            elif line.startswith("title"):
                record['Title'] = line[6:].strip()
            elif line.startswith("group"):
                record['Group'] = line[6:].strip()
            elif line.startswith("salesrank"):
                record['SalesRank'] = line[10:].strip()
            elif line.startswith("similar"):
                # "similar: <count> <asin> <asin> ..." -> keep only the ASINs.
                record['Copurchased'] = ' '.join(line.split()[2:])
            elif line.startswith("categories"):
                # "categories: <count>" is followed by <count> category lines;
                # consume them here so the outer loop never sees them.
                count = int(line.split()[1].strip())
                raw = ' '.join(next(rows, "").lower() for _ in range(count))
                words = separators.sub(' ', raw).split()
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # which makes the output reproducible between runs.
                record['Categories'] = ' '.join(
                    dict.fromkeys(w for w in words if w not in stop_words))
            elif line.startswith("reviews"):
                # "reviews: total: <n>  downloaded: <m>  avg rating: <r>"
                tokens = line.split()
                record['TotalReviews'] = tokens[2].strip()
                record['AvgRating'] = tokens[7].strip()
            elif line == "":
                commit(record)
                record = dict(defaults)

        # Flush a final record that was not followed by a blank line.
        commit(record)
    finally:
        if use_global_handle:
            source.close()

    return amazonProducts

CPU times: user 2min 18s, sys: 11 s, total: 2min 29s
Wall time: 2min 29s


In [None]:
# Parse the dump into a dict of product records keyed by ASIN.
# filter_books() reads this module-level variable.
amazonProducts = preprocess()

In [17]:
def filter_books(products=None):
    """Return the records whose ``Group`` field equals ``'Book'``.

    Parameters
    ----------
    products : dict, optional
        Mapping of ASIN -> metadata dict. When omitted, the module-level
        ``amazonProducts`` is used.

    Returns
    -------
    dict
        A new dict holding the matching ASIN -> metadata pairs. The metadata
        dicts themselves are shared with ``products``, not copied.
    """
    if products is None:
        products = amazonProducts
    return {asin: metadata
            for asin, metadata in products.items()
            if metadata['Group'] == 'Book'}

def remove_startwith_b(amazonBooks):
    """Delete, in place, every entry whose ASIN begins with an uppercase 'B'.

    The same dict object that was passed in is returned.
    """
    # Collect the keys first so the dict is not resized while being iterated.
    doomed = [key for key in amazonBooks if key.startswith('B')]
    for key in doomed:
        amazonBooks.pop(key)
    return amazonBooks

def remove_non_books_copurchase(amazonBooks):
    """Trim each record's ``Copurchased`` list to ASINs that are keys of ``amazonBooks``.

    ``Copurchased`` is a whitespace-separated string; it is rewritten in place
    as a single-space-separated string and the same dict is returned.
    """
    for record in amazonBooks.values():
        kept = filter(amazonBooks.__contains__, record['Copurchased'].split())
        record['Copurchased'] = ' '.join(kept)
    return amazonBooks


In [19]:
# Keep only 'Book' records, then drop those whose ASIN starts with 'B'.
amazon_books = remove_startwith_b(filter_books())
# Restrict every co-purchase list to ASINs that are still in the book set.
amazon_books_refined = remove_non_books_copurchase(amazon_books)

In [20]:
# Column order of the output file; each name is also the metadata key it is read from.
CSV_COLUMNS = ["Id", "ASIN", "Title", "Categories", "Group", "Copurchased",
               "SalesRank", "TotalReviews", "AvgRating"]


def write_header_to_csv(writer=None):
    """Write the column-name row.

    ``writer`` is a ``csv.writer``-like object; when omitted, the
    module-level ``file`` writer is used.
    """
    if writer is None:
        writer = file
    writer.writerow(CSV_COLUMNS)


def write_to_csv(content, writer=None):
    """Write one row per record in ``content`` (a dict of ASIN -> metadata dict).

    ``writer`` is a ``csv.writer``-like object; when omitted, the
    module-level ``file`` writer is used.
    """
    if writer is None:
        writer = file
    writer.writerows(
        [metadata[column] for column in CSV_COLUMNS]
        for metadata in content.values()
    )


# newline="" is what the csv module requires to avoid extra blank rows on some
# platforms; the with-block guarantees the data is flushed and the file closed.
# After this block `file` wraps a closed handle, so later calls to the helpers
# must pass their own `writer`.
with open("amazon-books-v2.0.csv", "w", newline="", encoding="utf-8") as csv_handle:
    file = csv.writer(csv_handle)
    write_header_to_csv()
    write_to_csv(amazon_books_refined)