Let's download our first data file via S3. You're going to save it to your drive. Save it in a folder for this project.

**Important: you need to 'escape' any spaces in your save path by putting a \ character in front of each one (for example, `My\ Folder`).**

In [None]:
# Download the gzipped product metadata; -P names the folder the file is saved into.
!wget http://128.138.93.164/meta_Clothing_Shoes_and_Jewelry.json.gz -P /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling

--2021-12-30 16:40:05--  http://128.138.93.164/meta_Clothing_Shoes_and_Jewelry.json.gz
Connecting to 128.138.93.164:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279748879 (267M) [application/octet-stream]
Saving to: ‘/content/drive/MyDrive/1Advanced Advertising Analytics/1colab_notebooks_spring2021/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz’


2021-12-30 16:40:11 (42.7 MB/s) - ‘/content/drive/MyDrive/1Advanced Advertising Analytics/1colab_notebooks_spring2021/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz’ saved [279748879/279748879]



In [None]:
# Decompress the archive in place (gzip -d replaces the .gz with the plain .json).
# The path must be the same folder wget's -P saved into, which is also the folder
# working_directory points at, so the extracted file is found when it is opened.
!gzip -d /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz

In [None]:
import pickle
import json
from time import sleep

##project folder that holds the downloaded (and decompressed) data
working_directory = '/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling'
##full path of the metadata file, built from the project folder
working_file = '%s/meta_Clothing_Shoes_and_Jewelry.json' % (working_directory,)
##open the file in text read mode; loadedjson is the handle the parsing loop reads from
loadedjson = open(working_file, mode='r')

In [None]:
#The data used in this script comes from: http://jmcauley.ucsd.edu/data/amazon/links.html
#The data here is the 'per category' data for Clothing, Shoes and Jewelry
#use the above url to better understand the data, where it came from, and some
#tips on how to use it!

#getting reviews is going to be a two step process:
#1) go through the amazon product catalog for "Clothing, Shoes and Jewelry"
#and extract out matching products by their ASIN
#2) go through the review data and parse out the matching reviews by ASIN

#1) - Extracting ASINs by brand
#First, let's iterate through the data and store it as a python dictionary

#let's set a counter to see how many lines of the catalog we have processed
count = 0
#we've always got to initiate dictionaries before we can use them
allproducts = {}

#Open the file inside this cell so the cell can be re-run on its own: a file
#handle can only be iterated once, so re-using an already-consumed handle would
#silently give an empty dictionary. The 'with' block also closes the file when
#the loop finishes. Rebinding the name loadedjson drops this notebook's
#reference to the handle that was opened earlier.
with open(working_file, 'r') as loadedjson:
    #each line of data here is a product and its metadata
    for aline in loadedjson:
        #creating a counter to know our progress in processing the entire catalog
        count += 1
        if count % 100000 == 0:
            #we're only going to print our count every 100k, this way we don't
            #spam our output console
            print(count)

        #a blank line (e.g. at the very end of the file) holds no product and
        #would make eval raise a SyntaxError, so skip it
        if not aline.strip():
            continue

        #interestingly enough, this data isn't true JSON, instead it's python
        #dictionaries that have essentially been printed as text. It's odd, but
        #if we read the documentation, all we need to do to load a dictionary
        #is use the eval function.
        #https://www.programiz.com/python-programming/methods/built-in/eval
        #eval takes whatever string is passed to it, interprets it as python
        #code and runs it.
        #SECURITY NOTE: because eval runs the text as code, only use it on a
        #file you trust. ast.literal_eval is the safer standard-library
        #alternative for plain dictionary literals.
        aproduct = eval(aline)

        #making a dictionary entry with the ASIN of the product as the key
        #and its metadata as nested dictionaries
        allproducts[aproduct['asin']] = aproduct

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [None]:
len(allproducts)

1503384

In [None]:
#Next we need to explore the product data to see what categories are common in the
#data. As you'll learn, product categories are wishy-washy in that they can be
#product categories (e.g., baby, house and home), or they can be brands!
#We're already dealing with a subset of the product categories, Clothing, Shoes
#and Jewelry. We still need to find a list of product ids for our specific
#brand. To do this, we're going to use the 'categories' metadata field to find
#your brand

##Let's create a dictionary of all the product subcategories
#and by doing so, also come up with a list of brands and how often each one
#shows up in the amazon product catalog.
#Note: every occurrence is counted, so a category that appears in several of a
#product's category lists is counted once per list, not once per product.

allcategories = {}
count = 0
#use the real size of the catalog instead of a hard-coded number, so the
#progress fraction stays correct if a different data file is loaded
total_products = len(allproducts)

#iterate over the product dictionaries directly (the ASIN keys aren't needed here)
for aproduct in allproducts.values():
    #creating a counter to know our progress in processing the entire catalog
    count += 1
    if count % 100000 == 0:
        #print the fraction of the catalog processed so far. When the counter
        #reaches one we're done!
        print(count/total_products)
    #products without a 'categories' field contribute nothing
    for categories in aproduct.get('categories', []):
        #each entry of 'categories' is itself a list of category names
        for acategory in categories:
            #start at 0 the first time we see a category, then add one
            allcategories[acategory] = allcategories.get(acategory, 0) + 1




0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518


In [None]:
#Build (count, category) pairs and sort them from most to least frequent.
#Putting the count first makes the tuples sort by count; ties fall back to the
#category name (also in reverse order).
sortedlist = sorted(
    ((allcategories[acategory], acategory) for acategory in allcategories),
    reverse=True,
)

#Show the 50 most common categories. A slice is used instead of indexing
#0..49 so this still works when there are fewer than 50 categories.
for item in sortedlist[:50]:
    print(item)

(3429257, 'Clothing, Shoes & Jewelry')
(1086181, 'Women')
(617092, 'Clothing')
(541681, 'Men')
(537761, 'Novelty, Costumes & More')
(432653, 'Shoes')
(339900, 'Novelty')
(268065, 'Shoes & Accessories: International Shipping Available')
(255454, 'Jewelry')
(174962, 'Accessories')
(97095, 'Girls')
(93596, 'Tops & Tees')
(87688, 'Dresses')
(84549, 'T-Shirts')
(82063, 'Boots')
(80302, 'Shirts')
(79897, 'Sandals')
(79545, 'Watches')
(77684, 'Boys')
(73507, 'Jewelry: International Shipping Available')
(72372, 'Athletic')
(71414, 'Wrist Watches')
(70335, 'Sports & Outdoors')
(59763, 'Petite')
(58350, 'Fashion')
(53826, 'Costumes & Accessories')
(53021, 'Earrings')
(51728, 'Baby')
(50943, 'Comfort Shoes')
(50662, 'Casual')
(50357, 'Boot Shop')
(50124, 'C')
(49599, 'Active')
(49491, 'Band & Music Fan')
(46004, 'New Arrivals')
(43722, 'Necklaces & Pendants')
(43410, 'Intimates')
(43100, 'S')
(41709, 'Lingerie, Sleep & Lounge')
(41542, 'Handbags & Wallets')
(41400, 'Rings')
(40832, "Women's Luxur

In [None]:
#Look up how many times the 'Nike' category was counted
#(a category that never appeared raises a KeyError)
allcategories['Nike']

8327

In [None]:


#Now, go ahead and use the Variable Explorer in Spyder to locate a brand
#that has a lot of product entries! Alternatively, type allcategories['Brand name']
#to get a count for a specific brand. For instance:
#>>allcategories['Nike']
#>> 8327
#>>allcategories['adidas']
#>> 8645

#I'd recommend at least 1.5k products, but you're welcome to try smaller counts
#all I care about is whether you have at least 2k reviews when it's all said and done


##Now we need to go through our product dictionary and extract out the
##matching ASINs for Nike

##First, create a set where we will store our ASINs
##We choose a set here because we don't want duplicates
allnikeasins = set()
count = 0
#use the real size of the catalog instead of a hard-coded number, so the
#progress fraction stays correct if a different data file is loaded
total_products = len(allproducts)
#the brand to look for, in lowercase because categories are lowercased before
#matching. This is a substring match, so any category that merely contains
#this text also counts as a match.
target_brand = 'nike'

for theproduct in allproducts.values():
    count += 1
    if count % 100000 == 0:
        print(count/total_products)

    #let's iterate for each category of a product; again, any given product
    #can be assigned multiple product categories. Products without a
    #'categories' field are skipped instead of raising a KeyError.
    for categories in theproduct.get('categories', []):
        #each category is actually encoded as a list (even though they should
        #just be strings), so we need to look inside it one more time.
        #lowercasing the category string in case capitalization might get
        #in the way of a match
        if any(target_brand in acategory.lower() for acategory in categories):
            #let's go ahead and store it to our set of Nike ASINs
            allnikeasins.add(theproduct['asin'])
            #one match is enough for this product, no need to keep checking
            break

print(len(allnikeasins))



0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518
8327


In [None]:
#This is a good break point. Let's write the ASINs out to a file so we can
#use them in the next segment to extract product reviews.

#'with' closes the file for us, even if the write fails part-way.
#A set has no fixed order, so we sort the ASINs to make the file contents
#the same on every run. The file is one line of comma-separated ASINs.
with open('%s/allasins.txt' % working_directory, 'w') as outputfile:
    outputfile.write(','.join(sorted(allnikeasins)))