In [1]:
import requests
import pandas as pd

# 1. GET DATA THROUGH API

In [2]:
def getResponse(URL, timeout=30):
    """Send a GET request with browser-like headers and return the JSON "result" entry.

    Args:
        URL: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may wait indefinitely on a stalled connection.

    Returns:
        The value stored under the "result" key of the decoded JSON body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the decoded JSON body has no "result" key.
    """
    # send request
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        # NOTE(review): "TEST" looks like a placeholder cookie value -- confirm it is intended.
        "Cookie": "TEST",
    }
    r = requests.get(url=URL, headers=HEADERS, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
    r.raise_for_status()
    # get result
    return r.json()["result"]

## Category IDs collected from the official Uniqlo website

In [3]:
# Category label -> categoryId for the women's section.
# NOTE(review): the spellings "outwear" and "innner_wear" are dictionary keys and
# are kept exactly as they were; rename only after checking every consumer.
category_dict_women = dict(
    outwear=23327,
    tops=23332,
    bottoms=23338,
    dresses_jumpsuits=23346,
    innner_wear=23347,
    loungewear_home=23356,
    accessories_shoes=23359,
    sport_utility_wear=23363,
    maternity=23364,
)

# Category label -> categoryId for the men's section (same key spellings as above).
category_dict_men = dict(
    outwear=23377,
    tops=23383,
    bottoms=23391,
    innner_wear=23398,
    loungewear_home=23404,
    accessories_shoes=23407,
    sport_utility_wear=23412,
)

## Get the list of product IDs for each category

In [4]:
def getProductIDs(categroyId, offset, limit=36):
    """Fetch one page of product IDs for a category.

    Args:
        categroyId: Category identifier, as ``str`` or ``int``. It is used for
            both the ``path`` and ``categoryId`` query parameters.
        offset: Value of the ``offset`` query parameter, as ``str`` or ``int``.
        limit: Value of the ``limit`` query parameter (page size). Defaults to
            36, the value that used to be hard-coded in the URL.

    Returns:
        A ``(productIds, hasNextPage)`` tuple: the list of ``productId`` values
        on this page, and whether ``offset`` plus the reported page ``count``
        is still below the reported ``total``.
    """
    # get response
    # ``format`` stringifies its arguments, so ints and strs are both accepted.
    URL = (
        "https://www.uniqlo.com/us/api/commerce/v5/en/products"
        "?path=%2C%2C{category}&categoryId={category}"
        "&offset={offset}&limit={limit}&httpFailure=true"
    ).format(category=categroyId, offset=offset, limit=limit)
    res = getResponse(URL)
    # check whether there is a next page
    pagination = res["pagination"]
    hasNextPage = int(offset) + pagination["count"] < pagination["total"]
    # get productIds
    productIds = [item["productId"] for item in res["items"]]
    return productIds, hasNextPage

def getProductIDsByCategroy(category_dict):
    """Collect the product IDs of every category listed in ``category_dict``.

    Only the values of ``category_dict`` (the category IDs) are used; the keys
    are ignored. Pages are requested through ``getProductIDs`` until it reports
    that no further page exists.

    Returns:
        A dict mapping each category ID to the set of product IDs found for it.
    """
    ids_by_category = {}
    for category_id in category_dict.values():
        collected = set()
        page_start = 0
        # Always request the first page, then continue while more are reported.
        while True:
            page_ids, more_pages = getProductIDs(str(category_id), str(page_start))
            collected.update(page_ids)
            page_start += 36  # advance by one page of 36 products
            if not more_pages:
                break
        ids_by_category[category_id] = collected
    return ids_by_category

## Get reviews for each product

In [5]:
def getProductReview(productID="E439807-000", offset="0", limit=10):
    """Fetch one page of reviews, plus the rating summary, for a product.

    Args:
        productID: Product identifier inserted into the URL path.
        offset: Value of the ``offset`` query parameter, as ``str`` or ``int``.
        limit: Value of the ``limit`` query parameter (page size). Defaults to
            10, the value that used to be hard-coded in the URL.

    Returns:
        A ``(reviews, rating, hasNextPage)`` tuple: the "reviews" and "rating"
        entries of the response, and whether ``offset`` plus the reported page
        ``count`` is still below the reported ``total``.
    """
    # get response
    # ``format`` stringifies its arguments, so ints and strs are both accepted.
    URL = (
        "https://www.uniqlo.com/us/api/commerce/v5/en/products/{product}/reviews"
        "?offset={offset}&limit={limit}&sort=submission_time&httpFailure=true"
    ).format(product=productID, offset=offset, limit=limit)
    res = getResponse(URL)
    # check whether there is a next page
    pagination = res["pagination"]
    hasNextPage = int(offset) + pagination["count"] < pagination["total"]
    # get comments and ratings
    return res["reviews"], res["rating"], hasNextPage

def getReviewsByCategory(ctgToPd):
    """Download all reviews for every product of every category.

    Args:
        ctgToPd: Mapping of category ID to an iterable of product IDs.

    Returns:
        A nested dict ``{categoryId: {productId: {"reviews": [...], "rating": ...}}}``.
        "reviews" concatenates the reviews of every fetched page; "rating" is
        the rating returned with the last fetched page.
    """
    collected = {}
    for category_id, product_ids in ctgToPd.items():
        per_product = {}
        collected[category_id] = per_product
        for product_id in product_ids:
            all_reviews = []
            latest_rating = {}
            next_offset = 0
            more_pages = True
            # The first page is always requested; later pages only while reported.
            while more_pages:
                page_reviews, latest_rating, more_pages = getProductReview(
                    product_id, str(next_offset)
                )
                all_reviews.extend(page_reviews)
                next_offset += 10  # advance by one page of 10 reviews
            per_product[product_id] = {"reviews": all_reviews, "rating": latest_rating}
    return collected

# 2. WRITE DATA TO CSV

In [6]:
import csv 

def writeDataToCSV(reviews_dict, filename):
    """Flatten the nested review dictionary into a CSV file, one row per review.

    Each row combines review-level fields with the product-level rating
    summary, which is repeated on every review of the same product. Products
    without reviews produce no rows.

    Args:
        reviews_dict: Nested mapping
            ``{categoryId: {productId: {"reviews": [...], "rating": {...}}}}``.
        filename: Path of the CSV file to write; an existing file is overwritten.

    Raises:
        KeyError: If a product entry, rating or review lacks an expected key.
    """
    # Review-level keys in output order, on either side of the rating columns.
    # The review keys double as the CSV column names.
    review_keys_before = ['rate', 'fit', 'comment', 'title']
    review_keys_after = ['heightRange', 'weightRange', 'shoeSize', 'purchasedSize', 'helpfulCount',
                         'ageRange', 'gender', 'location', 'name', 'createDate',
                         'isIncentivized', 'userCanLike', 'userCanReport']
    rate_count_keys = ['one', 'two', 'three', 'four', 'five']

    # All rows are built before the file is opened, so a malformed entry
    # raises without truncating an existing output file.
    rows = []
    for categroyId, pdToReviews in reviews_dict.items():
        for productId, obj in pdToReviews.items():
            reviews, rating = obj["reviews"], obj["rating"]
            rate_counts = rating["rateCount"]
            rating_columns = [rating["average"], rating["count"], rating["fit"]]
            rating_columns += [rate_counts[key] for key in rate_count_keys]
            for review in reviews:
                rows.append(
                    [review["reviewId"], productId, categroyId]
                    + [review[key] for key in review_keys_before]
                    + rating_columns
                    + [review[key] for key in review_keys_after]
                )

    # field names (the 'categroyId' header spelling is kept so the output
    # columns stay the same as before)
    fields = (
        ['reviewId', 'productId', 'categroyId']
        + review_keys_before
        + ['rating_average', 'rating_count', 'rating_fit',
           'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
        + review_keys_after
    )

    # writing to csv file
    # newline='' lets the csv module control line endings (otherwise blank lines
    # appear between rows on Windows); an explicit UTF-8 encoding keeps
    # non-ASCII review text from failing under a non-UTF-8 locale.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)

# 3. Create a Workflow

In [7]:
def ETL(dic, filename):
    """Run the whole pipeline for one category dictionary.

    The three stages run in order: collect product IDs per category, download
    the reviews of those products, then write everything to ``filename``.
    """
    # extract: category IDs -> product IDs
    product_ids_by_category = getProductIDsByCategroy(dic)
    # extract: product IDs -> reviews and rating summaries
    reviews_by_category = getReviewsByCategory(product_ids_by_category)
    # load: flatten and persist as CSV
    writeDataToCSV(reviews_by_category, filename)

# Run the pipeline once per category dictionary; each run writes its own CSV file.
ETL(dic=category_dict_women, filename="uniqlo_reviews_women.csv")
ETL(dic=category_dict_men, filename="uniqlo_reviews_men.csv")