In [None]:
import os
import sys
import json
import time
from typing import Tuple

# Project imports
sys.path.append(os.getcwd())
from src.py.utils.trash import move_to_trash, get_trash_path

In [None]:
path_output_root = "data/keepa/products/domains"
# domain_id: Amazon domain ID - Valid values: [ 1: com | 2: co.uk | 3: de | 4: fr | 5: co.jp | 6: ca | 8: it | 9: es | 10: in | 11: com.mx ]
domains_map = {
	"1": "com",
	"2": "co.uk",
	"3": "de",
	"4": "fr",
	"5": "co.jp",
	"6": "ca",
	"8": "it",
	"9": "es",
	"10": "in",
	"11": "com.mx",
}
# Keep only sub-directories named after a known domain id. A bare os.listdir() also
# returns stray entries (e.g. ".DS_Store", ".ipynb_checkpoints") which would raise a
# KeyError as soon as they are looked up in domains_map.
domain_ids = sorted(
	entry for entry in os.listdir(path_output_root)
	if entry in domains_map and os.path.isdir(os.path.join(path_output_root, entry))
)
# domain_id -> {"path": directory of that domain}; further keys are added to the inner dicts later
domains = {domain: {"path": os.path.join(path_output_root, domain)} for domain in domain_ids} # type: dict
path_parsed_products = "data/scraped/camel/parsed-products.json"

print(f"Domains: {json.dumps(domains, indent=2)}")

In [None]:
def get_all_json_files(path: str) -> dict:
	'''
		Recursively collects every file ending in ".json" below `path`.
		Returns a dict mapping the file name up to its first "." to the joined file path.
		If two files share that key, the one visited later by os.walk overwrites the earlier one.
	'''
	return {
		filename.split(".")[0]: os.path.join(current_dir, filename)
		for current_dir, _subdirs, filenames in os.walk(path)
		for filename in filenames
		if filename.endswith(".json")
	}

# Attach the json files of every domain directory to its entry and report the totals
for domain, domain_info in domains.items():
	domain_info["json_files"] = get_all_json_files(domain_info["path"])
	print(f"Domain {domain} got {len(domain_info['json_files'])} json files")

print("")
first_pair = list(domains['1']['json_files'].items())[0]
print(f"First json_files key value pair of domain 1: {first_pair}")

In [None]:
def get_failed_files(json_files: dict) -> dict:
	'''
		Checks every file in `json_files` (key -> file path) and returns the failed ones.
		A file counts as failed when its parsed JSON contains an "error" entry (the value is
		that entry) or when reading/parsing it raises (the value is "File parsing error: ...").
		Progress is printed on a single line that is overwritten for each file.
	'''
	total = len(json_files)
	failures = {}
	for position, (asin, filepath) in enumerate(json_files.items(), start=1):
		print(f"Checking file {position}/{total}: {asin}             ", end="\r")
		try:
			with open(filepath) as handle:
				payload = json.load(handle)
			if "error" in payload:
				failures[asin] = payload["error"]
		except Exception as exc:
			failures[asin] = f"File parsing error: {exc}"
	return failures

# Look for failed files in every domain and store them next to the domain's json files
for i, domain_id in enumerate(domains.keys()):
	domain_label = f"amazon.{domains_map[domain_id]}"
	print(f"Checking domain {i+1}/{len(domains)}: {domain_label}")
	failed = get_failed_files(domains[domain_id]["json_files"])
	domains[domain_id]["failed_files"] = failed
	print("")
	print(f"Domain {domain_label} got {len(failed)} failed files")
	print("")

In [None]:
# Remove failed files from the json_files dict for each domain:
# an ASIN that failed in any domain is dropped from every domain
failed_asins = set()
for domain in domains:
	failed_asins.update(domains[domain]["failed_files"])

for domain in domains:
	for asin in failed_asins:
		domains[domain]["json_files"].pop(asin, None)


# Print count of asins for each domain
for domain_id, domain in domains.items():
	remaining = len(domain["json_files"])
	print(f"Domain amazon.{domains_map[domain_id]} has {remaining} asins remaining")
print("")

# Verify that all domains have the same asins by comparing each one with the first domain
first_domain_id = list(domains.keys())[0]
og_set = set(domains[first_domain_id]["json_files"])

for domain_id, domain in domains.items():
	if set(domain["json_files"]) == og_set:
		print(f"Domain amazon.{domains_map[domain_id]} has the same asins as the first domain - OK")
	else:
		print(f"Domain {domains_map[domain_id]} has different asins than the first domain")

In [None]:
def parse_files(files: list) -> dict:
	'''
		Parses product JSON files and returns a dictionary with different objects for
		analyzing the categories of products quickly.

		Args:
			files: paths of JSON files; the product is read from data["products"][0] of each file.

		Returns:
			dict with the keys "category_map", "category_tree", "category_products",
			"category_products_leafs", "products_category", "product_types", "failed_files"
			and "products" (see the comments on the local variables below for their shapes).

		A file that raises while being read or processed is appended to "failed_files" as a
		(file, exception) tuple instead of aborting the run.
		NOTE: the lookups are filled incrementally, so a file that fails part-way can already
		be present in some of them.
	'''
	# category_id: int, product_id: str (ASIN)
	category_map = {} # category_id -> category_name (the id itself until a name is seen in a categoryTree)
	category_tree = {} # category_id -> { subcategory_id -> ... } ; tree of all categories
	category_products = {} # category_id -> { product_id -> shrunk product object } ; from product["categories"]
	category_products_leafs = {} # category_id -> { product_id -> product_id } ; only leaf categories (last element in categoryTree)
	products_categories = {} # product_id -> { category_id -> True } ; from product["categories"]
	products = {} # product_id -> { product_data } ; shrunk down product object data
	product_types = {} # type_id (string) -> { product_id -> True } ; from product["type"]

	failed_files = []
	total_files = len(files)
	for i, file in enumerate(files):
		print(f"Processing file {i+1}/{total_files} ({file})\r", end="")
		try:
			with open(file, "r") as f:
				data = json.load(f)
			product = data["products"][0] # we assume there is only one product per file as we query the API by ASIN
			asin = product["asin"]
			product_root_category = product["rootCategory"]
			product_categories = product["categories"]
			product_category_tree = product["categoryTree"]
			leaf_category = product_category_tree[-1]["catId"]
			product_type = product["type"]
			products[asin] = {
				"file": file,
				"asin": asin,
				"title": product["title"],
				"type": product_type,
				"root_category": product_root_category,
				"leaf_category": leaf_category,
				"categories": product_categories,
				"category_tree": product_category_tree,
			}

			# Process product types
			product_types.setdefault(product_type, {})[asin] = True

			# Use leaf category to fill category_products_leafs
			# These are the actual specific categories that we want to use for filtering
			category_products_leafs.setdefault(leaf_category, {})[asin] = asin

			# Process rootCategory first
			category_map.setdefault(product_root_category, product_root_category) # if we don't know the name, we use the id still

			# Process categories
			for category in product_categories:
				category_map.setdefault(category, category) # if we don't know the name, we use the id still
				category_products.setdefault(category, {})[asin] = products[asin]
				# products_categories is keyed by ASIN, so the entry must be created per ASIN;
				# creating it per category would wipe the categories collected so far
				products_categories.setdefault(asin, {})[category] = True

			# Fill category tree
			# the final category seems to coincide with the first element in "categories" - should we only use this
			# to match products so that a single product can only be in one "main" category?
			current_tree_level = category_tree # set it to the root
			for category_tree_object in product_category_tree:
				category_id = category_tree_object["catId"]
				category_name = category_tree_object["name"]
				# set name if not set yet or if its value is the id
				if category_id not in category_map or category_map[category_id] == category_id:
					category_map[category_id] = category_name
				# add it to the tree if not set yet and go one level deeper
				current_tree_level = current_tree_level.setdefault(category_id, {})
		except Exception as e:
			failed_files.append((file, e))
	return_object = {
		"category_map": category_map,
		"category_tree": category_tree,
		"category_products": category_products,
		"category_products_leafs": category_products_leafs,
		"products_category": products_categories,
		"product_types": product_types,
		"failed_files": failed_files,
		"products": products
	}
	print("") # to clean up the line
	return return_object

# domain_id -> result of parse_files() for the json files of that domain
domains_parsed = {}

for i, domain_id in enumerate(domains):
	domain_label = f"amazon.{domains_map[domain_id]}"
	print(f"Parsing domain {i+1}/{len(domains)}: {domain_label}")
	time_start = time.time()
	domain_files = list(domains[domain_id]["json_files"].values())
	domains_parsed[domain_id] = parse_files(domain_files)
	time_end = time.time()
	elapsed = time_end - time_start
	print(f"Domain {domain_label} parsed in {elapsed:.2f} seconds")


In [None]:
# Report how many files could not be parsed, per domain
for domain_id in domains_parsed:
	failed_count = len(domains_parsed[domain_id]["failed_files"])
	print(f"Domain amazon.{domains_map[domain_id]} got {failed_count} failed files")

In [None]:
# Pick the domain with the fewest failed files for category analysis
# (on a tie the first such domain in domains_parsed is used)
failed_counts = {candidate: len(parsed["failed_files"]) for candidate, parsed in domains_parsed.items()}
domain_id = min(failed_counts, key=failed_counts.get)
print(f"Selected domain amazon.{domains_map[domain_id]} for category analysis")

parsed_data = domains_parsed[domain_id]

# NOTE: Failed files in domains_parsed could still contain valid CSV data but have some missing fields
#       which may not be essential for category analysis and index creation.

In [None]:
# Recursive count using the category_tree object
# using depth first search
def count_products_in_category(parsed_data: dict, category_id: int, current_node: dict = None) -> int:
	'''
		Counts the products of a category including all of its subcategories.

		The category is searched depth first in parsed_data["category_tree"] (or in `current_node`
		when given), so `category_id` can be any part of the tree. Once found, the sizes of the
		parsed_data["category_products"] entries of the category and of everything below it are summed;
		categories without such an entry contribute 0.
		Returns -1 if `category_id` is not in the tree.
	'''
	node = parsed_data["category_tree"] if current_node is None else current_node

	if category_id not in node:
		# not at this level: descend into each branch, the first branch that knows the category wins
		for branch in node.values():
			found = count_products_in_category(parsed_data, category_id, branch)
			if found != -1:
				return found
		return -1

	# we found the node from which we start counting
	own_products = parsed_data["category_products"].get(category_id, ())
	children = node[category_id]
	return len(own_products) + sum(
		count_products_in_category(parsed_data, child_id, children) for child_id in children
	)


def print_basic_statistics(parsed_data: dict):
	'''
		Prints the number of categories, products, root categories and failed files of `parsed_data`,
		plus one line per root category with its name, product count and catId.
	'''
	category_map = parsed_data["category_map"]
	print(f"Number of categories: {len(category_map)}")
	print(f"Number of products: {len(parsed_data['products_category'])}")
	root_categories = parsed_data["category_tree"]
	print(f"Number of root categories: {len(root_categories)}")
	# Root categories are counted recursively over the tree because a root category
	# does not necessarily have its own entry in "category_products"
	for root_category_id in root_categories:
		count = count_products_in_category(parsed_data, root_category_id)
		print(f" - {category_map[root_category_id]}: {count}    (catId={root_category_id})")
	print("")
	print(f"Number of failed files: {len(parsed_data['failed_files'])}")
	# if len(parsed_data["failed_files"]) > 0:
	# 	for failed_file in parsed_data["failed_files"]:
	# 		print(f" - '{failed_file[0]}' failed because:  '{failed_file[1]}'")

# Statistics of the parsed data used for the category analysis
print_basic_statistics(parsed_data)


In [None]:
def print_category_tree(parsed_data: dict, level: int = 0):
	'''
		Prints parsed_data["category_tree"] as an indented outline, one category name per line.
		Names come from parsed_data["category_map"]; every tree level is indented by four more spaces.
	'''
	names = parsed_data["category_map"]
	prefix = " " * (4 * level)
	for category_id, children in parsed_data["category_tree"].items():
		print(prefix + names[category_id])
		# recurse with the subtree wrapped in the same two-key structure
		print_category_tree({"category_tree": children, "category_map": names}, level + 1)

# Best inspected in a text editor (tab sizes here are 8, so it looks weird in Jupyter notebook cell output)
# print_category_tree(parsed_data)
# should just use the bottom function to print the tree as json with substitute names

In [None]:
def substitute_category_ids(parsed_data: dict) -> dict:
	'''
		Builds a copy of parsed_data["category_tree"] whose keys are the category names
		from parsed_data["category_map"] instead of the category ids (applied on every level).
	'''
	names = parsed_data["category_map"]
	return {
		names[category_id]: substitute_category_ids({"category_map": names, "category_tree": children})
		for category_id, children in parsed_data["category_tree"].items()
	}


# Basic JSON print of the category tree (category ids substituted with their names)
print(json.dumps(substitute_category_ids(parsed_data), indent=2))

In [None]:
# Print the number of products of each category, largest category first
category_sizes = [
	(category_id, len(category_asins))
	for category_id, category_asins in parsed_data["category_products"].items()
]
category_sizes.sort(key=lambda pair: pair[1], reverse=True)
for category_id, product_count in category_sizes:
	print(f"{parsed_data['category_map'][category_id]}: {product_count}")


In [None]:
# Get the number of products for each root category
def get_root_category_products_count(parsed_data: dict) -> dict:
	'''
		Tallies the products of parsed_data["products"] by their "root_category".
		Returns a dictionary with root category ids as keys and number of products as values.
	'''
	tally = {}
	for product in parsed_data["products"].values():
		root_id = product["root_category"]
		tally[root_id] = tally.get(root_id, 0) + 1
	return tally

# Product count per root category, keyed by the category name instead of its id
root_category_products_count = {
	parsed_data["category_map"][root_id]: product_count
	for root_id, product_count in get_root_category_products_count(parsed_data).items()
}
print(json.dumps(root_category_products_count, indent=2))

In [None]:
def get_root_category_files(parsed_data: dict, root_category_id: int) -> list:
	'''
	Returns the "file" of every product in parsed_data["products"] whose "root_category"
	equals `root_category_id`, in the order the products are stored.
	'''
	return [
		product["file"]
		for product in parsed_data["products"].values()
		if product["root_category"] == root_category_id
	]

# Select only "Electronics" root category
# (the id below is matched against the "root_category" value of every parsed product)
electronics_root_category_id = 172282
electronics_files = get_root_category_files(parsed_data, electronics_root_category_id)
print(f"Electronics root category has {len(electronics_files)} products")

In [None]:
# Parse only the Electronics product files again and print the basic statistics
# (including the number of root categories) for that subset
electronics_parsed_data = parse_files(electronics_files)
print_basic_statistics(electronics_parsed_data)

In [None]:
# NOTE(review): this prints the tree of the full `parsed_data` again, not of the
# `electronics_parsed_data` subset - confirm which one is intended here.
print(json.dumps(substitute_category_ids(parsed_data), indent=2))

In [None]:
def print_product_types_count(product_types: dict):
	'''
		Prints a "Product types:" header followed by one "type: count" line per product type,
		ordered by the number of products (highest first; ties keep their original order).
	'''
	print("Product types:")
	ranked = sorted(
		((type_name, len(type_asins)) for type_name, type_asins in product_types.items()),
		key=lambda item: item[1],
		reverse=True,
	)
	for type_name, type_count in ranked:
		print(f"{type_name}: {type_count}")
	
# Product counts per product type, highest count first
print_product_types_count(parsed_data["product_types"])

# Conclusions

* We could rely almost entirely on product types and then further split certain product types by leaf category - for example, COMPUTER_DRIVE_OR_STORAGE could be split by leaf category to get separate insights for HDD, SSD, etc.

In [None]:
def get_product_types_count(product_types: dict, minimum_count: int) -> dict:
	'''
	Returns a dictionary with product types as keys and number of products as values,
	ordered by count (highest first) and limited to types with at least `minimum_count` products.
	'''
	sizes = [(type_name, len(type_asins)) for type_name, type_asins in product_types.items()]
	sizes.sort(key=lambda item: item[1], reverse=True)
	return {type_name: size for type_name, size in sizes if size >= minimum_count}

# Keep only the product types with at least 100 products
product_types_count = get_product_types_count(parsed_data["product_types"], 100) # put 0 to get all types
print(json.dumps(product_types_count, indent=2))

In [None]:
# Filter out product types
product_types_filter = [
	None,
	"KEYBOARD_MOUSE_SET",
]

for product_type in product_types_filter:
	# pop() with a default simply skips types that are not among the counted ones
	product_types_count.pop(product_type, None)

# print(f"{json.dumps(product_types_count, indent=2)}")

# NOTE: the cell that regroups "COMPUTER_DRIVE_OR_STORAGE" reads `accepted_product_types`,
# so the list has to be bound to exactly this name for the notebook to run top to bottom.
accepted_product_types = list(product_types_count.keys())
print(f"Accepted product types: {json.dumps(accepted_product_types, indent=2)}")

In [None]:
# Split the products with type "COMPUTER_DRIVE_OR_STORAGE" into categories using the leaf category
def split_products_by_leaf_category(product_type: dict, products: dict) -> dict:
	'''
		Groups the ASINs of `product_type` by the "leaf_category" stored for them in `products`.
		Returns a dictionary with leaf category ids as keys and { ASIN -> True } dictionaries as values.
	'''
	grouped = {}
	for asin in product_type:
		grouped.setdefault(products[asin]["leaf_category"], {})[asin] = True
	return grouped

def print_leaf_category_products(leaf_category_products: dict, products: dict, category_map: dict, leaf_category_name: str, product_types_count: int):
	'''
		Prints a header line for the product type `leaf_category_name` followed by one line
		per leaf category with its name (from `category_map`), its catId and its product count.
		`products` is not used for the printed output.
	'''
	category_total = len(leaf_category_products)
	print(f"Leaf categories and their product counts for product type '{leaf_category_name}' ({category_total} categories, {product_types_count} products total):")
	for leaf_category, category_asins in leaf_category_products.items():
		print(f"{category_map[leaf_category]} (catId={leaf_category}) has {len(category_asins)} products")

# Leaf categories of all products with type "COMPUTER_DRIVE_OR_STORAGE"
storage_type_asins = parsed_data["product_types"]["COMPUTER_DRIVE_OR_STORAGE"]
computer_drive_or_storage_categories = split_products_by_leaf_category(storage_type_asins, parsed_data["products"])
print_leaf_category_products(
	computer_drive_or_storage_categories,
	parsed_data["products"],
	parsed_data["category_map"],
	"COMPUTER_DRIVE_OR_STORAGE",
	len(storage_type_asins),
)

In [None]:
# What exactly is "Internal Components" (catId=17923671011) which has 1148 products?
# Lists the ASIN and title of every product in that leaf category (no output limit applied)
for asin in computer_drive_or_storage_categories[17923671011]:
	print(f"{asin}: {parsed_data['products'][asin]['title']}")

In [None]:
# Inspect a single product: title, file, leaf category name and the names of all its categories
sample_product = parsed_data["products"]["B0978VB5TF"]
category_names = parsed_data["category_map"]
print(sample_product["title"])
print(sample_product["file"])
print(category_names[sample_product["leaf_category"]])
print("")
for category in sample_product["categories"]:
	print(category_names[category])

In [None]:
# It looks like "Internal Components" can still be separated into "Internal Solid State Drives" and "Internal Hard Drives"

In [None]:
# Try getting categories for all products in the "Internal Components" category for the "COMPUTER_DRIVE_OR_STORAGE" type
# and see if we can separate them into "Internal Solid State Drives" and "Internal Hard Drives" or something else

def get_category_counts_for_17923671011(computer_drive_or_storage_categories: dict, parsed_data: dict):
	'''
		Groups the products of leaf category 17923671011 by each entry of their "categories" list.
		Returns a dictionary with category ids as keys and { ASIN -> True } dictionaries as values.
	'''
	grouped = {}
	for asin in computer_drive_or_storage_categories[17923671011]:
		for category in parsed_data["products"][asin]["categories"]:
			grouped.setdefault(category, {})[asin] = True
	return grouped

# Every category that occurs on the "Internal Components" products, with its product count
category_counts = get_category_counts_for_17923671011(computer_drive_or_storage_categories, parsed_data)
for category, category_asins in category_counts.items():
	print(f"{parsed_data['category_map'][category]} (catId={category}) has {len(category_asins)} products")

In [None]:
# Loop over the products again and collect the ASINs of those that have category 1292116011 (Internal Solid State Drives) or 1254762011 (Internal Hard Drives)

def get_disk_categories_from_internal_components(computer_drive_or_storage_categories: dict, parsed_data: dict) -> dict:
	'''
		Sorts the products of leaf category 17923671011 into the two internal disk categories
		(1254762011 and 1292116011) whenever one of those ids appears in their "categories" list.
		Returns a dictionary with these two category ids as keys and { ASIN -> True } dictionaries as values.
	'''
	internal_hard_drives = 1254762011 # Internal Hard Drives
	internal_solid_state_drives = 1292116011 # Internal Solid State Drives
	disk_categories = {internal_hard_drives: {}, internal_solid_state_drives: {}}
	for asin in computer_drive_or_storage_categories[17923671011]:
		for category in parsed_data["products"][asin]["categories"]:
			bucket = disk_categories.get(category)
			if bucket is not None:
				bucket[asin] = True
	return disk_categories
	
# Internal HDD / SSD products recovered from the "Internal Components" leaf category
newly_split_internal_components = get_disk_categories_from_internal_components(computer_drive_or_storage_categories, parsed_data)
for category, category_asins in newly_split_internal_components.items():
	print(f"{parsed_data['category_map'][category]} (catId={category}) has {len(category_asins)} products")

In [None]:
# Add these back to the pool of categories for the "COMPUTER_DRIVE_OR_STORAGE" type using computer_drive_or_storage_categories
# and remove the "Internal Components" category

internal_components_category_id = 17923671011

# Copy the inner dicts: sharing them with computer_drive_or_storage_categories would make
# the merge below silently add the recovered ASINs to the original dictionary as well.
computer_drive_or_storage_categories_fixed = {
	category: dict(category_asins)
	for category, category_asins in computer_drive_or_storage_categories.items()
	if category != internal_components_category_id
}
for category, category_asins in newly_split_internal_components.items():
	# setdefault: the disk category may not exist yet as a leaf category of this product type
	target = computer_drive_or_storage_categories_fixed.setdefault(category, {})
	target.update({asin: True for asin in category_asins})

# Print new counts
print_leaf_category_products(computer_drive_or_storage_categories_fixed, parsed_data["products"], parsed_data["category_map"], "COMPUTER_DRIVE_OR_STORAGE", len(parsed_data["product_types"]["COMPUTER_DRIVE_OR_STORAGE"]))

# Conclusions

* We have successfully split the "Internal Components" products using the internal SSD and HDD category ids: compared with the earlier counts there is one category fewer, while the SSD and HDD counts grew by the number of products identified within "Internal Components".

# TODO

* Edit the types for COMPUTER_DRIVE_OR_STORAGE: remove that type and add these specific subcategories instead:
	* Internal Solid State Drives (catId=1292116011) has 1659 products
	* Internal Hard Drives (catId=1254762011) has 1284 products
	* External Hard Drives (catId=595048) has 316 products
	* External Solid State Drives (catId=3015429011) has 126 products

In [None]:
# Group products by type using the new split categories for "COMPUTER_DRIVE_OR_STORAGE"

# drop the generic COMPUTER_DRIVE_OR_STORAGE type, it is replaced by the split categories below
accepted_product_types = [
	accepted_type for accepted_type in accepted_product_types
	if accepted_type != "COMPUTER_DRIVE_OR_STORAGE"
]

# names of the new categories, converted to the UPPER_SNAKE_CASE form used by the product types
to_add = [
	category_name.upper().replace(" ", "_")
	for category_name in (
		"Internal Solid State Drives",
		"Internal Hard Drives",
		"External Hard Drives",
		"External Solid State Drives",
	)
]

print(json.dumps(to_add, indent=2))


In [None]:
# * Internal Solid State Drives (catId=1292116011) has 1659 products
# * Internal Hard Drives (catId=1254762011) has 1284 products
# * External Hard Drives (catId=595048) has 316 products
# * External Solid State Drives (catId=3015429011) has 126 products
storage_category_ids = {
	"INTERNAL_SOLID_STATE_DRIVES": 1292116011,
	"INTERNAL_HARD_DRIVES": 1254762011,
	"EXTERNAL_HARD_DRIVES": 595048,
	"EXTERNAL_SOLID_STATE_DRIVES": 3015429011,
}
# the new categories go to the start of the list
accepted_product_types_added = to_add + accepted_product_types
# -1 marks a plain product type; the split storage categories carry their category id instead
accepted_product_types_dict = {x: -1 for x in accepted_product_types_added}
accepted_product_types_dict.update(storage_category_ids)
print(json.dumps(accepted_product_types_dict, indent=2))


In [None]:
# Construct a new dictionary with the new categories
# keys are types or categories and values are modified dictionaries of products from parsed_data

def get_products_for_categories(parsed_data: dict, storage_categories: dict, accepted_product_types: dict) -> dict:
	'''
		Builds { product type or category name -> { ASIN -> product object } }.

		`accepted_product_types` maps each name to a category id. For -1 the ASINs are taken from
		parsed_data["product_types"][name]; for any other id they are taken from
		storage_categories[id]. The product objects come from parsed_data["products"].
	'''
	all_products = parsed_data["products"]
	grouped = {}
	for product_type, category_id in accepted_product_types.items():
		if category_id == -1: # a normal category from types
			asins = parsed_data["product_types"][product_type]
		else: # a split category for "COMPUTER_DRIVE_OR_STORAGE"
			asins = storage_categories[category_id]
		grouped[product_type] = {asin: all_products[asin] for asin in asins}
	return grouped

# Products of every constructed category plus the overall total
products_for_categories = get_products_for_categories(parsed_data, computer_drive_or_storage_categories_fixed, accepted_product_types_dict)
print("Products for constructed categories:")
for product_type, type_products in products_for_categories.items():
	print(f"{product_type}: {len(type_products)}")
total_products = sum(len(type_products) for type_products in products_for_categories.values())
print("")
print(f"Total products: {total_products}")