In [None]:
# Product category analysis
# Used for further filtering of the data.

import os
import json
import time
# from keepa.interface import keepa_minutes_to_time, parse_csv

In [None]:
path_products_domain_1 = "data/keepa/products/domains/1"

In [None]:
def get_files(path):
	'''
		Returns all json files in the given path (including subdirectories).

		Only files whose name ends with ".json" are returned; names that merely contain
		the substring (for example "a.json.bak") are skipped.
		Each returned entry is the walked directory joined with the file name.
	'''
	files = []
	for directory, _subdirectories, file_names in os.walk(path):
		for file_name in file_names:
			# suffix test instead of substring test so backup/temp files are not picked up
			if file_name.endswith(".json"):
				files.append(os.path.join(directory, file_name))
	return files

files = get_files(path_products_domain_1)
print(f"Got {len(files)} files")

In [None]:
def parse_files(files: list) -> dict:
	'''
		Returns a dictionary with different objects for analyzing the categories of products quickly.

		Every file is read as JSON and only the first entry of its "products" list is used.
		A file that raises while being read or processed is recorded in "failed_files" as a
		(file, exception) tuple and the loop continues with the next file.

		Keys of the returned dictionary: "category_map", "category_tree", "category_products",
		"category_products_leafs", "products_category", "product_types", "failed_files", "products".
	'''
	# category_id: int, product_id: str (ASIN)
	category_map = {} # category_id -> category_name (or the id itself while the name is unknown)
	category_tree = {} # category_id -> { subcategory_id -> ... } ; tree of all categories
	category_products = {} # category_id -> { product_id -> shrunk product object } ; from product["categories"]
	category_products_leafs = {} # category_id -> { product_id -> product_id } ; only the last element of product["categoryTree"]
	products_categories = {} # product_id -> { category_id -> True } ; from product["categories"]
	products = {} # product_id -> { product_data } ; shrunk down product object data
	product_types = {} # type_id (string) -> { product_id -> True } ; from product["type"]

	failed_files = []
	for i, file in enumerate(files):
		print(f"Processing file {i+1}/{len(files)} ({file})\r", end="")
		try:
			# explicit encoding so the result does not depend on the platform default
			with open(file, "r", encoding="utf-8") as f:
				data = json.load(f)
			product = data["products"][0] # we assume there is only one product per file as we query the API by ASIN
			asin = product["asin"]
			product_root_category = product["rootCategory"]
			product_categories = product["categories"]
			product_category_tree = product["categoryTree"]
			leaf_category = product_category_tree[-1]["catId"]
			product_type = product["type"]
			products[asin] = {
				"file": file,
				"asin": asin,
				"title": product["title"],
				"type": product_type,
				"root_category": product_root_category,
				"leaf_category": leaf_category,
				"categories": product_categories,
				"category_tree": product_category_tree,
			}

			# Process product types
			product_types.setdefault(product_type, {})[asin] = True

			# Use leaf category to fill category_products_leafs
			# These are the actual specific categories that we want to use for filtering
			category_products_leafs.setdefault(leaf_category, {})[asin] = asin

			# Process rootCategory first; if we don't know the name, we use the id still
			category_map.setdefault(product_root_category, product_root_category)

			# Process categories
			for category in product_categories:
				category_map.setdefault(category, category) # placeholder until the tree provides a name
				category_products.setdefault(category, {})[asin] = products[asin]
				# keyed by ASIN: the entry must be created once per product, not once per category,
				# otherwise every category except the last one is thrown away
				products_categories.setdefault(asin, {})[category] = True

			# Fill category tree
			# the final category seems to coincide with the first element in "categories" - should we only use this
			# to match products so that a single product can only be in one "main" category?
			current_tree_level = category_tree # start at the root
			for category_tree_object in product_category_tree:
				category_id = category_tree_object["catId"]
				category_name = category_tree_object["name"]
				# set name if not set yet or if its value is still the id placeholder
				if category_map.get(category_id, category_id) == category_id:
					category_map[category_id] = category_name
				# add the node if it is missing and go one level deeper
				current_tree_level = current_tree_level.setdefault(category_id, {})
		except Exception as e:
			# best effort: remember the failure and keep going
			failed_files.append((file, e))
	return_object = {
		"category_map": category_map,
		"category_tree": category_tree,
		"category_products": category_products,
		"category_products_leafs": category_products_leafs,
		"products_category": products_categories,
		"product_types": product_types,
		"failed_files": failed_files,
		"products": products
	}
	print("") # to clean up the line
	return return_object

time_start = time.time()
parsed_data = parse_files(files)
time_end = time.time()
print(f"Finished parsing files in {round(time_end - time_start, 2)}s")

In [None]:
# Recursive count using the category_tree object
# using depth first search
def count_products_in_category(parsed_data: dict, category_id: int, current_node: dict = None) -> int:
	'''
		Counts products for a category and everything below it, using parsed_data["category_tree"]
		for the structure and parsed_data["category_products"] for the per-category product sets.

		category_id may sit at any depth of the tree; the tree is searched depth first.
		current_node is the subtree to search and defaults to the whole tree.
		Returns -1 when category_id is not found in the searched subtree.
	'''
	node = parsed_data["category_tree"] if current_node is None else current_node

	if category_id not in node:
		# not at this level: descend into each child and stop at the first hit
		for child in node.values():
			found = count_products_in_category(parsed_data, category_id, child)
			if found != -1:
				return found
		return -1

	# we are at the node to count from; a category without an entry in
	# "category_products" (assumed to be a root category) contributes nothing itself
	own_products = parsed_data["category_products"].get(category_id)
	total = 0 if own_products is None else len(own_products)
	children = node[category_id]
	for child_id in children:
		total += count_products_in_category(parsed_data, child_id, children)
	return total


def print_basic_statistic(parsed_data: dict):
	'''
		Prints summary counts for a parsed_data object: number of categories, products,
		root categories (with a recursive product count for each) and failed files.
	'''
	names = parsed_data["category_map"]
	print(f"Number of categories: {len(names)}")
	print(f"Number of products: {len(parsed_data['products_category'])}")
	roots = parsed_data["category_tree"]
	print(f"Number of root categories: {len(roots)}")
	# Root categories are not necessarily listed in any product's "categories",
	# so "category_products" cannot be used directly here - do a recursive count instead.
	for root_id in roots:
		total = count_products_in_category(parsed_data, root_id)
		print(f" - {names[root_id]}: {total}    (catId={root_id})")
	failures = parsed_data["failed_files"]
	print(f"Number of failed files: {len(failures)}")
	for failure in failures:
		# each entry is (file, exception)
		print(f" - '{failure[0]}' failed because:  '{failure[1]}'")

print_basic_statistic(parsed_data)


In [None]:
def print_category_tree(parsed_data: dict, level: int = 0):
	'''
		Prints the category tree with category names instead of ids.
		Every nesting level is indented by four more spaces; level is the indentation
		depth of the top-most printed categories.
	'''
	def _emit(subtree: dict, depth: int):
		# one line per category, then its subcategories one level deeper
		padding = "    " * depth
		for category_id, children in subtree.items():
			print(padding + parsed_data["category_map"][category_id])
			_emit(children, depth + 1)

	_emit(parsed_data["category_tree"], level)

# Best inspected in a text editor (tab sizes here are 8, so it looks weird in Jupyter notebook cell output)
# print_category_tree(parsed_data)
# should just use the bottom function to print the tree as json with substitute names

In [None]:
def substitute_category_ids(parsed_data: dict) -> dict:
	'''
		Builds a copy of parsed_data["category_tree"] in which every category id is
		replaced by its name from parsed_data["category_map"].
		Siblings that share the same name end up under a single key.
	'''
	names = parsed_data["category_map"]

	def _rename(subtree: dict) -> dict:
		renamed = {}
		for category_id, children in subtree.items():
			nested = _rename(children)
			renamed[names[category_id]] = nested
		return renamed

	return _rename(parsed_data["category_tree"])


# Basic JSON print of the category tree
print(json.dumps(substitute_category_ids(parsed_data), indent=2))

In [None]:
# Print number of products for each category sorted by number of products
for category_id, products in sorted(parsed_data["category_products"].items(), key=lambda x: len(x[1]), reverse=True):
	print(f"{parsed_data['category_map'][category_id]}: {len(products)}")


In [None]:
# Get the number of products for each root category
def get_root_category_products_count(parsed_data: dict) -> dict:
	'''
		Tallies parsed_data["products"] by their "root_category".
		Returns a dictionary: root category id -> number of products.
	'''
	tally = {}
	for product in parsed_data["products"].values():
		root_id = product["root_category"]
		tally[root_id] = tally.get(root_id, 0) + 1
	return tally

root_category_products_count = get_root_category_products_count(parsed_data)
root_category_products_count = {parsed_data["category_map"][k]: v for k, v in root_category_products_count.items()}
print(f"{json.dumps(root_category_products_count, indent=2)}")

In [None]:
# Collect the source files of all products whose root category is "Electronics"
ELECTRONICS_ROOT_CATEGORY_ID = 172282 # "Electronics"
files_electronics = [
	product["file"]
	for product in parsed_data["products"].values()
	if product["root_category"] == ELECTRONICS_ROOT_CATEGORY_ID
]
print(f"Got {len(files_electronics)} files for root category 'Electronics'")
# guard: indexing an empty list would raise IndexError when no product matches
if files_electronics:
	print(f"Example file: {files_electronics[0]}")

In [None]:
parsed_data = parse_files(files_electronics)
# Print basic statistics
print_basic_statistic(parsed_data)

In [None]:
print(json.dumps(substitute_category_ids(parsed_data), indent=2))

In [None]:
# Print all products for all categories
def get_all_products_for_categories(parsed_data: dict) -> dict:
	'''
		Prints every category from parsed_data["category_products"] followed by its
		products (title and file), with the product lines sorted alphabetically.
		Returns parsed_data["category_products"] unchanged.
	'''
	by_category = parsed_data["category_products"]
	for category_id, members in by_category.items():
		print(f"{parsed_data['category_map'][category_id]} ({len(members)} products):")
		# build the lines first so they can be printed in sorted order
		lines = sorted(
			f"    - {parsed_data['products'][asin]['title']} ; file={parsed_data['products'][asin]['file']}"
			for asin in members
		)
		for line in lines:
			print(line)
	return by_category

pretty_category_products = get_all_products_for_categories(parsed_data)

In [None]:
# print(parsed_data["category_map"]["13900851"])

In [None]:
def print_product_types_count(product_types: dict):
	'''
		Prints one "type: count" line per product type, largest count first.
		product_types maps a type to the collection of its products.
	'''
	print("Product types:")
	ranked = sorted(
		((type_name, len(members)) for type_name, members in product_types.items()),
		key=lambda pair: pair[1],
		reverse=True,
	)
	for type_name, amount in ranked:
		print(f"{type_name}: {amount}")
	
print_product_types_count(parsed_data["product_types"])

In [None]:
def print_product_asins_for_type(product_types: dict, product_type: str, products: dict = {}):
	'''
		Prints the ASINs registered under product_type in product_types.
		When a non-empty products dictionary is given, the product title is printed next to each ASIN.
	'''
	# the default dict is only read, never modified
	print(f"Products for type {product_type}:")
	for asin in product_types[product_type]:
		line = f"{asin}: {products[asin]['title']}" if len(products) != 0 else f"{asin}"
		print(line)

print_product_asins_for_type(parsed_data["product_types"], "AXE", parsed_data["products"]) # ???

In [None]:
def print_product_type_and_leaf_category(product_type: dict, products: dict, category_map: dict = {}):
	'''
		Prints the product type and leaf category for each product.

		product_type is iterated for ASINs, products maps an ASIN to its product object
		and category_map maps a category id to its name.
		A leaf category missing from category_map (always the case with the default
		empty map) is printed as its id instead of raising a KeyError.
	'''
	print("Product type and leaf category:")
	for asin in product_type:
		product = products[asin]
		leaf_category = product["leaf_category"]
		# fall back to the id when the name is unknown
		leaf_name = category_map.get(leaf_category, leaf_category)
		print(f"{asin}: {product['type']}: {leaf_name}          ({product['title']})")

print_product_type_and_leaf_category(parsed_data["product_types"]["AXE"], parsed_data["products"], parsed_data["category_map"])

# Conclusions

* We can rely almost entirely on product types, and then split certain product types further by product leaf category - for example, COMPUTER_DRIVE_OR_STORAGE could be split by leaf category to get separate insights for HDD, SSD, etc.

In [None]:
# Raw "TYPE: count" lines copied from the product type counts output
accepted_product_types="""COMPUTER_DRIVE_OR_STORAGE: 3465
MOTHERBOARD: 2251
MONITOR: 2077
INTERNAL_MEMORY: 1656
VIDEO_CARD: 1605
COMPUTER_PROCESSOR: 1553
INPUT_MOUSE: 1364
ELECTRONIC_COMPONENT_FAN: 1359
KEYBOARDS: 1081
SYSTEM_POWER_DEVICE: 996
NOTEBOOK_COMPUTER: 727
NETWORK_INTERFACE_CONTROLLER_ADAPTER: 314
PERSONAL_COMPUTER: 281
COMPUTER_CHASSIS: 221
KEYBOARD_MOUSE_SET: 208
FLASH_MEMORY: 184
ELECTRONIC_CABLE: 154
CHARGING_ADAPTER: 147
HEADPHONES: 122
None: 105
NETWORKING_ROUTER: 100
"""

# Keep only the type name in front of ": " and drop empty lines
accepted_product_types = [
	line.strip().split(": ")[0].strip()
	for line in accepted_product_types.split("\n")
]
accepted_product_types = [type_name for type_name in accepted_product_types if type_name != ""]

# Print the names as quoted list items so they can be pasted into a list literal
for product_type in accepted_product_types:
	print(f"\"{product_type}\",")

In [None]:
# Copy from above and clear any unwanted types
accepted_product_types = [
	"COMPUTER_DRIVE_OR_STORAGE",
	"MOTHERBOARD",
	"MONITOR",
	"INTERNAL_MEMORY",
	"VIDEO_CARD",
	"COMPUTER_PROCESSOR",
	"INPUT_MOUSE",
	"ELECTRONIC_COMPONENT_FAN",
	"KEYBOARDS",
	"SYSTEM_POWER_DEVICE",
	"NOTEBOOK_COMPUTER",
	"NETWORK_INTERFACE_CONTROLLER_ADAPTER",
	"PERSONAL_COMPUTER",
	"COMPUTER_CHASSIS",
	#"KEYBOARD_MOUSE_SET",
	"FLASH_MEMORY",
	"ELECTRONIC_CABLE",
	"CHARGING_ADAPTER",
	"HEADPHONES",
	#"None",
	"NETWORKING_ROUTER",
]


In [None]:
# Split the products with type "COMPUTER_DRIVE_OR_STORAGE" into categories using the leaf category
def split_products_by_leaf_category(product_type: dict, products: dict) -> dict:
	'''
		Groups the ASINs found in product_type by the "leaf_category" of their product object.
		Returns a dictionary: leaf category id -> { ASIN -> True }.
	'''
	grouped = {}
	for asin in product_type:
		grouped.setdefault(products[asin]["leaf_category"], {})[asin] = True
	return grouped

def print_leaf_category_products(leaf_category_products: dict, products: dict, category_map: dict, leaf_category_name: str, product_types_count: int):
	'''
		Prints a header followed by one line per leaf category with its name, id and product count.

		leaf_category_name is the label shown in the header as the product type and
		product_types_count is the total shown next to it. products is not read here.
	'''
	category_total = len(leaf_category_products)
	print(f"Leaf categories and their product counts for product type '{leaf_category_name}' ({category_total} categories, {product_types_count} products total):")
	for leaf_category, members in leaf_category_products.items():
		print(f"- {category_map[leaf_category]} (catId={leaf_category}) has {len(members)} products")

computer_drive_or_storage_categories = split_products_by_leaf_category(parsed_data["product_types"]["COMPUTER_DRIVE_OR_STORAGE"], parsed_data["products"])
print_leaf_category_products(computer_drive_or_storage_categories, parsed_data["products"], parsed_data["category_map"], "COMPUTER_DRIVE_OR_STORAGE", len(parsed_data["product_types"]["COMPUTER_DRIVE_OR_STORAGE"]))

In [None]:
# What exactly is "Internal Components" (catId=17923671011) which has 1148 products?
for asin in computer_drive_or_storage_categories[17923671011]:
	product = parsed_data['products'][asin]
	print(f"{asin}: {product['title']}")

In [None]:
hmm = parsed_data["products"]["B0978VB5TF"]
print(hmm["title"])
print(hmm["file"])
print(parsed_data["category_map"][hmm["leaf_category"]])
print("")
for category in hmm["categories"]:
	print(parsed_data["category_map"][category])

In [None]:
# Looks like we can still maybe separate "Internal Components" into "Internal Solid State Drives" and "Internal Hard Drives"

In [None]:
# Try getting categories for all products in the "Internal Components" category for the "COMPUTER_DRIVE_OR_STORAGE" type
# and see if we can separate them into "Internal Solid State Drives" and "Internal Hard Drives" or something else

def get_category_counts_for_17923671011(computer_drive_or_storage_categories: dict, parsed_data: dict):
	'''
		Looks at every ASIN stored under category 17923671011 and groups those ASINs
		by each entry of the product's "categories" list.
		Returns a dictionary: category id -> { ASIN -> True }.
	'''
	grouped = {}
	for asin in computer_drive_or_storage_categories[17923671011]:
		for category in parsed_data["products"][asin]["categories"]:
			grouped.setdefault(category, {})[asin] = True
	return grouped

category_counts = get_category_counts_for_17923671011(computer_drive_or_storage_categories, parsed_data)
for category in category_counts:
	print(f"{parsed_data['category_map'][category]} (catId={category}) has {len(category_counts[category])} products")

In [None]:
# Loop over the products again and return a dictionary with the ASINs of products that have category 1292116011 (Internal Solid State Drives) or 1254762011 (Internal Hard Drives)

def get_disk_categories_from_internal_components(computer_drive_or_storage_categories: dict, parsed_data: dict) -> dict:
	'''
		Sorts the ASINs stored under category 17923671011 into the two disk categories
		1254762011 (Internal Hard Drives) and 1292116011 (Internal Solid State Drives),
		based on the product's "categories" list. Products with neither category are left out.
		Returns a dictionary: disk category id -> { ASIN -> True }.
	'''
	internal_hard_drives = 1254762011
	internal_solid_state_drives = 1292116011
	disk_categories = {internal_hard_drives: {}, internal_solid_state_drives: {}}
	for asin in computer_drive_or_storage_categories[17923671011]:
		for category in parsed_data["products"][asin]["categories"]:
			bucket = disk_categories.get(category)
			if bucket is not None:
				bucket[asin] = True
	return disk_categories
	
newly_split_internal_components = get_disk_categories_from_internal_components(computer_drive_or_storage_categories, parsed_data)
for category in newly_split_internal_components:
	print(f"{parsed_data['category_map'][category]} (catId={category}) has {len(newly_split_internal_components[category])} products")

In [None]:
# Add these back to the pool of categories for the "COMPUTER_DRIVE_OR_STORAGE" type using computer_drive_or_storage_categories
# and remove the "Internal Components" category

# computer_drive_or_storage_categories[1254762011] = newly_split_internal_components[1254762011]

# Build the fixed mapping without "Internal Components" (17923671011).
# Each ASIN dictionary is copied so that merging the split products below does not
# also modify computer_drive_or_storage_categories.
computer_drive_or_storage_categories_fixed = {}
for category in computer_drive_or_storage_categories:
	if category != 17923671011:
		computer_drive_or_storage_categories_fixed[category] = dict(computer_drive_or_storage_categories[category])
# Merge the products recovered from "Internal Components"; setdefault covers the case
# where a split category has no products of its own yet.
for category in newly_split_internal_components:
	for asin in newly_split_internal_components[category]:
		computer_drive_or_storage_categories_fixed.setdefault(category, {})[asin] = True

# Print new counts
print_leaf_category_products(computer_drive_or_storage_categories_fixed, parsed_data["products"], parsed_data["category_map"], "COMPUTER_DRIVE_OR_STORAGE", len(parsed_data["product_types"]["COMPUTER_DRIVE_OR_STORAGE"]))

# Conclusions

* We have successfully split the "Internal Components" products using the internal SSD and HDD category ids: compared to the earlier counts, the number of categories dropped by 1, while the SSD and HDD counts increased by the number of products identified from "Internal Components".

# TODO

* edit the types for COMPUTER_DRIVE_OR_STORAGE in such a way that we remove that category and add specific subcategories:
	* Internal Solid State Drives (catId=1292116011) has 1659 products
	* Internal Hard Drives (catId=1254762011) has 1284 products
	* External Hard Drives (catId=595048) has 316 products
	* External Solid State Drives (catId=3015429011) has 126 products

In [None]:
# Group products by type using the new split categories for "COMPUTER_DRIVE_OR_STORAGE"

# remove COMPUTER_DRIVE_OR_STORAGE from accepted_product_types
accepted_product_types = [x for x in accepted_product_types if x != "COMPUTER_DRIVE_OR_STORAGE"]

# add the new categories to the start of the list
# Category names that replace COMPUTER_DRIVE_OR_STORAGE, converted to the
# UPPER_SNAKE_CASE form used by the product type names
to_add = [
	category_name.upper().replace(" ", "_")
	for category_name in (
		"Internal Solid State Drives",
		"Internal Hard Drives",
		"External Hard Drives",
		"External Solid State Drives",
	)
]

print(json.dumps(to_add, indent=2))


In [None]:
# Map every accepted type to -1, then give the four split storage categories
# their category id:
# * Internal Solid State Drives (catId=1292116011) has 1659 products
# * Internal Hard Drives (catId=1254762011) has 1284 products
# * External Hard Drives (catId=595048) has 316 products
# * External Solid State Drives (catId=3015429011) has 126 products
accepted_product_types_added = to_add + accepted_product_types
accepted_product_types_dict = dict.fromkeys(accepted_product_types_added, -1)
accepted_product_types_dict.update({
	"INTERNAL_SOLID_STATE_DRIVES": 1292116011,
	"INTERNAL_HARD_DRIVES": 1254762011,
	"EXTERNAL_HARD_DRIVES": 595048,
	"EXTERNAL_SOLID_STATE_DRIVES": 3015429011,
})
print(json.dumps(accepted_product_types_dict, indent=2))


In [None]:
# Construct a new dictionary with the new categories
# keys are types or categories and values are modified dictionaries of products from parsed_data

def get_products_for_categories(parsed_data: dict, storage_categories: dict, accepted_product_types: dict) -> dict:
	'''
		Collects the product objects for every entry of accepted_product_types.

		accepted_product_types maps a name to a category id, or to -1. For a category id
		the ASINs come from storage_categories[category_id]; for -1 they come from
		parsed_data["product_types"][name].
		Returns a dictionary: name -> { ASIN -> product object from parsed_data["products"] }.
	'''
	products_for_categories = {}
	for product_type, category_id in accepted_product_types.items():
		if category_id == -1:
			# a normal product type
			asins = parsed_data["product_types"][product_type]
		else:
			# one of the split categories for "COMPUTER_DRIVE_OR_STORAGE"
			asins = storage_categories[category_id]
		products_for_categories[product_type] = {asin: parsed_data["products"][asin] for asin in asins}
	return products_for_categories

products_for_categories = get_products_for_categories(parsed_data, computer_drive_or_storage_categories_fixed, accepted_product_types_dict)
total_products = 0
print("Products for constructed categories:")
for product_type in products_for_categories:
	print(f"{product_type}: {len(products_for_categories[product_type])}")
	total_products += len(products_for_categories[product_type])
print("")
print(f"Total products: {total_products}")

In [None]:
# Validate that all products across categories are unique

def validate_uniqueness_of_products(products_for_categories: dict) -> int:
	'''
		Checks whether any ASIN appears under more than one key of products_for_categories.
		Every repeated occurrence is printed together with the keys the ASIN was seen under before.
		Prints and returns the number of repeated occurrences.
	'''
	seen_in = {} # ASIN -> list of keys it has been seen under so far
	duplicates_count = 0
	for product_type, members in products_for_categories.items():
		for asin in members:
			owners = seen_in.setdefault(asin, [])
			if owners:
				print(f"Product {asin} is in multiple categories: {owners} and {product_type}")
				duplicates_count += 1
			owners.append(product_type)
	print(f"Total duplicates: {duplicates_count}")
	return duplicates_count

duplicates_count = validate_uniqueness_of_products(products_for_categories)

In [None]:
hmm = parsed_data["products"]["B005JZLINE"]
print(hmm["title"])

In [None]:
# Remove B005JZLINE from INTERNAL_SOLID_STATE_DRIVES as it's a HDD
# The membership check keeps this cell re-runnable: running it a second time is a no-op instead of a KeyError
if "B005JZLINE" in products_for_categories["INTERNAL_SOLID_STATE_DRIVES"]:
	del products_for_categories["INTERNAL_SOLID_STATE_DRIVES"]["B005JZLINE"]

In [None]:
duplicates_count = validate_uniqueness_of_products(products_for_categories)

In [None]:
total_products = 0
print("Products for constructed categories:")
for product_type in products_for_categories:
	print(f"{product_type}: {len(products_for_categories[product_type])}")
	total_products += len(products_for_categories[product_type])
print("")
print(f"Total products: {total_products}")

In [None]:
# Reduce every product object to the three fields needed downstream: asin, title, file
products_for_categories_simple = {
	category: {
		asin: {field: product[field] for field in ("asin", "title", "file")}
		for asin, product in members.items()
	}
	for category, members in products_for_categories.items()
}

In [None]:
# Save to file
path_categories_domain_1 = "data/keepa/generated/categories-domain-1.json"
# exist_ok=True replaces the separate existence check and is safe to re-run
os.makedirs(os.path.dirname(path_categories_domain_1), exist_ok=True)
with open(path_categories_domain_1, "w", encoding="utf-8") as f:
	json.dump(products_for_categories_simple, f, indent=2)

In [None]:
# Create audit file (for manual inspection)
path_categories_domain_1_audit = "data/keepa/generated/categories-domain-1-audit.txt"
# exist_ok=True replaces the separate existence check and is safe to re-run
os.makedirs(os.path.dirname(path_categories_domain_1_audit), exist_ok=True)
# Product titles are written verbatim and may contain non-ASCII characters, so the
# encoding is fixed to UTF-8 instead of depending on the platform default
with open(path_categories_domain_1_audit, "w", encoding="utf-8") as f:
	for category in products_for_categories:
		f.write(f"{category} ({len(products_for_categories[category])} products):\n")
		for asin in products_for_categories[category]:
			f.write(f" - {asin}: {products_for_categories[category][asin]['title']}\n")
		f.write("\n")