In [None]:
import os
import sys
import json
from typing import Tuple

# Project imports
sys.path.append(os.getcwd())
from src.py.utils.trash import move_to_trash, get_trash_path

In [None]:
# Root directory that holds one sub-directory per fetched Amazon domain.
path_output_root = "data/keepa/products/domains"
# domain_id: Amazon domain ID - Valid values: [ 1: com | 2: co.uk | 3: de | 4: fr | 5: co.jp | 6: ca | 8: it | 9: es | 10: in | 11: com.mx ]
# Maps the domain ID (as a string, matching the directory names) to the Amazon domain suffix.
domains_map = {
	"1": "com",
	"2": "co.uk",
	"3": "de",
	"4": "fr",
	"5": "co.jp",
	"6": "ca",
	"8": "it",
	"9": "es",
	"10": "in",
	"11": "com.mx",
}
# Only sub-directories count as domains: os.listdir also returns plain files,
# which would otherwise be walked, averaged and looked up in domains_map like a domain.
domain_ids = sorted(
	name for name in os.listdir(path_output_root)
	if os.path.isdir(os.path.join(path_output_root, name))
)
# Per-domain state; the later cells add more keys to each inner dictionary.
domains = {domain: {"path": os.path.join(path_output_root, domain)} for domain in domain_ids}
path_parsed_products = "data/scraped/camel/parsed-products.json"

print(f"Domains: {json.dumps(domains, indent=2)}")

In [None]:
def get_all_json_files(path):
	"""
	Recursively collect the files below ``path`` whose name ends with '.json'.

	:param path: directory to walk
	:return: list of file paths, each joined with the directory it was found in,
		in the order os.walk yields them
	"""
	return [
		os.path.join(folder, name)
		for folder, _subdirs, names in os.walk(path)
		for name in names
		if name.endswith('.json')
	]


# json_files = get_all_json_files(path_output_root)
# print(f"Got {len(json_files)} json files")

# Collect the json files of every domain and keep the list on the domain's entry.
for domain, domain_info in domains.items():
	domain_json_files = get_all_json_files(domain_info["path"])
	domain_info["json_files"] = domain_json_files # type: ignore
	print(f"Domain {domain} got {len(domain_json_files)} json files")

In [None]:
def get_failed_files(json_files: list) -> list:
	"""
	Find the json files that cannot be read/parsed or that contain an "error" entry.

	The ASIN reported for a file is its file name up to the first dot.

	:param json_files: list of paths to json files
	:return: list of tuples (asin, error); error is the value stored under the
		"error" key, or a "File parsing error: ..." message when reading,
		parsing or inspecting the file raised an exception
	"""
	failed_files = []
	total = len(json_files)
	for i, file in enumerate(json_files):
		# basename understands the platform's path separators; splitting on "/"
		# alone leaves the whole path in the ASIN for backslash-separated paths.
		asin = os.path.basename(file).split(".")[0]
		print(f"Checking file {i+1}/{total}: {asin}", end="\r")
		try:
			# Read explicitly as UTF-8 instead of depending on the platform's default encoding.
			with open(file, encoding="utf-8") as f:
				data = json.load(f)
				if "error" in data:
					failed_files.append((asin, data["error"]))
		except Exception as e:
			failed_files.append((asin, f"File parsing error: {e}"))
	return failed_files

# TODO: if you get failed files, create a function that returns
#       a dictionary of unique errors as keys and a list of asins as values
#       then sort the dictionary by the number of asins in the list
def get_errors_dict(failed_files: list) -> dict:
	"""
	Group the failed ASINs by their (shortened) error.

	:param failed_files: list of tuples (asin, error); error may be any value,
		it is converted to text before grouping
	:return: dictionary with the first 100 characters of each unique error text
		as keys and the list of asins that hit it as values
	"""
	errors_dict = {}
	for asin, error in failed_files:
		# The "error" entry of a file is not necessarily a string (it can be any
		# json value); slicing e.g. a dict raises, so normalise to text first.
		error_text = error if isinstance(error, str) else str(error)
		error_short = error_text[:100]
		errors_dict.setdefault(error_short, []).append(asin)
	return errors_dict

def get_errors_dict_sorted(errors_dict: dict) -> dict:
	"""
	Order the errors from most to least frequent.

	:param errors_dict: dictionary of unique errors as keys and a list of asins as values
	:return: new dictionary with the same entries, ordered by descending number of asins
		(entries with equal counts keep their original relative order)
	"""
	def asin_count(entry):
		# entry is an (error, asins) pair
		return len(entry[1])

	ranked = sorted(errors_dict.items(), key=asin_count, reverse=True)
	return dict(ranked)

def get_errors_dict_sorted_counts(errors_dict_sorted: dict) -> dict:
	"""
	Replace every list of asins by its length.

	:param errors_dict_sorted: sorted dictionary of unique errors as keys and a list of asins as values
	:return: dictionary with the same keys in the same order and the number of asins as values
	"""
	return {error: len(asins) for error, asins in errors_dict_sorted.items()}

# failed_files = get_failed_files(json_files)
# print(f"Got {len(failed_files)} failed files")
# if len(failed_files) > 0:
# 	errors_dict = get_errors_dict(failed_files)
# 	errors_dict_sorted = get_errors_dict_sorted(errors_dict)
# 	errors_dict_sorted_counts = get_errors_dict_sorted_counts(errors_dict_sorted)
# 	print(f"Number of unique errors: {len(errors_dict_sorted_counts)}")
# 	print(f"Errors: {json.dumps(errors_dict_sorted_counts, indent=2)}")

# For every domain: find the failed files, group them by error and store the
# results on the domain's entry for the summary further down.
print("Checking domains for failed files...")
for domain in domains:
	failed_files = get_failed_files(domains[domain]["json_files"]) # type: ignore
	domains[domain]["failed_files"] = failed_files # type: ignore
	print("")
	print(f"Domain {domain} got {len(failed_files)} failed files") # type: ignore
	# Build the groupings even when nothing failed (they are simply empty then),
	# so every domain carries both keys when they are read later on.
	errors_dict = get_errors_dict(failed_files)
	errors_dict_sorted = get_errors_dict_sorted(errors_dict)
	errors_dict_sorted_counts = get_errors_dict_sorted_counts(errors_dict_sorted)
	# Store the sorted dictionary under the "sorted" key, not the unsorted one.
	domains[domain]["errors_dict_sorted"] = errors_dict_sorted # type: ignore
	domains[domain]["errors_dict_sorted_counts"] = errors_dict_sorted_counts # type: ignore
	if len(failed_files) > 0:
		print(f"Domain {domain} errors ({len(errors_dict_sorted_counts)} unique errors): {json.dumps(errors_dict_sorted_counts, indent=2)}") # type: ignore
	print("")


In [None]:
def delete_failed_files(failed_files: list, base_path=None):
	"""
	Move the '<asin>.json' file of every failed product to the trash.

	:param failed_files: list of tuples (asin, error)
	:param base_path: directory that directly contains the '<asin>.json' files.
		Defaults to path_output_root. The json files checked in this notebook are
		collected from the per-domain sub-directories, so pass the directory of
		the domain the failed files belong to (e.g. domains[domain]["path"]).
		NOTE(review): files nested deeper than base_path are not found by this
		lookup - confirm the directory layout before relying on it.
	:return: None
	"""
	if base_path is None:
		# Resolved at call time so the default follows the current path_output_root.
		base_path = path_output_root
	for asin, _error in failed_files:
		filepath = f"{base_path}/{asin}.json"
		# Only used for the log messages below; presumably the destination that
		# move_to_trash will use - TODO confirm against src.py.utils.trash.
		trash_path_info = get_trash_path(filepath)
		print(f"Moving file '{filepath}' to trash '{trash_path_info}'")
		# A truthy return value is treated as the error description.
		err = move_to_trash(filepath)
		if err:
			print(f"  Error moving file '{filepath}' to trash '{trash_path_info}': {err}")
		else:
			print("  OK")


# WARNING: uncomment the line below to delete failed files
# delete_failed_files(failed_files)

In [None]:
def get_averages(json_files: list) -> Tuple[dict, dict]:
	'''
		Averages the token/timing statistics stored in the given json files.

		Files that cannot be read or parsed, miss one of the expected keys or hold a
		non-numeric value are reported and left out of the averages entirely.

		:param json_files: list of paths to json files
		:return: tuple (averages, failed_files); averages holds the mean of every
			statistic over the successfully processed files plus "requestsPerMinute"
			(average refillRate / average tokensConsumed); failed_files maps the path
			of every skipped file to the exception that was raised for it.
			With no successfully processed file all averages are 0, and
			"requestsPerMinute" is 0 whenever the average tokensConsumed is 0.
	'''
	# Example json file:
	# {
	# 	"tokensLeft": 3,
	# 	"refillRate": 5,
	# 	"tokenFlowReduction": 0.0,
	# 	"tokensConsumed": 2,
	# 	"processingTimeInMs": 1,
	# }
	averages = {
		"tokensLeft": 0.,  # should be around 5
		"refillRate": 0.,  # should be around 5 tokens per minute
		"tokenFlowReduction": 0.,  # should be 0
		"tokensConsumed": 0.,  # should be around 2 tokens per request
		"processingTimeInMs": 0.,  # should be around 500
	}
	metric_keys = list(averages.keys())
	successfully_processed_files = 0
	failed_files = {}
	for i, json_file in enumerate(json_files):
		print(f"Processing file {i+1}/{len(json_files)}: {json_file}", end="\r")
		try:
			with open(json_file, 'r') as f:
				data = json.load(f)
			# check if we have all keys first
			for key in metric_keys:
				if key not in data:
					raise Exception(f"Key '{key}' not in file '{json_file}'")
			# Validate every value before touching the running totals, so a bad
			# value in a later key cannot leave the earlier keys half-updated.
			values = {}
			for key in metric_keys:
				value = data[key]
				if not isinstance(value, (int, float)):
					raise Exception(f"Key '{key}' in file '{json_file}' is not a number: {value!r}")
				values[key] = value
			for key, value in values.items():
				averages[key] += value
			successfully_processed_files += 1
		except Exception as e:
			print(f"Error processing file '{json_file}': '{e}'")
			failed_files[json_file] = e
	# Without any processed file the totals are still 0; skip the division instead of raising.
	if successfully_processed_files > 0:
		for key in metric_keys:
			averages[key] /= successfully_processed_files
	# assuming we always hit token limits and use average token refill rate for minute
	if averages["tokensConsumed"] != 0:
		requests_per_minute = averages["refillRate"] / averages["tokensConsumed"]
	else:
		requests_per_minute = 0.
	averages["requestsPerMinute"] = requests_per_minute
	return averages, failed_files


# averages = get_averages(json_files)
# print(f"Averages: {json.dumps(averages, indent=2)}")

# Compute the averages of every domain and keep them, together with the files
# that had to be skipped, on the domain's entry.
for domain in domains:
	print(f"Getting averages for domain {domain}...")
	# Use a dedicated name for the skipped files: this is a dict of path -> exception,
	# while the global `failed_files` set by the failed-files check is a list of
	# (asin, error) tuples that delete_failed_files expects.
	averages, failed_files_averages = get_averages(domains[domain]["json_files"]) # type: ignore
	print("")
	domains[domain]["averages"] = averages # type: ignore
	domains[domain]["failed_files_averages"] = failed_files_averages # type: ignore
	print(f"Domain {domain} averages: {json.dumps(averages, indent=2)}") # type: ignore
	print("")

In [None]:
# Short per-domain recap: files skipped while averaging vs. all files, then the averages.
for domain, domain_info in domains.items():
	print(f"Domain {domain} summary:")
	skipped_count = len(domain_info['failed_files_averages'])
	file_count = len(domain_info['json_files'])
	print(f"{skipped_count}/{file_count} failed files")
	print(f"Domain {domain} averages: {json.dumps(domain_info['averages'], indent=2)}")
	print("")

In [None]:
def get_time_estimates(averages: dict,
                       product_count: int) -> Tuple[float, float]:
	'''
		Estimates how long fetching product_count products takes at the average request rate.

		:param averages: dictionary with a "requestsPerMinute" entry
		:param product_count: number of products (one request each)
		:return: tuple (seconds_needed, days_needed); (0.0, 0.0) when there is
			nothing to fetch, (inf, inf) when there are products to fetch but the
			request rate is not positive
	'''
	requests_per_minute = averages["requestsPerMinute"]
	if product_count == 0:
		# Nothing to fetch takes no time, whatever the rate is.
		return 0.0, 0.0
	if requests_per_minute <= 0:
		# Without a positive rate the fetch never finishes; avoid dividing by zero.
		return float("inf"), float("inf")
	seconds_needed = product_count / requests_per_minute * 60
	days_needed = seconds_needed / 60 / 60 / 24
	return seconds_needed, days_needed


# print(
#     f"Current speed: {averages['requestsPerMinute']} requests per minute ({averages['requestsPerMinute'] * 60 * 24} per day)"
# )
# print()
# # products_count = 13884 * 3 # assuming we have 3 domains - US, UK, DE
# # product_count equals to the number of products in the parsed-products.json file
# parsed_products = {}
# with open(path_parsed_products, 'r') as f:
# 	parsed_products = json.load(f)
# products_count = len(parsed_products)
# domains = set(["US", "UK", "DE"])
# products_count = products_count * len(domains)
# products_left = products_count - len(json_files)
# print(f"Products total: {products_count}")
# seconds_total, _ = get_time_estimates(averages, products_count)
# print()
# print(f"Products left: {products_left}")
# seconds_left, _ = get_time_estimates(averages, products_left)
# print()
# print(f"Progress: {len(json_files) / products_count * 100:.2f}%")

# Estimate, per domain, how long fetching its files took at the measured request rate.
for domain, domain_info in domains.items():
	domain_averages = domain_info["averages"]
	domain_file_count = len(domain_info["json_files"])
	seconds_total, days_total = get_time_estimates(domain_averages, domain_file_count)
	domain_info["seconds_total"] = seconds_total # type: ignore
	domain_info["days_total"] = days_total # type: ignore
	print(f"Domain {domain} took {seconds_total:.2f} seconds ({days_total:.2f} days) to fetch data for {domain_file_count} products")
	print("")

In [None]:
def pretty_print_averages(averages: dict):
	"""
	Print every average with two decimals next to the value it is expected to have.

	:param averages: dictionary holding the keys listed below
	:return: None
	"""
	expectations = (
		("tokensLeft", "should be around 5"),
		("refillRate", "should be around 5 tokens per minute"),
		("tokenFlowReduction", "should be 0"),
		("tokensConsumed", "should be around 1 - 2 tokens per request"),
		("processingTimeInMs", "should be around 500"),
		("requestsPerMinute", "should be around 2 - 4 requests per minute"),
	)
	for metric, expectation in expectations:
		print(f" - {metric}: {averages[metric]:.2f} ({expectation})")

# Running totals over all domains.
summary_seconds_total = 0
summary_days_total = 0
summary_count_total = 0
summary_success_total = 0

# Summary for all domains (entire process)
for domain in domains:
	# A directory name without an entry in domains_map still gets a readable label.
	if domain in domains_map:
		domain_name = f"amazon.{domains_map[domain]}"
	else:
		domain_name = f"unknown domain {domain}"
	print(f"Domain '{domain_name}' (domain_id = '{domain}') summary:")
	products_total = len(domains[domain]["json_files"]) # type: ignore
	products_failed = len(domains[domain]["failed_files"]) # type: ignore
	products_success_count = products_total - products_failed
	# A domain without any file has no meaningful rate; report 0 % instead of dividing by zero.
	if products_total > 0:
		success_rate = products_success_count / products_total * 100
		failure_rate = 100 - success_rate
	else:
		success_rate = 0.0
		failure_rate = 0.0
	print(f" - fetched data for {products_success_count}/{products_total} products ({success_rate:.3f} %)")
	# The error counts may be absent for a domain without failed files.
	errors_unique = domains[domain].get("errors_dict_sorted_counts", {}) # type: ignore
	print(f" - {products_failed} invalid results ({failure_rate:.3f} %) with {len(errors_unique)} unique errors")
	if errors_unique:
		# The counts are built from the dictionary sorted by descending number of
		# asins, so the first key is the most frequent error.
		most_common_error = next(iter(errors_unique))
		print(f" - most common error: '{most_common_error[:50]}...' with {errors_unique[most_common_error]} products")
	print(f"'{domain_name}' averages:")
	pretty_print_averages(domains[domain]["averages"]) # type: ignore
	seconds_total = domains[domain]["seconds_total"] # type: ignore
	days_total = domains[domain]["days_total"] # type: ignore
	summary_seconds_total += seconds_total # type: ignore
	summary_days_total += days_total # type: ignore
	summary_count_total += products_total
	summary_success_total += products_success_count
	print(f" - took {seconds_total:.2f} seconds ({days_total:.2f} days) to fetch data for {products_total} products")
	print("")
print("")
# Same guard for the grand total: no files at all means there is no rate to compute.
if summary_count_total > 0:
	summary_success_rate = summary_success_total / summary_count_total * 100
else:
	summary_success_rate = 0.0
print(f"Managed to fetch data for {summary_success_total}/{summary_count_total} products ({summary_success_rate:.3f} %)")
print("")
print(f"The entire process took {summary_seconds_total:.2f} seconds ({summary_days_total:.2f} days) non-stop fetching")


In [None]:
# Back-of-the-envelope estimate of the time needed to fetch all products,
# based on assumed (not measured) token figures.
tokens_per_request = 1.5
refill_rate = 5
requests_per_minute = refill_rate / tokens_per_request
requests_per_day = requests_per_minute * 60 * 24
print(f"Requests per minute: {requests_per_minute}")
print(f"Requests per day: {requests_per_day}")
# Domain "1" serves as the reference for the number of products in a single domain.
reference_domain = domains["1"]
files_count = len(reference_domain["json_files"]) # type: ignore
time_to_fetch_all_products = files_count / requests_per_day
print(f"Time to fetch all products for a single domain ({files_count}): {time_to_fetch_all_products} days")
# Every domain is assumed to hold as many products as the reference domain.
domain_count = len(domains)
time_to_fetch_all_domains = time_to_fetch_all_products * domain_count
print(f"Time to fetch products for all domains ({domain_count}): {time_to_fetch_all_domains} days")