# Tool for analyzing and processing the generated site index file

The last step converts the limited index object to a set of tasks that will be executed by the scheduler.

In [None]:
# imports
import json
import copy
import time
from typing import Optional

In [None]:
# Constants and globals
# Path of the scraped index JSON; the task file paths written later are derived from it
filepath_index = "data/scraped/pangoly/index.json"
print(f"Loading index from {filepath_index}")

In [None]:
# Load index
# Read the index through a context manager so the file handle is closed
# right after parsing instead of being left open until garbage collection
with open(filepath_index) as index_file:
	index = json.load(index_file)

In [None]:
# Gets the dictionary with categories count, number of skipped categories and number of products in each category
def get_categories_info(index_obj: dict):
	'''
		Summarises the categories of an index object
		Returns a dict with the keys "categories_count", "categories_skipped_count",
		"categories_products_count" (category key -> number of products, walked categories only)
		and "products_total_count"
	'''
	all_categories = index_obj["categories"]
	# A category whose products entry is None was not walked, so it gets no per-category count
	per_category = {
		key: len(entry["products"])
		for key, entry in all_categories.items()
		if entry["products"] is not None
	}
	return {
		"categories_count": len(all_categories),
		# Every category is either walked (counted above) or skipped
		"categories_skipped_count": len(all_categories) - len(per_category),
		"categories_products_count": per_category,
		"products_total_count": sum(per_category.values()),
	}

# Print number of products in each category
def print_category_info(index_obj: dict, include_skipped: bool = False):
	'''
		Prints the total number of indexed products, the number of categories and,
		for every walked category, its product count and its share of the total
		Skipped categories (products is None) are listed only if include_skipped is True
	'''
	categories_info = get_categories_info(index_obj)
	products_total = categories_info["products_total_count"]
	print(f"Products indexed: {products_total}")
	print(f"Categories: {categories_info['categories_count']} ({categories_info['categories_skipped_count']} of those are skipped)")
	for category_key, category in index_obj["categories"].items():
		products = category["products"]
		if products is None:
			# None means we didn't walk this category
			if include_skipped:
				print(f" - {category_key} ({category['name']}): (skipped)")
			continue
		# Walked categories can all be empty; the total is then 0 and dividing by it would raise
		share = round(len(products) / products_total * 100, 2) if products_total else 0.0
		print(f" - {category_key} ({category['name']}): {len(products)} ({share}%)")



# Show the breakdown for the full index, listing skipped categories as well
print_category_info(index, include_skipped=True)

In [None]:
# Get non-destructive deep copy of the index object with a limited list of products for each category (pick first n)
def limit_categories(index_obj: dict, limit: int) -> dict:
	'''
		Returns a deep copy of index_obj in which every walked category keeps only
		its first `limit` products; the object passed in is left untouched
	'''
	trimmed = copy.deepcopy(index_obj)
	for entry in trimmed["categories"].values():
		all_products = entry["products"]
		if all_products is None:
			# Category was not walked, nothing to trim
			continue
		kept_items = list(all_products.items())[:limit]
		entry["products"] = dict(kept_items)
	return trimmed
	
# Maximum number of products kept per category in the limited index
limit = 500
print(f"Limiting categories to {limit} products each")
index_limited = limit_categories(index, limit)
print_category_info(index_limited)

In [None]:
# Get rough estimates of scraping time for the whole index depending on different parameters
# Each product is scraped by a task that the scheduler assigns to a separate worker process
def print_scraping_estimates(index_obj: dict, seconds_per_task: float, max_workers: int):
	'''
		Prints a rough estimate of how long scraping every product of index_obj takes,
		once sequentially and once divided evenly by max_workers
	'''
	product_count = get_categories_info(index_obj)["products_total_count"]
	sequential_seconds = product_count * seconds_per_task
	parallel_seconds = sequential_seconds / max_workers

	def as_minutes(seconds: float) -> float:
		return round(seconds / 60, 2)

	def as_hours(seconds: float) -> float:
		return round(seconds / 60 / 60, 2)

	print(f"Scraping estimates for {product_count} products:")
	print(f" - sequential: {as_minutes(sequential_seconds)} minutes ({as_hours(sequential_seconds)} hours)")
	print(f" - parallel ({max_workers} workers): {as_minutes(parallel_seconds)} minutes ({as_hours(parallel_seconds)} hours)")

# Assumed duration of one scraping task (in seconds) and number of parallel workers
seconds_per_task = 65.7
max_workers = 30
print(f"Scraping estimates for {seconds_per_task} seconds per task and {max_workers} workers:")
# Same report for the full index and for the limited one
for estimate_label, estimate_index in (("index", index), ("index_limited", index_limited)):
	print(f"{estimate_label}:")
	print_scraping_estimates(estimate_index, seconds_per_task, max_workers)


In [None]:
# Converts index object to a list of tasks that can be consumed by the scheduler
# Example of a task:
# {
# 	"metadata": {
# 		"handle": "handle_1",
# 		"retries": 1,
# 		"can_handle_output": false,
# 		"can_handle_error": false
# 	},
# 	"args": [
# 		"python",
# 		"src/py/utils/scheduler/dummy_task.py"
# 	]
# }

def convert_to_tasks(index_obj: dict) -> list:
	'''
		Builds one scheduler task per product of every walked category
		The task handle is "<category key>/<product key>"; the product key is also
		passed as the last argument of the scraper worker command
	'''
	def build_task(category_key, product_key) -> dict:
		return {
			"metadata": {
				"handle": f"{category_key}/{product_key}",
				"retries": 1,
				"can_handle_output": False,
				"can_handle_error": False
			},
			"args": [
				"python",
				"src/py/scraping/pangoly/scraper_worker.py",
				product_key
			]
		}

	# Categories whose products entry is None were not walked and produce no tasks
	return [
		build_task(category_key, product_key)
		for category_key, category in index_obj["categories"].items()
		if category["products"] is not None
		for product_key in category["products"]
	]

tasks = convert_to_tasks(index_limited)
print(f"Converted index to {len(tasks)} tasks")
# An index without any walked products yields no tasks; only show an example when one exists
if tasks:
	print(f"Example task:")
	print(json.dumps(tasks[0], indent=2))

# Save tasks to file (same filepath as index but with .tasks.json extension)
filepath_tasks = filepath_index.replace(".json", ".tasks.json")
print(f"Saving tasks to {filepath_tasks}")
time_start = time.time()
with open(filepath_tasks, "w") as f:
	f.write(json.dumps(tasks, indent=2))
time_end = time.time()
print(f"Saved tasks in {round(time_end - time_start, 2)} seconds")
print(f"Done")

In [None]:
# TODO: make a parameter that describes number of repetitions for category {"cpu": 2, "vga": 1} means take 2 cpus and 1 vga per loop
def convert_to_tasks_round_robin(index_obj: dict, allowed_categories: list, total_limit: int = -1, task_dict: Optional[dict] = None) -> dict:
	'''
		Converts index object to a dict of tasks for allowed categories in a round-robin fashion
		Keys are task handles, values are tasks - for easier merging

		index_obj is only read, it is never modified
		allowed_categories lists the category keys to take products from in turn;
		a category that was not walked (products is None) contributes nothing
		total_limit is the maximum number of tasks added on top of those already
		present in task_dict, -1 means no limit
		task_dict (optional) is extended in place and returned; products whose handle
		it already contains are not added again
		Raises KeyError if an allowed category is missing from the index
	'''
	tasks = {} if task_dict is None else task_dict
	# Resolve the limit to an absolute dict size once. "No limit" is kept apart from the
	# number so that the -1 sentinel is never shifted by the size of task_dict.
	max_size = None if total_limit == -1 else total_limit + len(tasks)

	def limit_reached() -> bool:
		return max_size is not None and len(tasks) >= max_size

	# Per category: an iterator over its product keys, last key first (the order in which
	# dict.popitem() would hand them out). Only keys are needed, so nothing is copied deeply.
	pending = {}
	for allowed_key in allowed_categories:
		products = index_obj["categories"][allowed_key]["products"]
		if products is not None:
			pending[allowed_key] = iter(reversed(list(products)))

	exhausted = object()
	while pending and not limit_reached():
		for category_key, product_keys in pending.items():
			if limit_reached():
				break
			product_key = next(product_keys, exhausted)
			if product_key is exhausted:
				# Drop the empty category and restart the round:
				# a dict must not change size while it is being iterated
				del pending[category_key]
				break
			handle = f"{category_key}/{product_key}"
			if handle in tasks:
				# Already scheduled by an earlier call, the product is consumed without adding a task
				continue
			tasks[handle] = {
				"metadata": {
					"handle": handle,
					"retries": 1,
					"can_handle_output": False,
					"can_handle_error": False
				},
				"args": [
					"python",
					"src/py/scraping/pangoly/scraper_worker.py",
					product_key
				]
			}
	return tasks

def split_list(lst: list, n: int) -> list:
	'''
		Cuts lst into n consecutive slices whose lengths differ by at most one
		The longer slices come first; slices are empty when n exceeds len(lst)
	'''
	base, extra = divmod(len(lst), n)
	chunks = []
	start = 0
	for position in range(n):
		# The first `extra` slices take one element more to absorb the remainder
		width = base + 1 if position < extra else base
		chunks.append(lst[start:start + width])
		start += width
	return chunks

# l = split_list([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3) # produces [[1, 2, 3, 4], [5, 6, 7], [8, 9, 10]]

def save_splits(splits: list):
	'''
		Writes every split to its own JSON file
		The file path is derived from filepath_index by replacing ".json"
		with "-rr-split_<position>.tasks.json"
	'''
	for position, chunk in enumerate(splits):
		target_path = filepath_index.replace(".json", f"-rr-split_{position}.tasks.json")
		print(f"Saving split '{position}' to '{target_path}'")
		with open(target_path, "w") as out_file:
			serialized = json.dumps(chunk, indent=2)
			out_file.write(serialized)
		print(f"Done")



In [None]:

# task_dict = convert_to_tasks_round_robin(index_limited, ["cpu", "vga"], 4)
# task_dict = convert_to_tasks_round_robin(index_limited, ["ram", "ssd"], 4, task_dict)
# task_dict = convert_to_tasks_round_robin(index_limited, ["cpu", "vga"], 4, task_dict)

# Dictionary in steps: each call extends the dict from the previous one,
# so the category pairs end up interleaved in batches
task_dict = convert_to_tasks_round_robin(index_limited, ["cpu", "vga"], 200)
task_dict = convert_to_tasks_round_robin(index_limited, ["ram", "ssd"], 200, task_dict)
task_dict = convert_to_tasks_round_robin(index_limited, ["cpu", "vga"], 800, task_dict)
task_dict = convert_to_tasks_round_robin(index_limited, ["ram", "ssd"], 800, task_dict)

# print the first 11 handles as a sample
print("Task handles:")
for i, task in enumerate(task_dict.values()):
	print(f" * {task['metadata']['handle']}")
	if i >= 10:
		break

# print counts by category (the category is the part of the handle before the first "/")
print("Task counts by category:")
category_counts = {}
for handle in task_dict:
	category = handle.split("/")[0]
	category_counts[category] = category_counts.get(category, 0) + 1
for category, count in category_counts.items():
	print(f" * {category}: {count}")

# convert to tasklist
task_list = list(task_dict.values())
print(f"Converted index to {len(task_list)} tasks (round-robin)")
# The selected categories may hold no products at all; only show an example when one exists
if task_list:
	print(f"Example task:")
	print(json.dumps(task_list[0], indent=2))

# Save tasks to file (same filepath as index but with -rr.tasks.json extension)
filepath_tasks = filepath_index.replace(".json", "-rr.tasks.json")
print(f"Saving tasks to {filepath_tasks}")
time_start = time.time()
with open(filepath_tasks, "w") as f:
	f.write(json.dumps(task_list, indent=2))
time_end = time.time()
print(f"Saved tasks in {round(time_end - time_start, 2)} seconds")

# Split tasks into 3 files (for multiple devices to run in parallel)
splits = split_list(task_list, 3)
print("Split sizes (tasks per split):")
for i, split in enumerate(splits):
	print(f" * {i}: {len(split)}")
print("Saving splits...")
save_splits(splits)
print("Done")