In [None]:
import os
import sys
import json
import time
import pandas as pd
import numpy as np
from scipy import stats
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from plotly import express as px
from scipy import stats


# Project imports
sys.path.append(os.getcwd())
# from src.py.utils.generic_utils import wrapper
from src.py.scraping.keepa.keepa_analysis_utils import load_result_object, parse_csv, organize_csv, discretize_csv_smart, get_trends, get_timeseries_from_trends, remove_outliers, remove_outliers_csv

# Finance utils
from src.py.analysis.yahoo.stocks.finance_df_utils import add_vline_annotation
# Events
from src.py.analysis.events import events
# CPI
import importlib
sys.path.append(os.getcwd())
cpi_adjust = importlib.import_module("src.py.scraping.world-bank.cpi_adjust")
cpi_adjust.initialize_cpi(date_cutoff="2016-08-01", jagged=False) # jagged=True)

In [None]:
# NOTE: Intermediate DataFrames are not released between cells, so the VM needs roughly 12 GB of RAM for the last visualization to run

In [None]:
# Variables
path_category_index = "data/keepa/generated/categories-domain-1.json"
path_output_root = "data/keepa/generated/plots"
path_csv_root = "data/keepa/generated/csv"

# Create the plot and csv output folders up front if they are missing
for _required_dir in (path_output_root, path_csv_root):
	if not os.path.exists(_required_dir):
		os.makedirs(_required_dir)

# Domain id -> storefront host name (shown in figure titles)
domain_name_map = {"1": "amazon.com", "2": "amazon.co.uk", "3": "amazon.de", "5": "amazon.co.jp"}

# Domain id -> country code handed to the CPI adjustment
domain_country_map = {"1": "USA", "2": "GBR", "3": "DEU", "5": "JPN"}

# We use the same product index for all domains
def get_index_categories(domain: str) -> dict:
	'''
		Loads the category index and retargets every product at the chosen domain.

		The index file (path_category_index) stores product paths for domain 1; each
		"file" entry has "domains/1" replaced with "domains/{domain}", and every product
		object additionally gets a "category" key holding its category name.

		Returns the index as a dictionary: category -> asin -> product object.
	'''
	# Context manager so the file handle is closed as soon as the JSON is parsed
	with open(path_category_index) as index_file:
		index_categories = json.load(index_file)
	for category, category_products in index_categories.items():
		for product in category_products.values():
			# Use domain 1 as base and replace with chosen domain
			product["file"] = product["file"].replace("domains/1", f"domains/{domain}")
			# Add category to product object
			product["category"] = category
	return index_categories

def verify_products(index_categories: dict) -> dict:
	'''
		Flattens the category index into a single asin -> product dictionary.

		Raises an Exception (after printing the same message) as soon as an asin
		shows up a second time, whether in the same or in another category.
	'''
	seen_products = {}
	for category_products in index_categories.values():
		for asin in category_products:
			if asin not in seen_products:
				seen_products[asin] = category_products[asin]
				continue
			message = f"Duplicate product found: {asin}"
			print(message)
			raise Exception(message)
	print(f"Total unique products in domain: {len(seen_products)}")
	return seen_products

def get_product(product_filepath: str) -> dict:
	'''
		Loads a product file and returns a product.

		The file is expected to be JSON with a "products" list; the first entry is
		returned with its "csv" field passed through parse_csv, organize_csv and
		discretize_csv_smart (in that order).
	'''
	# Context manager so the file handle is closed as soon as the JSON is parsed
	with open(product_filepath) as product_file:
		result_object = json.load(product_file)
	product = result_object["products"][0]
	product["csv"] = parse_csv(product["csv"])
	product["csv"] = organize_csv(product["csv"])
	product["csv"] = discretize_csv_smart(product["csv"])
	return product

def load_products(category: str, index_categories: dict) -> dict:
	'''
		Loads all products in a category.

		Loading is best effort: a product whose file cannot be loaded (e.g. because the
		file is missing for this domain) is skipped, and the number of skipped products
		is printed once at the end instead of being dropped silently.

		Returns a dictionary asin -> loaded product.
	'''
	products = {}
	failed_asins = []
	category_index = index_categories[category]
	total = len(category_index)
	for i, (asin, product) in enumerate(category_index.items()):
		print(f"Loading product {i+1}/{total} ({asin})      ", end="\r")
		try: # ensures that we can skip products that fail to load (e.g. due to missing files across domains)
			products[asin] = get_product(product["file"])
		except Exception:
			failed_asins.append(asin)
	print("")
	if failed_asins:
		print(f"Skipped {len(failed_asins)}/{total} products that failed to load")
	return products

def csv_to_df(csv: dict, domain: str) -> pd.DataFrame:
	'''
		Converts a csv object to a pandas DataFrame.

		Only the price series (AMAZON, NEW, USED, EBAY_NEW_SHIPPING, EBAY_USED_SHIPPING)
		and the count series (COUNT_NEW, COUNT_USED) are kept. Each entry of csv is read
		as csv[name][0] = values and csv[name][1] = matching timestamps. Per series:
		  1. entries equal to -1 are dropped (treated as "no data"),
		  2. price series are rescaled so that their mean is 100,
		  3. points with |z-score| > 2 are removed,
		  4. the series is aligned to a daily index from 2016-08-01 to 2024-01-01,
		  5. price series are passed through cpi_adjust.adjust_for_inflation,
		  6. days without a value are dropped.
		A series that fails any step is skipped (best effort).

		Returns the series joined column-wise on the date index, or an empty DataFrame
		when no series could be converted.
	'''
	kept_keys = {"AMAZON", "NEW", "USED", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING", "COUNT_NEW", "COUNT_USED"}#, "COUNT_REVIEWS"}
	threshold = 2 # maximum |z-score| that is kept
	dates = pd.date_range(start="2016-08-01", end="2024-01-01", freq="D")
	individual_dfs = []
	for timeseries_name, timeseries in csv.items():
		if timeseries_name not in kept_keys:
			continue
		is_count = "count" in timeseries_name.lower()
		try:
			values = np.asarray(timeseries[0])
			times = timeseries[1]

			# Drop the -1 markers BEFORE any statistics. Doing it after the normalisation
			# (as before) never matched for price series, and the -1 entries skewed the
			# mean and the z-scores.
			valid = values != -1
			values = values[valid]
			times = times[valid]

			# If it's not a count, divide by the mean value - this is a substitute for
			# calculating an index near a date
			if not is_count:
				if len(values) == 0:
					continue
				mean_value = np.mean(values)
				if mean_value == 0 or np.isnan(mean_value):
					continue
				values = values / mean_value * 100

			# Remove outliers using stats.zscore
			# TODO: impute instead of remove?
			outliers = np.abs(stats.zscore(values)) > threshold
			values = values[~outliers]
			times = times[~outliers]

			# One-column frame aligned to the full daily range (missing days become NaN)
			df = pd.DataFrame(pd.Series(values, index=times, name=timeseries_name)).reindex(dates)

			# Adjust for CPI if not a count
			if not is_count:
				df = cpi_adjust.adjust_for_inflation(df, domain_country_map[domain], columns=[timeseries_name])

			# Keep only the days that actually have a value
			individual_dfs.append(df.dropna())
		except Exception:
			# Best effort: a malformed series must not discard the rest of the product
			continue

	# pd.concat raises on an empty list; an empty frame lets the caller count the
	# product as "no valid data"
	if not individual_dfs:
		return pd.DataFrame()

	# Concatenate all the individual dataframes on index
	product_df = pd.concat(individual_dfs, axis=1)

	# Cut everything before 2016-08-01
	return product_df.loc["2016-08-01":]

def add_product_info_to_df(product_df: pd.DataFrame, product: dict) -> pd.DataFrame:
	'''
		Prepends the product's "asin" and "category" as the first two columns.

		The DataFrame is modified in place and returned. If it already has an "asin"
		column it is assumed to be tagged and is returned untouched.
	'''
	if "asin" not in product_df.columns:
		# Insert at positions 0 and 1 so the identifiers lead the frame
		for position, key in enumerate(("asin", "category")):
			product_df.insert(position, key, product[key])
	return product_df

# Merge all products into one DataFrame keeping all rows
def merge_category_dfs(product_dfs: list[pd.DataFrame]) -> pd.DataFrame:
	'''
		Merges all product DataFrames into one DataFrame, keeping all rows.

		Each frame's index is moved into a leading "time" column and the frames are
		stacked under a fresh 0..n-1 index. The input frames are left untouched, so the
		function can be called again on the same list (the previous in-place version
		raised on a second call because "time" already existed).

		An empty list yields an empty DataFrame with the columns "time", "asin" and
		"category" instead of raising.
	'''
	if len(product_dfs) == 0:
		return pd.DataFrame(columns=["time", "asin", "category"])
	frames = []
	for product_df in product_dfs:
		# reset_index returns a new frame, so the caller's DataFrame is not modified
		frame = product_df.reset_index(drop=True)
		frame.insert(0, "time", product_df.index)
		frames.append(frame)
	# Merge all product dfs into one with a fresh index
	return pd.concat(frames, axis=0, ignore_index=True)

def get_category_product_dfs(category_name: str, index_categories: dict, domain_products: dict, domain: str) -> list[pd.DataFrame]:
	'''
		Loads every product of a category and converts each one to a DataFrame.

		Products whose conversion raises, or whose DataFrame ends up with no rows, are
		left out; the final message reports how many made it.
	'''
	products = load_products(category_name, index_categories)
	total = len(products)
	product_dfs, fails = [], []
	for position, product in enumerate(products.values(), start=1):
		asin = product["asin"]
		print(f"Processing product {position}/{total} ({asin})      ", end="\r")
		try:
			product_df = add_product_info_to_df(csv_to_df(product["csv"], domain), domain_products[asin])
		except Exception:
			fails.append(asin)
			continue
		# some products have no valid data
		if len(product_df) == 0:
			fails.append(asin)
		else:
			product_dfs.append(product_df)
	print("")
	print(f"Successfully loaded {len(product_dfs)}/{total} product dfs")
	return product_dfs

def get_category_product_dfs_merged(category_name: str, index_categories: dict, domain_products: dict, domain: str) -> pd.DataFrame:
	'''
		Builds the single merged DataFrame of a category.

		Thin wrapper: converts the category's products with get_category_product_dfs,
		stacks them with merge_category_dfs and prints progress along the way.
	'''
	print(f"Loading product dfs for category '{category_name}'")
	dfs_to_merge = get_category_product_dfs(category_name, index_categories, domain_products, domain)

	print(f"Merging {len(dfs_to_merge)} product dfs")
	category_df = merge_category_dfs(dfs_to_merge)

	print(f"Done! Merged {len(category_df)} rows")
	return category_df

def _smooth_columns(df_ts: pd.DataFrame, columns: list, window: int) -> None:
	'''
		Replaces each given column of df_ts (in place) with its moving average.

		Uses np.convolve(mode="same") with a uniform kernel, as before. The window is
		capped at the number of rows because mode="same" returns max(rows, window)
		samples, which could not be assigned back for frames shorter than the window.
	'''
	window = min(window, len(df_ts))
	if window == 0:
		return
	kernel = np.ones(window) / window
	for column in columns:
		df_ts[column] = np.convolve(df_ts[column].values, kernel, mode="same") # type: ignore

def get_aggregate_timeseries_df(merged_df: pd.DataFrame) -> pd.DataFrame:
	'''
		Aggregates a merged DataFrame into one smoothed daily time series per column.

		Expects the columns "time", "asin" and "category"; any of the seven value
		columns that is absent is added as NaN. Rows are grouped by "time": price
		columns (NEW, USED, AMAZON, EBAY_NEW_SHIPPING, EBAY_USED_SHIPPING) are averaged,
		count columns (COUNT_NEW, COUNT_USED) are summed. The input is not modified.

		Returns a DataFrame indexed by day from 2017-01-01 to 2023-10-01 (inclusive).
	'''
	price_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING"]
	count_columns = ["COUNT_NEW", "COUNT_USED"]
	value_columns = price_columns + count_columns

	# drop() returns a new frame, so everything below works on a private copy
	values_df = merged_df.drop(columns=["asin", "category"])

	# Make sure every aggregated column exists (not every category/domain has all of them)
	for column in value_columns:
		if column not in values_df.columns:
			values_df[column] = np.nan

	# Group by time: prices by mean, counts by sum
	aggregations = {column: "mean" for column in price_columns}
	aggregations.update({column: "sum" for column in count_columns})
	df_ts = values_df.groupby("time").agg(aggregations)

	# Time-weighted interpolation of NaN values to eliminate any small gaps
	df_ts = df_ts.interpolate(method="time")

	# First pass: ~3 month moving average on the price columns only
	_smooth_columns(df_ts, price_columns, 30 * 3)
	# Second pass: ~1 month moving average on every column (counts included)
	_smooth_columns(df_ts, value_columns, 30)

	# Add NaN for missing dates from 2016-08-01 to 2024-01-01
	# Some merged dfs start at 2020 and we want to preserve scale
	dates = pd.date_range(start="2016-08-01", end="2024-01-01", freq="D")
	df_ts = df_ts.reindex(dates).interpolate(method="time")

	# Keep 2017-01-01 .. 2023-10-01 only
	in_range = (df_ts.index >= "2017-01-01") & (df_ts.index <= "2023-10-01")
	return df_ts[in_range]

def get_fig(df: pd.DataFrame, title: str) -> go.Figure:
	'''
		Builds the two-row plotly figure for an aggregated time series DataFrame.

		Top row (80% of the height): one line per price column. Bottom row: stacked
		bars for COUNT_NEW and COUNT_USED. Both rows share the x axis, and every entry
		of the imported events list is added through add_vline_annotation.
	'''
	palette = px.colors.qualitative.Plotly
	price_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING"]
	count_columns = ["COUNT_NEW", "COUNT_USED"]

	# First row are prices / indices, second row are counts
	fig = make_subplots(rows=2, cols=1, row_heights=[0.8, 0.2], shared_xaxes=True, vertical_spacing=0.02)

	# Price lines
	for color, column in zip(palette, price_columns):
		line_trace = go.Scatter(x=df.index, y=df[column], mode="lines", name=column, line=dict(color=color))
		fig.add_trace(line_trace, row=1, col=1)

	# Count bars (the palette restarts, so each count bar shares its colour with a price line)
	counts = df[count_columns]
	for color, column in zip(palette, count_columns):
		bar_trace = go.Bar(x=counts.index, y=counts[column], name=column, marker=dict(color=color))
		fig.add_trace(bar_trace, row=2, col=1)

	# Borderless bars in the count row
	fig.update_traces(marker_line_width=0, row=2, col=1)

	# Layout: gapless stacked bars, title, fixed size, legend in trace order
	fig.update_layout(
		bargap=0,
		barmode="stack",
		title=title,
		xaxis_title="Time",
		yaxis_title="Price",
		width=1280,
		height=720,
		legend=dict(traceorder="normal"),
	)

	# Axes: grid everywhere, x title only under the bottom row
	fig.update_xaxes(showgrid=True)
	fig.update_xaxes(title_text="Time", row=2, col=1)
	fig.update_yaxes(title_text="Price trends", row=1, col=1)
	fig.update_yaxes(title_text="Count listings", row=2, col=1)
	fig.update_xaxes(title_text="", row=1, col=1)

	# Add events
	for event in events:
		fig = add_vline_annotation(fig, event, textangle=-20)

	# Set margins
	fig.update_layout(margin=dict(r=0, t=60, b=0, l=0))
	return fig

def process_domain(domain: str, can_overwrite: bool) -> dict:
	'''
		Builds the merged DataFrame of every category of one domain.

		For each category, the file "domain-{domain}-{category}.csv" under path_csv_root
		acts as a cache: when it exists and can_overwrite is False it is read back
		(with "time" parsed to datetime), otherwise the category is processed from the
		product files and the csv is (re)written.

		Returns a dictionary of merged DataFrames for each category with the category name as the key.
	'''
	print(f"Processing domain 'id={domain}'")
	# Load index categories
	index_categories = get_index_categories(domain)
	domain_products = verify_products(index_categories)
	total_categories = len(index_categories)

	merged_dfs = {}
	for position, category in enumerate(index_categories, start=1):
		filepath = os.path.join(path_csv_root, f"domain-{domain}-{category}.csv")
		if can_overwrite or not os.path.exists(filepath):
			# No usable cache: compute from the product files and save to csv
			print(f"Processing category '{category}' ({position}/{total_categories})")
			merged_df = get_category_product_dfs_merged(category, index_categories, domain_products, domain)
			merged_dfs[category] = merged_df
			merged_df.to_csv(filepath)
		else:
			print(f"Loading existing file for category '{category}' ({position}/{total_categories})")
			merged_df = pd.read_csv(filepath, index_col=0)
			merged_df["time"] = pd.to_datetime(merged_df["time"])
			merged_dfs[category] = merged_df
	print(f"Done processing domain 'id={domain}'")
	print("")
	return merged_dfs


In [None]:
# Build the merged per-category DataFrames of every selected domain

# Variables
can_overwrite = False # True to refresh, False to use cached data
domains_to_process = ["1", "2", "3", "5"]
# domains_to_process = ["3"] # debug purposes
merged_category_dfs_domains = {} # domain id -> {category name -> merged DataFrame}

# Process the domains one after another and time the whole run
time_start = time.time()
for domain in domains_to_process:
	merged_dfs = process_domain(domain, can_overwrite)
	merged_category_dfs_domains[domain] = merged_dfs
time_end = time.time()

print(f"Processed {len(domains_to_process)} domains in {time_end - time_start:.2f} seconds")


In [None]:
def get_row_counts(merged_category_dfs_domains: dict) -> pd.DataFrame:
	'''
		Returns a DataFrame with the row counts for each category in each domain.

		Rows are the categories of the first domain (sorted) plus a final "Total" row,
		columns are "Category" followed by one column per domain (named through
		domain_name_map). Counts are formatted as text with thousands separators.
	'''
	sorted_category_names = sorted(merged_category_dfs_domains[list(merged_category_dfs_domains.keys())[0]].keys())
	# Build every column first and create the frame once instead of growing it cell by cell
	columns = {}
	for domain, merged_category_dfs in merged_category_dfs_domains.items():
		columns[domain_name_map[domain]] = [len(merged_category_dfs[category]) for category in sorted_category_names]
	df = pd.DataFrame(columns, index=sorted_category_names)
	# add total row (sum of each column)
	df.loc["Total"] = df.sum()
	# convert to int
	df = df.astype(int)
	# convert numbers to text, separated by commas for readability
	# (DataFrame.applymap is deprecated since pandas 2.1; Series.map per column works on every version)
	df = df.apply(lambda column: column.map(lambda x: f"{x:,}"))
	# add Category column to start and reset index
	df = df.reset_index()
	df = df.rename(columns={"index": "Category"})
	return df

# def get_datapoint_counts(merged_category_dfs_domains: dict) -> pd.DataFrame:
# 	'''
# 		Returns a DataFrame with the row counts for each category in each domain.
# 	'''
# 	# set df with columns as domains and rows as categories (category names as index)
# 	df = pd.DataFrame()
# 	sorted_category_names = sorted(merged_category_dfs_domains[list(merged_category_dfs_domains.keys())[0]].keys())
# 	for domain, merged_category_dfs in merged_category_dfs_domains.items():
# 		for category in sorted_category_names:
# 			merged_df = merged_category_dfs[category]
# 			# df.loc[category, domain_name_map[domain]] = len(merged_df)
# 			# get counts of non-Nan values for each column
# 			counts = merged_df.count()
# 			# add to df
# 			if "NEW" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (new)"] = counts["NEW"]
# 			if "USED" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (used)"] = counts["USED"]
# 			if "AMAZON" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (Amazon)"] = counts["AMAZON"]
# 			if "EBAY_NEW_SHIPPING" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (eBay new)"] = counts["EBAY_NEW_SHIPPING"]
# 			if "EBAY_USED_SHIPPING" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (eBay used)"] = counts["EBAY_USED_SHIPPING"]
# 			if "COUNT_NEW" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (count new)"] = counts["COUNT_NEW"]
# 			if "COUNT_USED" in counts.index:
# 				df.loc[category, f"{domain_name_map[domain]} (count used)"] = counts["COUNT_USED"]
# 	# add total row (sum of each column)
# 	df.loc["Total"] = df.sum()
# 	# Replace NaN with 0
# 	df.fillna(0, inplace=True)
# 	# convert to int
# 	df = df.astype(int)
# 	# convert numbers to text, separated by commas for readability
# 	df = df.applymap(lambda x: f"{x:,}") # type: ignore
# 	# add Category column to start and reset index
# 	df = df.reset_index()
# 	df.rename(columns={"index": "Category"}, inplace=True)
# 	return df

def get_products_counts(merged_category_dfs_domains: dict) -> pd.DataFrame:
	'''
		Returns a DataFrame with the number of distinct products (unique "asin" values)
		for each category in each domain.

		Rows are the categories of the first domain (sorted, with
		NETWORK_INTERFACE_CONTROLLER shortened to NIC) plus a final "Domain total" row,
		columns are "Category" followed by one column per domain (named through
		domain_name_map). Counts are formatted as text with thousands separators.
	'''
	sorted_category_names = sorted(merged_category_dfs_domains[list(merged_category_dfs_domains.keys())[0]].keys())
	row_labels = [category.replace("NETWORK_INTERFACE_CONTROLLER", "NIC") for category in sorted_category_names]
	# Build every column first and create the frame once instead of growing it cell by cell
	columns = {}
	for domain, merged_category_dfs in merged_category_dfs_domains.items():
		columns[domain_name_map[domain]] = [
			len(merged_category_dfs[category]["asin"].unique())
			for category in sorted_category_names
		]
	df = pd.DataFrame(columns, index=row_labels)
	df.loc["Domain total"] = df.sum()
	# Replace NaN with 0 and convert to int
	df = df.fillna(0).astype(int)
	# convert numbers to text, separated by commas for readability
	# (DataFrame.applymap is deprecated since pandas 2.1; Series.map per column works on every version)
	df = df.apply(lambda column: column.map(lambda x: f"{x:,}"))
	# add Category column to start and reset index
	df = df.reset_index()
	df = df.rename(columns={"index": "Category"})
	return df

def get_datapoint_counts(merged_category_dfs_domains: dict) -> pd.DataFrame:
	'''
		Returns a DataFrame with the number of data points (non-NaN values summed over
		all price and count columns that are present) for each category in each domain.

		Rows are the categories of the first domain (sorted, with
		NETWORK_INTERFACE_CONTROLLER shortened to NIC) plus a final "Domain total" row,
		columns are "Category" followed by one column per domain (named through
		domain_name_map). Counts are formatted as text with thousands separators.
	'''
	metric_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING", "COUNT_NEW", "COUNT_USED"]
	sorted_category_names = sorted(merged_category_dfs_domains[list(merged_category_dfs_domains.keys())[0]].keys())
	row_labels = [category.replace("NETWORK_INTERFACE_CONTROLLER", "NIC") for category in sorted_category_names]
	# Build every column first and create the frame once instead of growing it cell by cell
	columns = {}
	for domain, merged_category_dfs in merged_category_dfs_domains.items():
		domain_values = []
		for category in sorted_category_names:
			# get counts of non-NaN values for each column
			counts = merged_category_dfs[category].count()
			domain_values.append(sum(counts[column] for column in metric_columns if column in counts.index))
		columns[domain_name_map[domain]] = domain_values
	df = pd.DataFrame(columns, index=row_labels)
	df.loc["Domain total"] = df.sum()
	# Replace NaN with 0 and convert to int
	df = df.fillna(0).astype(int)
	# convert numbers to text, separated by commas for readability
	# (DataFrame.applymap is deprecated since pandas 2.1; Series.map per column works on every version)
	df = df.apply(lambda column: column.map(lambda x: f"{x:,}"))
	# add Category column to start and reset index
	df = df.reset_index()
	df = df.rename(columns={"index": "Category"})
	return df

def get_datapoint_percentage(merged_category_dfs_domains: dict) -> pd.DataFrame:
	'''
		Returns a DataFrame with the data coverage, in percent, for each category in
		each domain.

		Coverage = non-NaN values in the present price/count columns, divided by
		(present columns * 2710 days * distinct products). It is NaN when a category has
		no products or none of the columns. Rows are the categories of the first domain
		(sorted, with NETWORK_INTERFACE_CONTROLLER shortened to NIC) plus a final
		"Domain average" row; values are formatted as text like "12.34%".
	'''
	metric_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING", "COUNT_NEW", "COUNT_USED"]
	days = 2710 # start="2016-08-01", end="2024-01-01"
	sorted_category_names = sorted(merged_category_dfs_domains[list(merged_category_dfs_domains.keys())[0]].keys())
	row_labels = [category.replace("NETWORK_INTERFACE_CONTROLLER", "NIC") for category in sorted_category_names]
	# Build every column first and create the frame once instead of growing it cell by cell
	columns = {}
	for domain, merged_category_dfs in merged_category_dfs_domains.items():
		domain_values = []
		for category in sorted_category_names:
			merged_df = merged_category_dfs[category]
			# get counts of non-NaN values for each column
			counts = merged_df.count()
			products = len(merged_df["asin"].unique())
			present_columns = [column for column in metric_columns if column in counts.index]
			counts_sum = sum(counts[column] for column in present_columns)
			# each column can have at most n-days data points per product - calculate percentage of datapoints
			capacity = len(present_columns) * days * products
			# No products or no value columns: coverage is undefined rather than a division by zero
			domain_values.append(counts_sum / capacity * 100 if capacity else float("nan"))
		columns[domain_name_map[domain]] = domain_values
	df = pd.DataFrame(columns, index=row_labels)
	df.loc["Domain average"] = df.mean()
	# convert numbers to text with a percent sign
	# (DataFrame.applymap is deprecated since pandas 2.1; Series.map per column works on every version)
	df = df.apply(lambda column: column.map(lambda x: f"{round(x, 2)}%"))
	# add Category column to start and reset index
	df = df.reset_index()
	df = df.rename(columns={"index": "Category"})
	return df



# Compute the summary tables for each category in each domain.
# Each table has a "Category" column followed by one column per domain.
# The row-count table is currently disabled; uncomment a bare name to display a table.
# row_counts = get_row_counts(merged_category_dfs_domains)
# row_counts
# Distinct products (unique asin values)
products_counts = get_products_counts(merged_category_dfs_domains)
# products_counts
# Non-NaN data points summed over the value columns
datapoint_counts = get_datapoint_counts(merged_category_dfs_domains)
# datapoint_counts
# Share of the theoretically possible data points that are present, in percent
datapoint_percentage = get_datapoint_percentage(merged_category_dfs_domains)
# datapoint_percentage

# # Merge all three dataframes into one using columns to create multiindex data frame - subcolumns are "P" for product counts, "D" for data points, "DP" for data point percentage
# # reindex
# products_counts.set_index("Category", inplace=True)
# datapoint_counts.set_index("Category", inplace=True)
# datapoint_percentage.set_index("Category", inplace=True)
# merged_df = pd.concat([products_counts, datapoint_counts, datapoint_percentage], axis=1)
# # rename columns
# # merged_df.columns = pd.MultiIndex.from_tuples([(col, "") for col in merged_df.columns])
# # rename columns - first could be "P" for product counts, "D" for data points, "DP" for data point percentage
# merged_df.columns = pd.MultiIndex.from_tuples([(col, "") for col in merged_df.columns])
# # sort columns
# merged_df = merged_df.sort_index(axis=1)
# # # reset index
# merged_df = merged_df.reset_index()
# # # rename columns
# merged_df.rename(columns={"": "Category"}, inplace=True)
# # # add total row
# # merged_df.loc["Total"] = merged_df.sum()
# # rename top level index to "Domain"
# merged_df.columns = merged_df.columns.set_names(["Domain", ""])
# merged_df

# Write each summary table to its own file (backslash-separated, without the index)
for _filename, _table in (
	("stats-products.csv", products_counts),
	("stats-datapoints.csv", datapoint_counts),
	("stats-datapoints-percentage.csv", datapoint_percentage),
):
	path_output = os.path.join(path_output_root, _filename)
	_table.to_csv(path_output, index=False, sep="\\")

# Index all three tables by category so they can be placed side by side
for _table in (products_counts, datapoint_counts, datapoint_percentage):
	_table.set_index("Category", inplace=True)

# Side-by-side merge; the domain column names repeat once per table
merged_counts = pd.concat([products_counts, datapoint_counts, datapoint_percentage], axis=1).reset_index()
# Rows that are missing from one of the tables show "/" instead of NaN
merged_counts = merged_counts.fillna("/")

# Save the combined table and display it
path_output = os.path.join(path_output_root, "stats.csv")
merged_counts.to_csv(path_output, index=False, sep="\\")
merged_counts

In [None]:
# Pick one merged DataFrame (domain id "1", category "VIDEO_CARD") for ad-hoc inspection;
# uncomment one of the lines below to display it.
df_video_card_1 = merged_category_dfs_domains["1"]["VIDEO_CARD"]
# df_video_card_1.head()
# df_video_card_1.describe()
# df_video_card_1.dtypes

In [None]:
# Add up the memory_usage() of every merged DataFrame (all domains, all categories)
total_memory_usage = sum(
	df.memory_usage().sum()
	for merged_dfs in merged_category_dfs_domains.values()
	for df in merged_dfs.values()
)
print(f"Total memory usage: {total_memory_usage / 1024 / 1024:.2f} MB")

In [None]:
# # Explore single category
# df = merged_category_dfs_domains["2"]["CHARGING_ADAPTER"]
# df = get_aggregate_timeseries_df(df)
# # df.head()
# fig = get_fig(df, "Charging adapter trends")
# fig.show()

In [None]:
# Generate one plot per category and domain
can_overwrite = False # True to regenerate plots, False to skip
path_plots_per_domain = os.path.join(path_output_root, "categories-per-domain")
os.makedirs(path_plots_per_domain, exist_ok=True)
time_start = time.time()
for i, (domain, merged_dfs) in enumerate(merged_category_dfs_domains.items()):
	for j, (category, merged_df) in enumerate(merged_dfs.items()):
		# Progress is counted against the domains that were actually loaded
		progress = f"{i * len(merged_dfs) + j + 1}/{len(merged_category_dfs_domains) * len(merged_dfs)}"
		path_output = os.path.join(path_plots_per_domain, f"category-{category}-domain-{domain}.png")
		if os.path.exists(path_output) and not can_overwrite:
			print(f"{progress} Skipping plot for category '{category}' for domain '{domain_name_map[domain]}' (id={domain})")
			continue
		print(f"{progress} Generating plot for category '{category}' for domain '{domain_name_map[domain]}' (id={domain})")
		# Get product count
		valid_products_count = len(merged_df["asin"].unique())
		print(f"- Valid products: {valid_products_count}")
		# Get aggregate timeseries df
		# (no defensive copy here: get_aggregate_timeseries_df works on its own copy, and an
		# extra full copy per plot only adds to the memory pressure)
		print("- Generating aggregate timeseries...")
		df_ts = get_aggregate_timeseries_df(merged_df)
		# Plot
		print("- Generating plot...")
		title = f"Aggregate product price history trends for category '{category}' for domain '{domain_name_map[domain]}' ({valid_products_count} products)"
		fig = get_fig(df_ts, title)
		# fig.show()
		# Save as png with 3x scale
		print("- Saving plot...")
		fig.write_image(path_output, scale=3)
		print(f"- Saved plot to '{path_output}'")
print("")
time_end = time.time()
print(f"Generated plots for {len(merged_category_dfs_domains)} domains in {time_end - time_start:.2f} seconds")

In [None]:
# Generate plots for each category by merging all domains for that category into a single DataFrame
path_plots_all_domains = os.path.join(path_output_root, "categories-all-domains")
os.makedirs(path_plots_all_domains, exist_ok=True)
selected_domains = domains_to_process
can_overwrite = False # True to refresh, False skip
time_start = time.time()

# Group the per-domain frames by category first (references only, nothing is copied yet).
# This no longer assumes that domain "1" was processed or that every domain has the same categories.
dfs_by_category = {}
product_counts = {}
for domain, categories in merged_category_dfs_domains.items():
	for category, df in categories.items():
		dfs_by_category.setdefault(category, []).append(df)
		product_counts[category] = product_counts.get(category, 0) + len(df["asin"].unique())
selected_categories = list(dfs_by_category.keys())
print(f"Selected domains: {selected_domains}")
print(f"Selected categories: {selected_categories}")
print("Merging all domains for each category")
# One concat per category instead of re-concatenating (and copying) for every domain
merged_category_dfs_categories = {
	category: pd.concat(category_dfs, ignore_index=True)
	for category, category_dfs in dfs_by_category.items()
}

# Generate plots for each category
for i, (category, df) in enumerate(merged_category_dfs_categories.items()):
	path_output = os.path.join(path_plots_all_domains, f"category-{category}.png")
	if os.path.exists(path_output) and not can_overwrite:
		print(f"{i+1}/{len(merged_category_dfs_categories)} Skipping plot for category '{category}' for all domains")
		continue
	print(f"{i+1}/{len(merged_category_dfs_categories)} Generating plot for category '{category}' for all domains")
	# Product count is the sum of the per-domain unique product counts
	valid_products_count = product_counts[category]
	print(f"- Valid products: {valid_products_count}")
	# Get aggregate timeseries df (the function works on its own copy, so none is made here)
	print("- Generating aggregate timeseries...")
	df_ts = get_aggregate_timeseries_df(df)
	# Plot
	print("- Generating plot...")
	title = f"Aggregate product price history trends for category '{category}' for all domains ({valid_products_count} products)"
	fig = get_fig(df_ts, title)
	# fig.show()
	# Save as png with 3x scale
	print("- Saving plot...")
	fig.write_image(path_output, scale=3)
	print(f"- Saved plot to '{path_output}'")

In [None]:
# Free the per-category merge once its plots are written; safe to run more than once
if "merged_category_dfs_categories" in globals():
	del merged_category_dfs_categories
else:
	print("Already deleted")

In [None]:
# Generate plots for entire domain by merging all categories of a domain into a single DataFrame
path_plots_all_categories = os.path.join(path_output_root, "domains-all-categories")
os.makedirs(path_plots_all_categories, exist_ok=True)
merged_domains = {}
product_counts = {}
can_overwrite = False # True to refresh, False to skip
time_start = time.time()
print("Merging all categories for each domain")
for domain, categories in merged_category_dfs_domains.items():
	category_dfs = list(categories.values())
	if not category_dfs: # a domain without categories gets no entry
		continue
	# One concat per domain instead of re-concatenating (and copying) for every category
	merged_domains[domain] = pd.concat(category_dfs, ignore_index=True)
	product_counts[domain] = sum(len(category_df["asin"].unique()) for category_df in category_dfs)

# Generate plots for each domain
for i, (domain, df) in enumerate(merged_domains.items()):
	path_output = os.path.join(path_plots_all_categories, f"domain-{domain}.png")
	if os.path.exists(path_output) and not can_overwrite:
		print(f"{i+1}/{len(merged_domains)} Skipping plot for domain '{domain_name_map[domain]}'")
		continue
	print(f"{i+1}/{len(merged_domains)} Generating plot for domain '{domain_name_map[domain]}'")
	# Product count is the sum of the per-category unique product counts
	valid_products_count = product_counts[domain]
	print(f"- Valid products: {valid_products_count}")
	# Get aggregate timeseries df (the function works on its own copy, so none is made here)
	print("- Generating aggregate timeseries...")
	df_ts = get_aggregate_timeseries_df(df)
	# Plot
	print("- Generating plot...")
	title = f"Aggregate product price history trends for all product categories for domain '{domain_name_map[domain]}' ({valid_products_count} products)"
	fig = get_fig(df_ts, title)
	# fig.show()
	# Save as png with 3x scale
	print("- Saving plot...")
	fig.write_image(path_output, scale=3)
	print(f"- Saved plot to '{path_output}'")

In [None]:
# Generate a plot for all categories and all domains - the big one
# A single concat over whatever domains were merged (no dependency on domain "1" being present,
# and no intermediate copies of the growing frame)
df_big = pd.concat(list(merged_domains.values()), ignore_index=True)
total_products_count = sum(product_counts.values())
# create folder if it doesn't exist
path_plots_everything = os.path.join(path_output_root, "all-domains-products")
os.makedirs(path_plots_everything, exist_ok=True)
path_output = os.path.join(path_plots_everything, "all-domains-products.png")
# Generate plot
print("Generating plot for all domains and all categories")
# Get aggregate timeseries df
print("- Generating aggregate timeseries...")
df_ts = get_aggregate_timeseries_df(df_big)
# Plot
print("- Generating plot...")
title = f"Aggregate product price history trends for all product categories for all domains ({total_products_count} products)"
fig = get_fig(df_ts, title)
# Save as png with 3x scale
print("- Saving plot...")
fig.write_image(path_output, scale=3)
print(f"- Saved plot to '{path_output}'")
fig.show()