In [None]:
import os
import sys
import json
import time
import pandas as pd
import numpy as np
from scipy import stats
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from plotly import express as px
from scipy import stats


# Project imports
sys.path.append(os.getcwd())
# from src.py.utils.generic_utils import wrapper
from src.py.scraping.keepa.keepa_analysis_utils import load_result_object, parse_csv, organize_csv, discretize_csv_smart, get_trends, get_timeseries_from_trends, remove_outliers, remove_outliers_csv

# Finance utils
from src.py.analysis.yahoo.stocks.finance_df_utils import add_vline_annotation
# Events
from src.py.analysis.events import events
# CPI
import importlib
sys.path.append(os.getcwd())
cpi_adjust = importlib.import_module("src.py.scraping.world-bank.cpi_adjust")
cpi_adjust.initialize_cpi(date_cutoff="2016-08-01", jagged=False) # jagged=True)

In [None]:
# For prior analysis check src/py/scraping/keepa/category_analysis.ipynb

In [None]:
# Meaning of the columns
# https://keepaapi.readthedocs.io/en/latest/api_methods.html
# https://github.com/ukrexpo/keepa/blob/master/keepa.py

In [None]:
path_category_index = "data/keepa/generated/categories-domain-1.json"
path_visualizations_root = "data/keepa/generated/visualizations"

In [None]:
# Keepa domain id selecting which marketplace this notebook analyses
domain = "1" # amazon.com
domain_name_map = { # for fig title
	"1": "amazon.com",
	"2": "amazon.co.uk",
	"3": "amazon.de",
	"5": "amazon.co.jp",
}
domain_country_map = { # for CPI and filename
	"1": "USA",
	"2": "GBR",
	"3": "DEU",
	"5": "JPN",
}
print(f"Working with domain_id '{domain}'")
print(f"Working with domain '{domain_name_map[domain]}'")
print(f"Working with domain country '{domain_country_map[domain]}'")

# Load the category index (category -> asin -> product entry).
# The context manager closes the file handle as soon as parsing is done.
with open(path_category_index, encoding="utf-8") as index_file:
	index_categories = json.load(index_file)
print(f"Loaded {len(index_categories)} categories:")
for category in index_categories:
	product_count = len(index_categories[category])
	print(f"- {category} ({product_count} products)")

# Process category product paths: the index stores paths for domain 1, so point
# them at the selected domain and remember each product's category on the entry.
for category in index_categories:
	for asin, product in index_categories[category].items():
		product_path = product["file"].replace("domain/1", f"domain/{domain}")
		index_categories[category][asin]["file"] = product_path
		# Add category to product object
		index_categories[category][asin]["category"] = category
print("")
print(f"Replaced 'domain/1' with 'domain/{domain}' in product paths")

In [None]:
# Verify that all products in categories are unique
# Flatten the per-category index into one asin -> product lookup and report
# every ASIN that shows up more than once (the first occurrence is kept).
category_products = {} # asin -> product (simplified object)
for category_items in index_categories.values():
	for asin, product in category_items.items():
		if asin not in category_products:
			category_products[asin] = product
			continue
		print(f"Duplicate product found: {asin}")

print(f"Total unique products: {len(category_products)}")

In [None]:
def get_products_by_title(title):
	'''
		Returns the ASINs of all products whose title contains `title`.

		The match ignores letter case and spaces on both sides, so the query
		"ryzen 7 2700x" also matches a title written as "Ryzen7 2700X".
		Products are read from the notebook-level `category_products` lookup
		(asin -> product). Entries without a string title are skipped.
	'''
	# Normalize the query once instead of on every loop iteration
	needle = title.lower().replace(" ", "")
	results = []
	for asin, product in category_products.items():
		product_title = product.get("title")
		if not isinstance(product_title, str):
			continue # nothing to search in (missing or null title)
		if needle in product_title.lower().replace(" ", ""):
			results.append(asin)
	return results

query = "ryzen 7 2700x"
sample_results = get_products_by_title(query)
print(f"Found {len(sample_results)} products matching '{query}':")
for asin in sample_results:
	print(f"- {category_products[asin]['title']} ({asin}) ; {category_products[asin]['category']}")

In [None]:
def get_product(product_filepath: str) -> dict:
	'''
		Loads a product file and returns a product.

		The file must be a JSON document with a "products" list; only its first
		entry is used. The entry's "csv" field is passed through parse_csv,
		organize_csv and discretize_csv_smart (in that order) before the product
		is returned.
	'''
	# Context manager closes the file handle even if JSON parsing fails
	with open(product_filepath, encoding="utf-8") as product_file:
		result_object = json.load(product_file)
	product = result_object["products"][0]
	product["csv"] = parse_csv(product["csv"])
	product["csv"] = organize_csv(product["csv"])
	product["csv"] = discretize_csv_smart(product["csv"])
	return product

sample_product_asin = "B07B428M7F"
sample_product = get_product(category_products[sample_product_asin]["file"])
# print(json.dumps(sample_product, indent=2, default=str))


In [None]:
# Print timeseries names
print("Timeseries names:")
for name in sample_product["csv"].keys():
	print(f"- {name}")

In [None]:

# Plot sample product NEW price history
fig = go.Figure()
timeseries_name = "NEW" #"COUNT_NEW"
prices = sample_product["csv"][timeseries_name][0]
times = sample_product["csv"][timeseries_name][1]
print(f"Data points: {len(prices)}")
fig.add_trace(go.Scatter(x=times, y=prices, mode="lines+markers", name="Price"))
fig.update_layout(title=f"Product price 'NEW' of {sample_product['title']} ({sample_product['asin']})<br>Data points: {len(prices)}", xaxis_title="Time", yaxis_title="Price")
fig.update_xaxes(range=["2018-04-01", "2023-11-20"])
fig.update_layout(width=1200, height=500)
fig.update_layout(margin=dict(l=0, r=10, t=60, b=0))
path_output = os.path.join(path_visualizations_root, f"sample-product-new-{sample_product['asin']}.png")
fig.write_image(path_output, scale=2)
fig.show()

In [None]:
# Plot sample product NEW - using z-score to remove outliers
# Draws the raw 'NEW' series and, on top of it, the same series with all points
# whose absolute z-score exceeds `threshold` removed, then saves the figure as PNG.
fig = go.Figure()
import plotly.express as px
# fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)
timeseries_name = "NEW"
# index 0 holds the values, index 1 the matching timestamps
prices = sample_product["csv"][timeseries_name][0]
times = sample_product["csv"][timeseries_name][1]
# raw series, drawn in the second palette color
fig.add_trace(go.Scatter(x=times, y=prices, mode="lines+markers", name="Price", line=dict(color=px.colors.qualitative.Plotly[1])))
threshold = 2 #1.8 # 2
# z-score is computed over the whole series (global mean / std, not a moving window)
z = np.abs(stats.zscore(prices))
outliers = z > threshold
outliers_count = np.sum(outliers)
# filter out the outliers from the prices and times
prices = prices[~outliers]
times = times[~outliers]
print(f"Data points: {len(prices)} (removed {outliers_count} outliers)")
# filtered series, drawn in the first palette color
fig.add_trace(go.Scatter(x=times, y=prices, mode="lines+markers", name=f"Price t={threshold}", line=dict(color=px.colors.qualitative.Plotly[0])))
fig.update_layout(title=f"Product price 'NEW' of {sample_product['title']} ({sample_product['asin']})<br>Data points: {len(prices)} (removed {outliers_count} outliers)", xaxis_title="Time", yaxis_title="Price")
fig.update_xaxes(range=["2018-04-01", "2023-11-20"])
fig.update_layout(width=1200, height=500)
fig.update_layout(margin=dict(l=0, r=10, t=60, b=0))
# the threshold is part of the filename; "." is spelled "dot" (e.g. 1.8 -> "1dot8")
path_output = os.path.join(path_visualizations_root, f"sample-product-new-outliers-zscore-{str(threshold).replace('.', 'dot')}-{sample_product['asin']}.png")
fig.write_image(path_output, scale=2)
fig.show()

In [None]:
# Plot sample product NEW price history
fig = go.Figure()
timeseries_name = "NEW"
csv_dev_3 = remove_outliers_csv(sample_product["csv"], max_std_multiplier=2)
prices = csv_dev_3[timeseries_name][0]
times = csv_dev_3[timeseries_name][1]
print(f"Data points: {len(prices)} (removed {len(sample_product['csv'][timeseries_name][0]) - len(prices)} outliers)")
fig.add_trace(go.Scatter(x=times, y=prices, mode="lines", name="Price"))
fig.update_layout(title=f"Price history of {sample_product['title']} ({sample_product['asin']})", xaxis_title="Time", yaxis_title="Price")
fig.show()

# TODO

# DONE

- Change the outlier-removal algorithm to a moving average / moving standard deviation approach (use z-score)


In [None]:
category = "COMPUTER_PROCESSOR"

def load_products(category: str, index_categories: dict) -> dict:
	'''
		Reads every product file listed under `category` in the category index.

		Returns a dict mapping asin -> product as produced by get_product().
		While loading, a single-line progress message is rewritten in place.
	'''
	category_index = index_categories[category]
	total = len(category_index)
	products = {}
	for position, asin in enumerate(category_index, start=1):
		print(f"Loading product {position}/{total} ({asin})      ", end="\r")
		products[asin] = get_product(category_index[asin]["file"])
	# terminate the progress line
	print("")
	return products


In [None]:

products = load_products(category, index_categories)
print(f"Loaded {len(products)} products in category '{category}'")

In [None]:
# def csv_to_df(csv: dict) -> pd.DataFrame:
# 	'''
# 		Converts a csv object to a pandas DataFrame.
# 	'''
# 	individual_dfs = []
# 	# create a column "time" with the datetime values
# 	for timeseries_name in csv.keys():
# 		try:
# 			prices = csv[timeseries_name][0]
# 			times = csv[timeseries_name][1]
# 			series = pd.Series(prices, index=times, name=timeseries_name)
# 			df = pd.DataFrame(series)
# 			# insert rows for missing dates
# 			df = df.resample("D").asfreq()
# 			# replace missing values with NaN
# 			df.fillna(np.nan, inplace=True)
# 			# replace -1 with NaN
# 			df.replace(-1, np.nan, inplace=True)
# 			# # resample to weekly frequency
# 			# df = df.resample("W").mean()
# 			# fill NaN values by linear interpolation between the closest non-NaN values
# 			df.interpolate(method="time", inplace=True)
# 			threshold = 3
# 			df = df[(np.abs(stats.zscore(df)) < threshold).all(axis=1)]
# 			# smooth the data by taking the rolling mean with a window of 20
# 			df = df.rolling(window=20).mean()
# 			individual_dfs.append(df)
# 		except Exception as e:
# 			pass
# 	# concatenate all the individual dataframes on index
# 	# product_df = pd.concat(individual_dfs, axis=1)
# 	# product_df = pd.DataFrame(individual_dfs).transpose()
# 	product_df = pd.concat(individual_dfs, axis=1)
# 	# fill missing values with NaN
# 	product_df.fillna(np.nan, inplace=True)
# 	# fill NaN values by linear interpolation between the closest non-NaN values
# 	product_df.interpolate(method="time", inplace=True)
# 	# # smooth the data by taking the rolling mean with a window of 20
# 	# product_df = product_df.rolling(window=20).mean()
# 	return product_df


def csv_to_df(csv: dict) -> pd.DataFrame:
	'''
		Converts a csv object to a pandas DataFrame.

		`csv` maps a timeseries name to a pair indexed as [0] = values and
		[1] = timestamps. Only the price series (AMAZON, NEW, USED,
		EBAY_NEW_SHIPPING, EBAY_USED_SHIPPING) and the offer-count series
		(COUNT_NEW, COUNT_USED) are kept. For every kept series:

		- price series are rescaled so that their mean equals 100; price series
		  that are empty or whose mean is 0 / NaN are skipped,
		- points whose absolute z-score exceeds 2 are dropped,
		- the series is reindexed onto a daily grid 2016-08-01 .. 2024-01-01,
		- price series are CPI-adjusted for the country of the active `domain`.

		A series that fails to process is skipped with a warning instead of
		aborting the whole product.

		Returns a DataFrame with one column per surviving series, indexed by
		day. If no series survives, an empty DataFrame is returned.
	'''
	import warnings # local import: only needed to report skipped series

	kept_keys = {"AMAZON", "NEW", "USED", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING", "COUNT_NEW", "COUNT_USED"}
	# COUNT_REVIEWS is deliberately left out for now: negative values and
	# swings by thousands per day still need to be resolved.
	zscore_threshold = 2
	# Daily grid shared by every series (loop-invariant, so it is built once)
	dates = pd.date_range(start="2016-08-01", end="2024-01-01", freq="D")

	individual_dfs = []
	for timeseries_name in csv.keys():
		if timeseries_name not in kept_keys:
			continue
		# COUNT_NEW / COUNT_USED are numbers of offers, not prices (and not sales):
		# they are neither rescaled nor CPI-adjusted.
		is_count = "count" in timeseries_name.lower()
		try:
			prices = csv[timeseries_name][0]
			if not is_count:
				if len(prices) == 0:
					continue
				# express prices as an index where the series mean is 100
				mean_value = np.mean(prices)
				if mean_value == 0 or np.isnan(mean_value):
					continue
				prices = prices / mean_value * 100
			times = csv[timeseries_name][1]

			# Remove outliers using the z-score over the whole series
			# TODO: impute instead of remove?
			outliers = np.abs(stats.zscore(prices)) > zscore_threshold
			prices = prices[~outliers]
			times = times[~outliers]

			df = pd.DataFrame(pd.Series(prices, index=times, name=timeseries_name))
			# insert NaN rows for every day of the grid that has no observation
			df = df.reindex(dates)
			# replace -1 with NaN
			# NOTE(review): for price series this runs after the rescaling above, so a
			# raw -1 sentinel would no longer equal -1 here - confirm that the upstream
			# parsing already strips such sentinels.
			df = df.replace(-1, np.nan)

			# adjust for CPI
			if not is_count:
				df = cpi_adjust.adjust_for_inflation(df, domain_country_map[domain], columns=[timeseries_name])

			individual_dfs.append(df)
		except Exception as e:
			# best effort: keep the remaining series, but make the failure visible
			warnings.warn(f"csv_to_df: skipping timeseries '{timeseries_name}': {e!r}", stacklevel=2)

	if not individual_dfs:
		# pd.concat raises ValueError on an empty list; return an empty frame instead
		return pd.DataFrame(index=pd.DatetimeIndex([]))

	# concatenate all the individual dataframes on index
	product_df = pd.concat(individual_dfs, axis=1)

	# cut everything before 2016-08-01
	product_df = product_df["2016-08-01":]
	return product_df

product_df = csv_to_df(sample_product["csv"])
# product_df.head()
# from 2019-01-01 display first 5 rows
product_df["2019-01-01":].head()

In [None]:
def get_percent_non_null(df: pd.DataFrame, timeseries_name: str) -> float:
	'''
		Computes the share of populated (non-null) entries in one column,
		expressed as a percentage between 0 and 100.
	'''
	column = df[timeseries_name]
	populated, total = column.count(), column.size
	return populated / total * 100

timeseries_name = "NEW"
print(f"Percentage of non-null values in timeseries '{timeseries_name}': {get_percent_non_null(product_df, timeseries_name):.2f}%")
print("")

# print df info
print(product_df.info())

In [None]:
product_df = product_df["2017-01-01":]
product_df.head()

In [None]:
# Plot all columns with lines (continuous - no gaps)
fig = go.Figure()
for column in product_df.columns:
	fig.add_trace(go.Scatter(x=product_df.index, y=product_df[column], mode="lines+markers", name=column))
fig.update_layout(title=f"Product history of {sample_product['title']} ({sample_product['asin']})", xaxis_title="Time", yaxis_title="Values")
fig.update_xaxes(range=["2018-04-01", "2023-11-20"])
fig.update_layout(width=1200, height=500)
fig.update_layout(margin=dict(l=0, r=0, t=50, b=0))
path_output = os.path.join(path_visualizations_root, f"sample-product-history-{sample_product['asin']}.png")
fig.write_image(path_output, scale=2)
fig.show()

# TODO

- Fix COUNT_REVIEWS (and similar columns), which should only increase over time (or at least never decrease): for each value, remove all later values that are smaller.
- Add COUNT_REVIEWS_CHANGE: how much COUNT_REVIEWS changed compared to the previous day.

In [None]:
# Print product_df size in MB
print(f"Size of product_df: {product_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Resample to weekly
def resample(product_df: pd.DataFrame, freq: str) -> pd.DataFrame:
	'''
		Averages the DataFrame into bins of the given frequency.

		For weekly ("W") and monthly ("M") bins the index is relabelled with the
		first day of each period; any other frequency keeps pandas' own labels.
	'''
	resampled = product_df.resample(freq).mean()
	if freq in ("W", "M"):
		# period -> timestamp conversion snaps every label to the start of its week / month
		resampled.index = resampled.index.to_period(freq).to_timestamp()
	return resampled

product_df_W = resample(product_df, "W")
product_df_M = resample(product_df, "M")
product_df_W.head()

In [None]:
# plot NEW
fig = go.Figure()
column_name = "NEW" #"COUNT_NEW"
translate_y = 30
# keep only the rows between 2017-01-01 and 2023-12-01
product_df = product_df["2017-01-01":"2023-12-01"]
# get percentage of non-null values in the column new for each resampled dataframe
percent_non_null_original = get_percent_non_null(product_df, column_name)
percent_non_null_W = get_percent_non_null(product_df_W, column_name)
percent_non_null_M = get_percent_non_null(product_df_M, column_name)
title_bottom = f"Non-missing values: {percent_non_null_original:.2f}% (original), {percent_non_null_W:.2f}% (resampled to W), {percent_non_null_M:.2f}% (resampled to M)"
fig.add_trace(go.Scatter(x=product_df.index, y=product_df[column_name], mode="lines+markers", name=f"{column_name}"))
fig.add_trace(go.Scatter(x=product_df_W.index, y=product_df_W[column_name] + translate_y, mode="lines+markers", name=f"{column_name} (W)"))
fig.add_trace(go.Scatter(x=product_df_M.index, y=product_df_M[column_name] + (translate_y * 2), mode="lines+markers", name=f"{column_name} (M)"))
fig.update_layout(title=f"Product history of {sample_product['title']} ({sample_product['asin']})<br>{title_bottom}", xaxis_title="Time", yaxis_title="Value")
fig.update_xaxes(range=["2017-01-01", "2023-12-01"])
fig.update_layout(width=1200, height=500)
fig.update_layout(margin=dict(l=0, r=0, t=60, b=0))
path_output = os.path.join(path_visualizations_root, f"sample-product-history-resampling-{sample_product['asin']}.png")
fig.write_image(path_output, scale=2)
fig.show()

In [None]:
def add_product_info_to_df(product_df: pd.DataFrame, product: dict) -> pd.DataFrame:
	'''
		Tags a product DataFrame with the product's identity.

		Inserts "asin" and "category" (taken from `product`) as the first two
		columns. The frame is modified in place and also returned. A frame that
		already has an "asin" column is returned untouched.
	'''
	already_tagged = "asin" in product_df.columns
	if not already_tagged:
		# positions 0 and 1 put the identifiers in front of the data columns
		for position, key in enumerate(("asin", "category")):
			product_df.insert(position, key, product[key])
	return product_df

product_df = add_product_info_to_df(product_df, category_products[sample_product["asin"]])
product_df.head()

In [None]:
# merge all products into one DataFrame keeping all rows

def merge_category_dfs(product_dfs: list[pd.DataFrame]) -> pd.DataFrame:
	'''
		Merges all product DataFrames into one DataFrame.

		The index of every frame is moved into a leading "time" column and the
		frames are stacked row-wise under a fresh 0..n-1 index. The input frames
		are left unmodified, so the function can be called again on the same
		list. An empty list yields an empty DataFrame.
	'''
	if not product_dfs:
		# pd.concat raises ValueError on an empty list
		return pd.DataFrame()
	# rename_axis + reset_index builds a NEW frame whose first column "time" holds the old index
	frames = [product_df.rename_axis("time").reset_index() for product_df in product_dfs]
	# merge all product dfs into one, renumbering the rows
	return pd.concat(frames, axis=0, ignore_index=True)

def get_category_product_dfs(category_name: str, index_categories: dict, category_products: dict) -> list[pd.DataFrame]:
	'''
		Loads all products in a category and returns them as DataFrames.

		Every product is converted with csv_to_df() and tagged with its asin and
		category. Products that yield no data are left out of the returned list;
		their number (and a preview of their ASINs) is printed at the end.
	'''
	products = load_products(category_name, index_categories)
	product_dfs = []
	fails = []
	for i, product in enumerate(products.values()):
		asin = product["asin"]
		print(f"Processing product {i+1}/{len(products)} ({asin})      ", end="\r")
		try:
			product_df = csv_to_df(product["csv"])
		except ValueError:
			# csv_to_df may surface pd.concat's ValueError when no series was usable;
			# treat that as "no data" instead of aborting the whole category
			fails.append(asin)
			continue
		if len(product_df) == 0: # some products have no valid data
			fails.append(asin)
			continue
		product_dfs.append(add_product_info_to_df(product_df, category_products[asin]))
	print("")
	print(f"Successfully loaded {len(product_dfs)}/{len(products)} product dfs")
	if fails:
		preview = ", ".join(fails[:10]) + (", ..." if len(fails) > 10 else "")
		print(f"Skipped {len(fails)} products without valid data: {preview}")
	return product_dfs

def get_category_product_dfs_merged(category_name: str, index_categories: dict, category_products: dict) -> pd.DataFrame:
	'''
		Builds one long DataFrame for a whole category.

		Loads the per-product DataFrames via get_category_product_dfs() and stacks
		them with merge_category_dfs(), printing progress messages along the way.
	'''
	print(f"Loading product dfs for category '{category_name}'")
	frames = get_category_product_dfs(category_name, index_categories, category_products)
	print("")
	print(f"Merging {len(frames)} product dfs")
	combined = merge_category_dfs(frames)
	print("")
	print(f"Done! Merged {len(combined)} rows")
	return combined

# category = "PERSONAL_COMPUTER"
category = "VIDEO_CARD"
merged_df = get_category_product_dfs_merged(category, index_categories, category_products)

merged_df.head()

In [None]:
# Print df info
print(merged_df.info())

In [None]:
def get_aggregate_timeseries_df(merged_df: pd.DataFrame) -> pd.DataFrame:
	'''
		Collapses the per-product rows of a merged DataFrame into one smoothed
		timeseries per column.

		Rows are grouped by "time": the price columns (NEW, USED, AMAZON,
		EBAY_NEW_SHIPPING, EBAY_USED_SHIPPING) are averaged, the offer counts
		(COUNT_NEW, COUNT_USED) are summed. Remaining gaps are interpolated over
		time. The price columns are then smoothed with a 90-point moving average,
		and afterwards every column is smoothed with a 30-point moving average.
		Finally only the rows from 2017-01-01 to 2023-10-01 are returned.
	'''
	price_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING"]
	count_columns = ["COUNT_NEW", "COUNT_USED"]

	def smooth(frame: pd.DataFrame, columns: list, window: int) -> None:
		# Moving average via convolution with a flat kernel. mode="same" centres the
		# window; values near both ends are averaged against implicit zeros.
		kernel = np.ones(window) / window
		for name in columns:
			frame[name] = np.convolve(frame[name].values, kernel, mode="same")

	# prices by mean, counts by sum
	aggregations = {name: "mean" for name in price_columns}
	aggregations.update({name: "sum" for name in count_columns})

	# asin / category are identifiers, not values to aggregate
	values_only = merged_df.drop(columns=["asin", "category"])
	df_ts = values_only.groupby("time").agg(aggregations)

	# linear (time-weighted) interpolation for NaN values
	df_ts.interpolate(method="time", inplace=True)

	# first pass: prices only, roughly a quarter of a year
	smooth(df_ts, price_columns, 30 * 3)
	# second pass: every column, roughly a month
	smooth(df_ts, price_columns + count_columns, 30)

	# keep 2017-01-01 .. 2023-10-01 (both inclusive)
	in_range = (df_ts.index >= "2017-01-01") & (df_ts.index <= "2023-10-01")
	return df_ts[in_range]

def sample_products(merged_df: pd.DataFrame, sample_size: int, random_state=None) -> pd.DataFrame:
	'''
		Samples a number of products from the merged DataFrame.

		Draws up to `sample_size` distinct ASINs uniformly at random and returns
		every row of `merged_df` that belongs to one of them. If `sample_size`
		exceeds the number of distinct products, all products are returned.

		random_state: optional seed (or random state object) forwarded to
		pandas' Series.sample to make the draw reproducible.
	'''
	# sample from the distinct ASINs - sampling the "asin" column directly would
	# draw rows, which can pick the same product several times
	unique_asins = pd.Series(merged_df["asin"].unique())
	# never request more products than exist (Series.sample would raise)
	draw_count = min(sample_size, len(unique_asins))
	sample_asins = unique_asins.sample(draw_count, random_state=random_state)
	# filter merged_df
	sample_df = merged_df[merged_df["asin"].isin(sample_asins)]
	return sample_df

def sample_products_with_most_non_nan_values(merged_df: pd.DataFrame, columns: list[str], sample_size: int) -> pd.DataFrame:
	'''
		Samples a number of products from the merged DataFrame based on the number of non-NaN values in a list of columns.

		A row counts as complete when none of `columns` is NaN in it. Products
		are ranked by their number of complete rows and every row of the top
		`sample_size` products is returned.

		Products without a single complete row are never selected, even when
		`sample_size` exceeds the number of products.
	'''
	# rows in which all requested columns hold a value
	complete_rows = merged_df[columns].notna().all(axis=1)
	# value_counts() returns the ASINs already sorted by count, highest first
	ranked_asins = merged_df.loc[complete_rows, "asin"].value_counts().index
	# sample the top products
	sample_asins = ranked_asins[:sample_size]
	sample_df = merged_df[merged_df["asin"].isin(sample_asins)]
	return sample_df

# df_ts = get_aggregate_timeseries_df(merged_df)
# number of distinct products in the merged frame (used later in the figure title)
total_products = len(merged_df["asin"].unique())
# sample_size = 500
# sample_size = total_products
# a cap this large is presumably meant as "no cap" - TODO confirm
sample_size = 100000 # 100
# sampled_df = sample_products(merged_df, sample_size)
# NOTE(review): the sampler ranks products by rows where NEW, USED and AMAZON are all
# non-NaN and drops products with no such row, so sampled_df can hold fewer products
# than total_products.
sampled_df = sample_products_with_most_non_nan_values(merged_df, ["NEW", "USED", "AMAZON"], sample_size)

# aggregate the selected products into one smoothed timeseries per column
df_ts = get_aggregate_timeseries_df(sampled_df)

df_ts.head()

In [None]:
def get_fig(df: pd.DataFrame, title: str) -> go.Figure:
	'''
		Returns a plotly figure for a DataFrame.

		The figure has two stacked rows sharing one x-axis:
		- row 1 (80% of the height): line traces for NEW, USED, AMAZON,
		  EBAY_NEW_SHIPPING and EBAY_USED_SHIPPING,
		- row 2 (20% of the height): stacked bars for COUNT_NEW and COUNT_USED.

		`df` must contain all seven columns; its index is used as the x-axis.
		Every entry of the notebook-level `events` is drawn onto the figure via
		add_vline_annotation(). The figure is sized 1280x720.
	'''
	# first row are prices / indices, second row are counts
	fig = make_subplots(rows=2, cols=1, row_heights=[0.8, 0.2], shared_xaxes=True, vertical_spacing=0.02)
	colors = px.colors.qualitative.Plotly
	first_row_columns = ["NEW", "USED", "AMAZON", "EBAY_NEW_SHIPPING", "EBAY_USED_SHIPPING"]
	second_row_columns = ["COUNT_NEW", "COUNT_USED"]
	# add traces: one line per price column, colored by position in the palette
	for i, column in enumerate(first_row_columns):
		fig.add_trace(go.Scatter(x=df.index, y=df[column], mode="lines", name=column, line=dict(color=colors[i])), row=1, col=1)
	df_counts = df[second_row_columns]
	# (disabled) resample the counts to weekly frequency
	# df_counts = df_counts.resample("W").mean()
	# the palette index restarts at 0, so the count bars reuse the first two line colors
	for i, column in enumerate(second_row_columns):
		fig.add_trace(go.Bar(x=df_counts.index, y=df_counts[column], name=column, marker=dict(color=colors[i])), row=2, col=1)
	# lower bar spacing
	fig.update_layout(bargap=0)
	# remove bar borders
	fig.update_traces(marker_line_width=0, row=2, col=1)
	# TODO: convert to histogram
	# bottom = px.histogram(df, x=df.index, y=second_row_columns, histfunc="sum")
	# fig.add_trace(bottom, row=2, col=1)
	# change barmode to stacked
	fig.update_layout(barmode="stack")
	fig.update_xaxes(showgrid=True)
	# add "COUNT_REVIEWS" to second row with 6th color
	# fig.add_trace(go.Scatter(x=df.index, y=df["COUNT_REVIEWS"], mode="lines", name="COUNT_REVIEWS", line=dict(color=colors[5])), row=2, col=1)
	# update layout: figure title first, then the per-row axis titles
	fig.update_layout(title=title, xaxis_title="Time", yaxis_title="Price")
	fig.update_xaxes(title_text="Time", row=2, col=1)
	fig.update_yaxes(title_text="Price trends", row=1, col=1)
	fig.update_yaxes(title_text="Count listings", row=2, col=1)
	# figure size: 1280x720
	fig.update_layout(width=1280, height=720)
	fig.update_layout(legend=dict(traceorder="normal"))
	# remove first row x-axis title (set by the update_layout call above)
	fig.update_xaxes(title_text="", row=1, col=1)
	# add events
	for event in events:
		fig = add_vline_annotation(fig, event, textangle=-20)
	# set margin to {"r": 0, "t": 60, "b": 0, "l": 0}
	fig.update_layout(margin=dict(r=0, t=60, b=0, l=0))
	return fig

df_ts_copy = df_ts.copy()
# df_ts_copy = df_ts_copy.resample("M").mean()
	

# fig = get_fig(df_ts_copy, f"Aggregate price history of category '{category}' using a sample of {sample_size}/{total_products} products")
title = f"Aggregate product price history trends for category '{category}' for domain '{domain_name_map[domain]}' ({total_products} products)"
fig = get_fig(df_ts_copy, title)
fig.show()

In [None]:
# # Save figure - only do this if you turn off smoothing in get_aggregate_timeseries_df()
# filepath = os.path.join(path_visualizations_root, f"aggregate-price-history-{category}-domain-{domain}-unpolished.png")
# fig.write_image(filepath, scale=3)

In [None]:
# # Save figure - only do this if you turn on smoothing in get_aggregate_timeseries_df()
# filepath = os.path.join(path_visualizations_root, f"aggregate-price-history-{category}-domain-{domain}-polished.png")
# fig.write_image(filepath, scale=3)

In [None]:
# Scratch example: find the row of a date-indexed DataFrame that lies closest to a
# fixed date, and report how many days it is away from that date.
import pandas as pd

# Example DataFrame
data = {
    'values': [0.5, 0.8, 1.2, 1.5, 1.7],
    'dates': ['2017-01-01', '2019-01-03', '2019-01-04', '2019-01-01', '2019-01-05']
}

df = pd.DataFrame(data)
df['dates'] = pd.to_datetime(df['dates'])
df.set_index('dates', inplace=True)
# sort the rows chronologically
df = df.sort_index()

# index as seconds since the epoch
# NOTE(review): dividing by 10**9 assumes astype(int) yields nanoseconds - confirm for the installed pandas
dates_epoch = pd.to_datetime(df.index).astype(int) / 10**9


# Fixed date
fixed_date = '2019-01-02'
fixed_date_epoch = pd.to_datetime(fixed_date).timestamp()

# Signed distance of every row to the fixed date
closest_index = dates_epoch - fixed_date_epoch
# convert to numpy array
closest_index = closest_index.to_numpy()
# position of the smallest absolute distance; argmin returns the first minimum, so of
# the two equally distant dates (2019-01-01 and 2019-01-03) the earlier one wins
closest_index = np.abs(closest_index).argmin()

# Date found at the closest position
date_at_index = df.index[closest_index]

# Value at closest index (values column)
value = df.values[closest_index][0]

# Difference in days (negative when the closest date lies before the fixed date)
difference = (df.index[closest_index] - pd.to_datetime(fixed_date)).days

# print("Closest index to", fixed_date, ":", closest_index)
# print("Closest value to", fixed_date, ":", value)
# print("Difference in days:", difference)

print(f"Closest index to '{fixed_date}': '{closest_index}'")
print(f"Date at index to '{fixed_date}': '{date_at_index}'")
print(f"Difference in days: '{difference}'")
print(f"Value at index to '{fixed_date}': '{value}'")