In [None]:
import os
import sys
import json
import time
import numpy as np
import numpy.typing as npt
from datetime import datetime
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.colors as colors
from bson import json_util
import copy

# Project imports
sys.path.append(os.getcwd())
# from src.py.utils.generic_utils import wrapper
from src.py.scraping.keepa.keepa_analysis_utils import load_result_object, parse_csv, organize_csv, discretize_csv_smart, get_trends, get_timeseries_from_trends, remove_outliers, remove_outliers_csv

In [None]:
# Keepa domain id to analyse; kept as a string because it is spliced into paths.
domain = "1"
# JSON file containing the products of this domain grouped by category.
path_categories_json = f"data/keepa/generated/categories-domain-{domain}.json"
# Root folder containing the computed trends.
path_trends = "data/keepa/generated/trends"

In [None]:
# Load the category -> products mapping and print a short summary of it.
# The file is opened in a context manager so the handle is closed right after
# parsing instead of being left to the garbage collector.
with open(path_categories_json, "r") as categories_file:
	categories = json.load(categories_file)
print(f"Number of categories: {len(categories)}")
for category in categories:
	print(f"* {category} ({len(categories[category])} products)")

In [None]:
# Trial run: build the trends of a single category.
category = "INTERNAL_SOLID_STATE_DRIVES"
trends = {}
# trend key -> file paths of the products that were skipped for that key
skipped_product_keys_dict = {}
time_start = time.time()
for i, asin in enumerate(categories[category]):
	filepath = categories[category][asin]["file"]
	# Only the first product of each result object is used.
	product = load_result_object(filepath)["products"][0]
	# "\r" + end="" keeps the progress on a single console line.
	print(
		f"Product {i + 1}/{len(categories[category])} ({asin}): {product['title']}\r",
		end="")
	# parse -> organize -> drop values beyond 3 standard deviations
	csv_no_outliers = remove_outliers_csv(
		organize_csv(parse_csv(product["csv"])),
		max_std_multiplier=3,
	)
	# Return value is not used - presumably this modifies csv_no_outliers in place (TODO confirm).
	discretize_csv_smart(csv_no_outliers)
	trends, skipped_product_keys = get_trends(
		csv_no_outliers,
		trends,
		minimum_datapoints=100,
	)
	for k in skipped_product_keys:
		skipped_product_keys_dict.setdefault(k, []).append(filepath)
	# TODO: record total keys for product and keys skipped then calculate percentage
	# at the end calculate the number and percentage where all keys were skipped
time_end = time.time()
print()
print(f"Elapsed time: {round(time_end - time_start, 2)} seconds")

In [None]:
# For every trend key, report how many products were skipped for it.
print("Number of skipped products for each key:")
for key, skipped_files in skipped_product_keys_dict.items():
	print(f"* {key}: {len(skipped_files)}")

In [None]:
# For every trend key, report how many days (entries) its trend contains.
print("Number of days for each key:")
for key, trend_by_day in trends.items():
	print(f"* {key}: {len(trend_by_day)}")

In [None]:
# TODO: find out and document what each of these trend keys represents.

In [None]:
# Trend key to inspect; another key such as "AMAZON" can be substituted here.
# NOTE: the amazon_* names are kept for the cells below even though the key is configurable.
selected_trend_key = "NEW"
# Per-date average of the selected key, together with the matching dates.
amazon_values, amazon_dates = get_timeseries_from_trends(
	trends,
	selected_trend_key,
	operation="average",
)
# Per-date number of data points; the dates returned by this call are discarded.
amazon_counts, _ = get_timeseries_from_trends(
	trends,
	selected_trend_key,
	operation="count",
)
print(f"Got {len(amazon_values)} values for {selected_trend_key}")

In [None]:
# Smooth the per-day averages with a simple moving average.
window = 30 * 1  # moving-average window, in days
avg30 = np.ones(window) / window
amazon_values_avg30 = np.convolve(amazon_values, avg30, mode='same')

# Drop every (date, value) pair that lies before the cutoff date.
cutoff_date = datetime(2015, 1, 1)
# Take the first date on or after the cutoff (same rule as get_trend_traces)
# instead of requiring an exact match: an exact match silently disabled the cut
# whenever the cutoff day itself had no data. Falls back to 0 (no cut) when no
# date reaches the cutoff.
dates_cutoff_index = next(
	(index for index, date in enumerate(amazon_dates) if date >= cutoff_date),
	0,
)
amazon_values_cut = amazon_values[dates_cutoff_index:]
amazon_dates_cut = amazon_dates[dates_cutoff_index:]
amazon_counts_cut = amazon_counts[dates_cutoff_index:]
# The moving average is cut at the same index so it stays aligned with the dates.
amazon_dates_avg30 = amazon_dates_cut
amazon_values_avg30 = amazon_values_avg30[dates_cutoff_index:]

# Remove the last `window` days from every series (previously a hard-coded 30,
# which only matched while window == 30).
amazon_dates_cut = amazon_dates_cut[:-window]
amazon_values_cut = amazon_values_cut[:-window]
amazon_counts_cut = amazon_counts_cut[:-window]
amazon_dates_avg30 = amazon_dates_avg30[:-window]
amazon_values_avg30 = amazon_values_avg30[:-window]

# Plot as 2 linked subplots: the timeseries on top, the per-day data point count below.
top_row_height = 0.7
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.0125, row_heights=[top_row_height, 1 - top_row_height])

# top plot
fig.add_trace(go.Scatter(x=amazon_dates_cut, y=amazon_values_cut, name="Average price (USD)"), row=1, col=1)
# bottom plot
fig.add_trace(go.Scatter(x=amazon_dates_cut, y=amazon_counts_cut, name="Count (data points for day)"), row=2, col=1)

# Add the moving average as a thick orange line to the top plot
fig.add_trace(
	go.Scatter(
		x=amazon_dates_avg30,
		y=amazon_values_avg30,
		line=dict(color='orange', width=3),
		name=f"Avg. price ({window} day moving avg.)",
	),
	row=1,
	col=1,
)

# Give title to x axis
fig.update_xaxes(title_text="Date", row=2, col=1)

# Give title to y axis of top plot
fig.update_yaxes(title_text="Average price (USD)", row=1, col=1)
# Give title to y axis of bottom plot
fig.update_yaxes(title_text="Count (data points for day)", row=2, col=1)

# A key for which no product was skipped has no entry in the dict, so default
# to an empty list instead of raising KeyError.
skipped_for_key = skipped_product_keys_dict.get(selected_trend_key, [])
fig.update_layout(title=f"Price trends of {len(categories[category]) - len(skipped_for_key)} products in category \"{category}\" for \"{selected_trend_key}\" trend key")
# make it wide
fig.update_layout(width=1500, height=700)
fig.show(renderer="browser")
# fig.show()


# Conclusions

* is nice xd

# TODO

* Which std dev multiplier to use for removing extreme values? Current default is 2x std dev - should we use 2x, 3x, 5x or 10x?
* When adding to trends, add a way to require a minimum number of data points for a trend key in the product object before it is added to the trends. Default is -1 (no minimum); the recommended value is 100.

# DONE

* Remove extreme values from timeseries before adding to trends - get average of all values and remove value,date pairs that are too far from the average (e.g. 3x the average +/-)

In [None]:
def plot_trends(traces: list[dict],
                title: str,
                currency: str = "USD") -> go.Figure:
	'''
		Plots a list of trend traces as stacked subplots sharing one x axis.

		Row 1 shows the moving-average line of every price trace and row 2 the
		data point count of those traces. A trace whose trend key contains
		"COUNT_REVIEWS" or "RATING" is kept out of the price plot and gets a row
		of its own below (reviews first, then rating); if several traces match
		the same kind, the last one is used. The raw "y" series of a trace is
		not plotted.

		Args:
			traces: trace dicts, each providing "x", "counts", "avg30" (with "x"
				and "y"), "name" and "trend_key". An optional "window" entry is
				used for the legend label; without it the notebook-level
				`window` variable is used.
			title: title of the figure.
			currency: currency shown in the y axis title of the price plot.

		Returns:
			The assembled figure (it is not shown here).
	'''
	# Example element in traces:
	# trace = {
	# 	"x": "timeseries_dates",
	# 	"y": "timeseries_values",
	# 	"counts": "timeseries_counts",
	# 	"avg30": {
	# 		"x": "timeseries_dates",
	# 		"y": "timeseries_values",
	# 	},
	# 	"name": "Average price, ..."
	# 	"trend_key": "NEW",
	# 	"operation": "average",
	# }
	palette = colors.qualitative.Plotly

	# Split off the RATING and COUNT_REVIEWS traces (they are plotted separately - below)
	trace_rating = {}
	trace_reviews = {}
	price_traces = []
	for trace in traces:
		if "RATING" in trace["trend_key"]:
			trace_rating = trace
		elif "COUNT_REVIEWS" in trace["trend_key"]:
			trace_reviews = trace
		else:
			price_traces.append(trace)

	# (y axis title, legend prefix, trace, palette index) per extra row, in display order
	extra_rows = []
	if trace_reviews:
		extra_rows.append(("Avg. reviews per day", "Reviews", trace_reviews, 0))
	if trace_rating:
		extra_rows.append(("Average rating", "Rating", trace_rating, 1))

	# Price row + count row + one row per extra trace that is actually plotted.
	# Counting plotted traces (rather than matches) avoids empty rows when
	# several traces match the same kind.
	rows = 2 + len(extra_rows)

	top_row_height = 0.7
	fig = make_subplots(
		cols=1,
		rows=rows,
		shared_xaxes=True,
		vertical_spacing=0.0125,
		row_heights=[top_row_height] + [1 - top_row_height] * (rows - 1),
	)

	# Top plot: moving average line of every price trace.
	for i, trace in enumerate(price_traces):
		# Prefer a window stored on the trace; otherwise use the notebook-level
		# `window` variable, as before.
		label_window = trace.get("window", window)
		fig.add_trace(
			go.Scatter(
				x=trace["avg30"]["x"],
				y=trace["avg30"]["y"],
				name=f"Avg{label_window} ({trace['name']})",
				# wrap around so more traces than palette colours cannot raise IndexError
				line=dict(width=3, color=palette[i % len(palette)]),
			),
			row=1,
			col=1,
		)

	# Second plot: data point count of every price trace, in the matching colour.
	for i, trace in enumerate(price_traces):
		fig.add_trace(
			go.Scatter(
				x=trace["x"],
				y=trace["counts"],
				name=f"Count ({trace['name']})",
				line=dict(width=1, color=palette[i % len(palette)]),
			),
			row=2,
			col=1,
		)

	# Extra rows (reviews, rating) start right below the count plot.
	for row, (axis_title, legend_prefix, trace, color_index) in enumerate(extra_rows, start=3):
		fig.add_trace(
			go.Scatter(
				x=trace["avg30"]["x"],
				y=trace["avg30"]["y"],
				name=f"{legend_prefix} ({trace['name']})",
				line=dict(width=1, color=palette[color_index]),
			),
			row=row,
			col=1,
		)
		fig.update_yaxes(title_text=axis_title, row=row, col=1)

	# Give title to x axis (bottom row only - the x axes are shared)
	fig.update_xaxes(title_text="Date", row=rows, col=1)
	# Give title to y axis of top plot
	fig.update_yaxes(title_text=f"Average price ({currency})", row=1, col=1)
	# Give title to y axis of the count plot
	if price_traces:
		fig.update_yaxes(title_text="Data points per day", row=2, col=1)
	# Fixed width; 100 px of extra height per extra row
	fig.update_layout(title=title, width=1500, height=900 + (rows - 2) * 100)
	return fig


def get_trend_traces(
    trends: dict[str, dict[np.datetime64, npt.NDArray[np.float64]]],
    trend_keys: list[str],
    operations: list[str],
    window: int = 30 * 1,
    cutoff_date: datetime = datetime(2019, 1, 1)
) -> list[dict]:
	'''
		Gets a list of traces to plot from the given trends dictionary and settings

		For every (trend key, operation) pair the timeseries is fetched with
		get_timeseries_from_trends, smoothed with a moving average of `window`
		points, cut so that it starts at the first date on or after
		`cutoff_date` (left uncut when no date reaches the cutoff) and finally
		stripped of its last `window` entries.

		Args:
			trends: trends dictionary passed on to get_timeseries_from_trends.
			trend_keys: trend keys to build traces for.
			operations: operation per trend key (same length as trend_keys).
			window: moving average window, also the number of trailing entries removed.
			cutoff_date: entries before this date are dropped.

		Returns:
			One dict per trend key with "x", "y", "counts", "avg30" ("x", "y"),
			"name", "trend_key", "operation" and "window".

		Raises:
			ValueError: if trend_keys and operations differ in length or window < 1.
	'''
	# zip() would silently ignore the surplus entries of the longer list
	if len(trend_keys) != len(operations):
		raise ValueError(
			f"trend_keys and operations must have the same length "
			f"({len(trend_keys)} != {len(operations)})")
	if window < 1:
		raise ValueError(f"window must be at least 1, got {window}")

	# moving average kernel - identical for every trend key
	avg_kernel = np.ones(window) / window
	traces = []
	for trend_key, operation in zip(trend_keys, operations):
		values, dates = get_timeseries_from_trends(trends,
		                                           trend_key,
		                                           operation=operation)
		counts, _ = get_timeseries_from_trends(trends,
		                                       trend_key,
		                                       operation="count")
		values_avg30 = np.convolve(values, avg_kernel, mode='same')
		# index of the first date on or after the cutoff; 0 (no cut) if there is none
		dates_cutoff_index = next(
			(i for i, date in enumerate(dates) if date >= cutoff_date),
			0,
		)
		# Cut at the cutoff, then remove the last {window} entries. The same trim is
		# applied to every series - counts included - so that each y series has the
		# same length as the dates it is plotted against.
		dates_cut = dates[dates_cutoff_index:][:-window]
		values_cut = values[dates_cutoff_index:][:-window]
		counts_cut = counts[dates_cutoff_index:][:-window]
		values_avg30 = values_avg30[dates_cutoff_index:][:-window]
		# add trace
		traces.append({
			"x": dates_cut,
			"y": values_cut,
			"counts": counts_cut,
			"avg30": {
				"x": dates_cut,
				"y": values_avg30,
			},
			"name": f"{trend_key} ({operation})",
			"trend_key": trend_key,
			"operation": operation,
			# recorded so a consumer can label the moving average correctly
			"window": window,
		})
	return traces

# currency_map = {
# 	"1": "USD", # TODO: add others
# }
# domain = "1"
# # category = "VIDEO_CARD"
# title = f"Trends of {len(categories[category])} products for category \"{category}\""
# currency = currency_map[domain]
# window = 30 * 6 # 30 days # sometimes good to set as 30 * 6 (6 months) for smoother trends
# cutoff_date = datetime(2017, 1, 7) # datetime(2019, 1, 7)
# trend_keys_operations = [
# 	("NEW", "average"),
# 	("USED", "average"),
# 	("AMAZON", "average"),
# 	("EBAY_NEW_SHIPPING", "average"),
# 	("EBAY_USED_SHIPPING", "average"),
# 	("RATING", "average"),
# 	("COUNT_REVIEWS", "average"),
# ]
# # trend_keys = [
# # 	"NEW",
# # 	"USED",
# # 	"AMAZON",
# # 	"EBAY_NEW_SHIPPING",
# # 	"EBAY_USED_SHIPPING",
# # ]
# # operations = [
# # 	"average",
# # 	"average",
# # 	"average",
# # 	"average",
# # 	"average",
# # ]
# trend_keys = [t[0] for t in trend_keys_operations]
# operations = [t[1] for t in trend_keys_operations]
# trend_traces = get_trend_traces(trends, trend_keys, operations, window=window, cutoff_date=cutoff_date)
# fig = plot_trends(trend_traces, title, currency)
# fig.show(renderer="browser")

In [None]:
# Calculate trends for all categories and save to file (for each category) (for selected domain)
# so it won't have to be calculated again (unless the data or algorithm changes)

In [None]:
def get_trends_for_category(category: dict) -> dict:
	'''
		Builds the combined trends of all products of one category.

		`category` maps each ASIN to an entry whose "file" value is the path of
		the product's result object. Returns a dict with the "trends", the
		"skipped_product_keys_dict" (trend key -> files of the products skipped
		for that key) and the "time_elapsed" in seconds.
	'''
	started_at = time.time()
	trends = {}
	skipped_product_keys_dict = {}
	product_count = len(category)
	for position, (asin, entry) in enumerate(category.items(), start=1):
		filepath = entry["file"]
		# only the first product of the result object is used
		product = load_result_object(filepath)["products"][0]
		# "\r" + end="" keeps the progress on a single console line
		print(f"Product {position}/{product_count} ({asin}): {product['title']}\r",
		      end="")
		# parse -> organize -> drop values beyond 3 standard deviations
		csv_no_outliers = remove_outliers_csv(
			organize_csv(parse_csv(product["csv"])),
			max_std_multiplier=3,
		)
		# return value is not used - presumably modifies csv_no_outliers in place (TODO confirm)
		discretize_csv_smart(csv_no_outliers)
		trends, skipped_product_keys = get_trends(
			csv_no_outliers,
			trends,
			minimum_datapoints=100,
		)
		for skipped_key in skipped_product_keys:
			skipped_product_keys_dict.setdefault(skipped_key, []).append(filepath)
		# TODO: record total keys for product and keys skipped then calculate percentage
		# at the end calculate the number and percentage where all keys were skipped
	# finish the progress line
	print()
	return {
		"trends": trends,
		"skipped_product_keys_dict": skipped_product_keys_dict,
		"time_elapsed": round(time.time() - started_at, 2),
	}


def get_categories_object(categories_path) -> dict:
	'''
		Reads the JSON file at categories_path and returns its parsed content
	'''
	with open(categories_path, "r") as categories_file:
		return json.load(categories_file)


def save_trends_result(return_dict: dict, path_trends: str) -> None:
	'''
		Saves a trends result dictionary to the file path_trends with np.save.

		The directory part of the path is created when it does not exist yet.
		The dictionary is stored as a pickled object, so it has to be read back
		with np.load(path, allow_pickle=True).item().

		Args:
			return_dict: dictionary to save.
			path_trends: path of the target file.
	'''
	directory = os.path.dirname(path_trends)
	# A bare file name has no directory part and os.makedirs("") would raise;
	# exist_ok avoids failing if the directory appears between check and creation.
	if directory:
		os.makedirs(directory, exist_ok=True)
	# JSON is not used here: the trends are keyed by datetime objects, which the
	# JSON serializers tried (including bson.json_util) cannot handle as dict keys.
	with open(path_trends, "wb") as f:
		np.save(f, return_dict) # type: ignore


def process_trends_for_domain(domain: str) -> dict:
	'''
		Processes trends for all categories in a domain

		The result of each category is cached in "<trends root>/<domain>/<category>.npy":
		an existing file is loaded instead of being recomputed. The combined
		result is saved to "all-trends-domain-<domain>.npy" and returned as a
		dict with "trends" and "skipped_product_keys_dict" (both keyed by
		category) and the summed "time_elapsed".
	'''
	trends_root = "data/keepa/generated/trends"  # root folder containing trends
	# file containing products grouped by category
	categories = get_categories_object(
		f"data/keepa/generated/categories-domain-{domain}.json")
	category_count = len(categories)
	domain_trends = {}
	skipped_product_keys_dict = {}
	time_elapsed = 0
	for position, (category, products) in enumerate(categories.items(), start=1):
		print(
			f"Processing category \"{category}\" ({len(products)} products) ({position}/{category_count})"
		)
		filepath = os.path.join(trends_root, domain, f"{category}.npy")
		loaded_from_cache = os.path.exists(filepath)
		if loaded_from_cache:
			print(
				f"Skipping category \"{category}\" - already exists at \"{filepath}\" - loading..."
			)
			# NOTE: allow_pickle=True unpickles the file - only load cache files
			# that were written by this notebook itself.
			category_result = np.load(filepath, allow_pickle=True).item()
		else:
			category_result = get_trends_for_category(products)
			save_trends_result(category_result, filepath)
		# bookkeeping is the same for loaded and freshly computed results
		domain_trends[category] = category_result["trends"]
		skipped_product_keys_dict[category] = category_result[
			"skipped_product_keys_dict"]
		time_elapsed += category_result["time_elapsed"]
		if loaded_from_cache:
			print(
				f"Elapsed time (loaded) for category \"{category}\": {category_result['time_elapsed']} seconds"
			)
		else:
			print(
				f"Elapsed time for category \"{category}\": {category_result['time_elapsed']} seconds"
			)
		print()
	print(f"Total elapsed time: {time_elapsed} seconds")
	domain_result = {
		"trends": domain_trends,
		"skipped_product_keys_dict": skipped_product_keys_dict,
		"time_elapsed": time_elapsed,
	}
	# Save trends for domain to file
	save_trends_result(
		domain_result,
		os.path.join(trends_root, f"all-trends-domain-{domain}.npy"))
	return domain_result

# Uncomment when debugging:
# domain = "1" # chosen domain (US)
# domain_trends_results = process_trends_for_domain(domain)


In [None]:
# Compute the domain trends here (or load them from the per-category .npy
# cache) so this cell also runs on a fresh kernel; previously it read
# `domain_trends_results` before any cell had defined it.
domain_trends_results = process_trends_for_domain(domain)
trends_all = domain_trends_results["trends"]

In [None]:
# Sanity check: load one saved category file and list the trend keys it contains.
# The file holds a pickled dict, hence allow_pickle=True and .item().
test_category_path = "data/keepa/generated/trends/1/EXTERNAL_SOLID_STATE_DRIVES.npy"
test_category = np.load(test_category_path, allow_pickle=True).item()
print("Trend keys:")
for trend_key in test_category["trends"].keys():
	print(f"* {trend_key}")
# looks good...

In [None]:
# Chosen Keepa domain ("1" is the US one).
domain = "1"
# Compute the trends of every category of that domain.
domain_trends_results = process_trends_for_domain(domain=domain)

In [None]:
# Trends of all categories, keyed by category name.
all_trends = domain_trends_results["trends"]
# Reload the categories of the chosen domain and print a summary of them.
categories_path = f"data/keepa/generated/categories-domain-{domain}.json"
categories = get_categories_object(categories_path)
print(f"Number of categories: {len(categories)}")
for category, products in categories.items():
	print(f"* {category} ({len(products)} products)")

In [None]:
# Lookup tables for the chosen domain.
currency_map = {"1": "USD"}  # TODO: add others
domain_map = {"1": "US"}

# Plot settings; `domain` was defined above when loading / processing trends.
category = "VIDEO_CARD"
trends = all_trends[category]
title = f"Trends of {len(categories[category])} products for category \"{category}\" (Amazon {domain_map[domain]})"
currency = currency_map[domain]
# moving average window in days; 30 * 6 (6 months) sometimes gives smoother trends
window = 30 * 3
cutoff_date = datetime(2017, 1, 7)
# (trend key, operation) pairs to plot
trend_keys_operations = [
	("NEW", "average"),
	("USED", "average"),
	("AMAZON", "average"),
	("EBAY_NEW_SHIPPING", "average"),
	("EBAY_USED_SHIPPING", "average"),
	("RATING", "average"),
	("COUNT_REVIEWS", "average"),
]
trend_keys = [trend_key for trend_key, _ in trend_keys_operations]
operations = [operation for _, operation in trend_keys_operations]

trend_traces = get_trend_traces(
	trends,
	trend_keys,
	operations,
	window=window,
	cutoff_date=cutoff_date,
)
fig = plot_trends(trend_traces, title, currency)
fig.show(renderer="browser")
# fig.show()

In [None]:
# Self-contained cell: plot every category of a domain with one fixed setting
# and save each plot as html and png.
domain = "1"  # chosen domain (US)
domain_trends_results = process_trends_for_domain(domain)
all_trends = domain_trends_results["trends"]
categories = get_categories_object(
	f"data/keepa/generated/categories-domain-{domain}.json")

currency_map = {"1": "USD"}  # TODO: add others
domain_map = {"1": "US"}
# (trend key, operation) pairs to plot
trend_keys_operations = [
	("NEW", "average"),
	("USED", "average"),
	("AMAZON", "average"),
	("EBAY_NEW_SHIPPING", "average"),
	("EBAY_USED_SHIPPING", "average"),
	("RATING", "average"),
	("COUNT_REVIEWS", "average"),
]
trend_keys = [trend_key for trend_key, _ in trend_keys_operations]
operations = [operation for _, operation in trend_keys_operations]
currency = currency_map[domain]
# moving average window in days; 30 * 3 smooths by quarter, 30 * 6 gives smoother trends
window = 30 * 3
cutoff_date = datetime(2017, 1, 7)

print("\nGenerating plots for all categories...\n")
category_count = len(categories)
for position, category in enumerate(categories, start=1):
	# A failing category is reported and skipped so the remaining ones still get plotted.
	try:
		print(f"Plotting \"{category}\" ({len(categories[category])} products) ({position}/{category_count})")
		trends = all_trends[category]
		title = f"Trends of {len(categories[category])} products for category \"{category}\" (Amazon {domain_map[domain]})"
		trend_traces = get_trend_traces(
			trends,
			trend_keys,
			operations,
			window=window,
			cutoff_date=cutoff_date,
		)
		fig = plot_trends(trend_traces, title, currency)
		fig.show(renderer="browser")
		plot_path = f"data/keepa/generated/plots/{domain}/{category}.html"
		plot_directory = os.path.dirname(plot_path)
		if not os.path.exists(plot_directory):
			os.makedirs(plot_directory)
		print("Saving html...")
		fig.write_html(plot_path)
		print("Saving png...")
		# the png is written next to the html file, at double resolution
		fig.write_image(plot_path.replace(".html", ".png"), scale=2)
		print("Saved!")
		print()
	except Exception as e:
		print(f"Error plotting \"{category}\" ({len(categories[category])} products) ({position}/{category_count})")
		print(e)
		print()
print("ALL DONE!")