In [None]:
import os
import json
from keepa.interface import keepa_minutes_to_time, parse_csv
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from typing import Tuple
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.colors as colors


In [None]:
# JSON file of the sample product (loaded below with json.load; its "products" list is read)
path_sample_product = "data/keepa/products/domains/1/B0B7CPSN2K.json"
# Folder that exported figures are written to
path_visualizations_root = "data/keepa/generated/visualizations"

In [None]:
# Load the sample product; the context manager guarantees the file handle is closed
with open(path_sample_product, "r") as file_sample_product:
	result_object = json.load(file_sample_product)
# Only the first product of the response is used in this notebook
product = result_object["products"][0]
print(f"ASIN: {product['asin']}, Title: {product['title']}")

In [None]:
# Run keepa's parse_csv on the product's "csv" field; the result is used as a dict of series below
parsed_csv = parse_csv(product["csv"])
print(f"Keys: {parsed_csv.keys()}")

In [None]:
# Peek at the "df_NEW" entry (presumably a pandas DataFrame, given the df_ prefix and .head() — confirm)
df_new = parsed_csv["df_NEW"]
df_new.head()

In [None]:
# NOTE: this is the old version - look below for improved version
def organize_csv(parsed_csv: dict) -> dict:
	'''
		Groups the entries of the parsed csv under a shared root key and returns the grouping.
		The root key is the entry key without its "df_" prefix, or otherwise without its "_time" suffix.
		Example mapping: "AMAZON": {"AMAZON": data_1, "AMAZON_time": data_2, "df_AMAZON": data_3}
	'''
	grouped = {}
	for key, data in parsed_csv.items():
		if key.startswith("df_"):
			# the prefix takes precedence when a key carries both the prefix and the suffix
			root = key[3:]
		elif key.endswith("_time"):
			root = key[:-5]
		else:
			root = key
		grouped.setdefault(root, {})[key] = data
	return grouped

def print_organized_csv(organized_csv: dict):
	'''
		Prints a header line, then one line per root key showing the keys stored under it.
	'''
	print(f"Organized csv:")
	for root_key in organized_csv:
		print(f"{root_key}: {organized_csv[root_key].keys()}")

# Group the parsed series by root key and list which entries each group holds
organized_csv = organize_csv(parsed_csv)
print_organized_csv(organized_csv)

In [None]:
def get_clean_date(date: datetime) -> datetime:
	'''
		Truncates a datetime to midnight: year, month and day are kept, the time of day is zeroed.
		The input object is left untouched; a new datetime is returned.
	'''
	midnight_fields = {"hour": 0, "minute": 0, "second": 0, "microsecond": 0}
	return datetime.replace(date, **midnight_fields)

# arr_values is a numpy array of prices
def discretize(arr_values: np.array, arr_dates: np.array) -> Tuple[np.array, np.array]:
	'''
		Reduces a time series to at most one data point per run of same-day entries.
		Every date is truncated to year, month and day; within each run of consecutive
		entries sharing a day only the last (most recent) entry is kept.
		Raises ValueError when the two arrays differ in length.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	total = len(arr_values)
	days = [get_clean_date(moment) for moment in arr_dates]
	kept_values = []
	kept_days = []
	for position in range(total):
		is_final_entry = position == total - 1
		# an entry survives when it closes its day: either the series ends or the next entry is on another day
		if is_final_entry or days[position] != days[position + 1]:
			kept_days.append(days[position])
			kept_values.append(arr_values[position])
	return np.array(kept_values), np.array(kept_days)

def discretize_smart_old(arr_values: np.array, arr_dates: np.array) -> Tuple[np.array, np.array]:
	'''
		Similar to discretize, but trying to fill the gaps in the time series by
		using the first data point within a day as the value to impute the previous day if missing.
		The dates must be consecutive to impute.
		The last data point within the day is used for that specific day.

		Earlier implementation (see the _old suffix); discretize_smart carries the same docstring
		and appears to be its replacement.
		Raises ValueError when the two arrays differ in length.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	arr_dates_discrete = []
	arr_values_discrete = []
	# temp_value = None # hosts the first data point within a day while the dates are the same
	# temp_date = None
	# early_* holds the first data point seen for the current day
	early_value = None
	early_date = None
	# last_* holds the most recent later data point seen on that same day (None while the day has a single point)
	last_value = None
	last_date = None
	for i in range(len(arr_values)):
		date = get_clean_date(arr_dates[i])
		value = arr_values[i]
		if i == len(arr_values) - 1:
			# final element: it is emitted first, then any pending last_* entry is emitted after it
			# NOTE(review): the pending early_* entry is not emitted in this branch, and the pending
			# last_* entry ends up after the final element — confirm whether that ordering was intended
			arr_dates_discrete.append(date)
			arr_values_discrete.append(value)
			if last_date is not None:
				arr_dates_discrete.append(last_date)
				arr_values_discrete.append(last_value)
			break
		if early_date is None:
			# very first element: just remember it
			early_date = date
			early_value = value
			continue
		if date != early_date:
			# the day changed: emit the first data point of the finished day
			arr_dates_discrete.append(early_date)
			arr_values_discrete.append(early_value)
			if last_date is not None:
				early_date_yesterday = last_date - timedelta(days=1)
				# NOTE(review): arr_dates_discrete[-1] is the early_date appended just above, which equals
				# last_date here, so this comparison looks like it is always true — confirm
				if early_date_yesterday != arr_dates_discrete[-1]: # time check condition
					# fill the gap
					arr_dates_discrete.append(early_date_yesterday)
					arr_values_discrete.append(early_value)
				arr_dates_discrete.append(last_date)
				arr_values_discrete.append(last_value)
				last_date = None
				last_value = None
			# start tracking the new day
			early_date = date
			early_value = value
			continue
		if date == early_date:
			# another data point on the current day: remember it as the latest one
			last_date = date
			last_value = value
			continue
	return np.array(arr_values_discrete), np.array(arr_dates_discrete)

def discretize_smart(arr_values: np.ndarray, arr_dates: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
	'''
		Similar to discretize, but trying to fill the gaps in the time series by
		using the first data point within a day as the value to impute the previous day if missing.
		The last data point within the day is used for that specific day.

		Rules per run of consecutive same-day data points:
		* a day with a single data point is emitted as is, nothing is imputed
		* a day with several data points emits its last value for that day; its first value
		  is additionally emitted for the previous day unless the output already ends on that previous day
		  (this also applies when the series starts with such a day, as nothing has been emitted yet)

		Returns (values, dates) as numpy arrays; dates are truncated to midnight.
		Raises ValueError when the two arrays differ in length.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	# collapse every run of same-day points into [day, first value, last value, has several points]
	day_runs = []
	for raw_date, value in zip(arr_dates, arr_values):
		day = get_clean_date(raw_date)
		if day_runs and day_runs[-1][0] == day:
			day_runs[-1][2] = value
			day_runs[-1][3] = True
		else:
			day_runs.append([day, value, value, False])
	# emit one point per day and impute the previous day where possible
	arr_dates_discrete = []
	arr_values_discrete = []
	for day, first_value, last_value, has_several_points in day_runs:
		if has_several_points:
			previous_day = day - timedelta(days=1)
			# the emptiness check matters for a series whose first day has several points:
			# indexing [-1] on the still-empty output would raise IndexError
			if not arr_dates_discrete or arr_dates_discrete[-1] != previous_day: # date check condition (can make it more strict by checking the time as well)
				arr_dates_discrete.append(previous_day)
				arr_values_discrete.append(first_value)
		arr_dates_discrete.append(day)
		arr_values_discrete.append(last_value)
	return np.array(arr_values_discrete), np.array(arr_dates_discrete)

def test_discretize_smart():
	'''
		Runs discretize_smart on a small hand-built series, prints the result
		and asserts that it matches the expected values and dates.
	'''
	print("Test discretize_smart")
	values = np.array([1, 2, 3, 4, 5, 6])
	dates = np.array([
		datetime(2023, 11, 18, 10), # single point on its day: kept as is
		datetime(2023, 11, 20, 10), # next 3 elements share a date
		datetime(2023, 11, 20, 11), #
		datetime(2023, 11, 20, 12), # the last element is kept for the 20th, the first one is imputed onto the missing 19th
		datetime(2023, 11, 21, 10), # next 2 elements share a date
		datetime(2023, 11, 21, 11), # the last element is kept, the first one can't be imputed as the 20th is already present
	])
	expected_values = [1, 2, 4, 6]
	expected_dates = [datetime(2023, 11, 18), datetime(2023, 11, 19), datetime(2023, 11, 20), datetime(2023, 11, 21)]
	values_discrete, dates_discrete = discretize_smart(values, dates)
	for v, d in zip(values_discrete, dates_discrete):
		print(f"{v}, {d}")
	# fail loudly instead of relying on someone reading the printed output
	assert list(values_discrete) == expected_values, f"unexpected values: {values_discrete}"
	assert list(dates_discrete) == expected_dates, f"unexpected dates: {dates_discrete}"

# Run the discretize_smart example whenever this cell executes
test_discretize_smart()

In [None]:

# TODO: fill small gaps (n = 5 for example) by linear interpolation between the two closest points
#       Determine how big are average gaps and how big can n be so it's still a good approximation

def fill_gaps_linear(arr_values: np.array, arr_dates: np.array, n: int) -> Tuple[np.array, np.array]:
	'''
		Inserts linearly interpolated points for the missing days between consecutive data points.
		A gap is filled only when it spans n days or fewer; larger gaps are left open.
		Raises ValueError when the two arrays differ in length.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	filled_values = []
	filled_dates = []
	one_day = timedelta(days=1)
	for position, (current_value, current_date) in enumerate(zip(arr_values, arr_dates)):
		if position > 0:
			# whole days strictly between the previous point and this one
			missing_days = (current_date - filled_dates[-1]).days - 1
			if missing_days <= n:
				start_value = filled_values[-1]
				# e.g. from 1 to 5 with 3 missing days -> 2, 3 and 4
				for step in range(1, missing_days + 1):
					filled_dates.append(filled_dates[-1] + one_day)
					filled_values.append(start_value + (current_value - start_value) / (missing_days + 1) * step)
		# the real data point always follows whatever was interpolated before it
		filled_dates.append(current_date)
		filled_values.append(current_value)
	return np.array(filled_values), np.array(filled_dates)

def test_fill_gaps_linear():
	'''
		Runs fill_gaps_linear (n = 3) on a small discretized series, prints the result
		and asserts that the 3-day gap is interpolated while the 4-day gap is left open.
	'''
	print("Test fill_gaps_linear")
	# we assume these are already discretized
	values = np.array([1, 5, 6])
	dates = np.array([
		datetime(2023, 11, 11),
		datetime(2023, 11, 15), # 3 missing days before this point: filled
		datetime(2023, 11, 20) # 4 missing days before this point: too large for n = 3
	])
	expected_values = [1, 2, 3, 4, 5, 6]
	expected_dates = [
		datetime(2023, 11, 11),
		datetime(2023, 11, 12),
		datetime(2023, 11, 13),
		datetime(2023, 11, 14),
		datetime(2023, 11, 15),
		datetime(2023, 11, 20),
	]
	values_filled, dates_filled = fill_gaps_linear(values, dates, 3)
	for v, d in zip(values_filled, dates_filled):
		print(f"{v}, {d}")
	# fail loudly instead of relying on someone reading the printed output
	assert list(values_filled) == expected_values, f"unexpected values: {values_filled}"
	assert list(dates_filled) == expected_dates, f"unexpected dates: {dates_filled}"


# Run the fill_gaps_linear example whenever this cell executes
test_fill_gaps_linear()

In [None]:
# TODO: count the number of elements for a simple discretized and smart discretized time series
#       and compare the results - see how much we gain by using the smart discretization
#       Similarly compare the results for the linear interpolation

def test_counts():
	'''
		Prints how many data points each imputation method produces for one small sample series,
		followed by a short explanation of the outcome.
	'''
	sample_values = np.array([1, 2, 3, 4, 5, 6, 7, 8])
	# (day, hour) pairs in November 2023; a missing hour means midnight
	sample_dates = np.array([datetime(2023, 11, *day_and_hour) for day_and_hour in (
		(11,), (15,), (18, 10), (20, 10), (20, 11), (20, 12), (21, 10), (21, 11),
	)])
	max_gap = 3
	# baseline discretization
	baseline_values, _ = discretize(sample_values, sample_dates)
	# smart discretization
	smart_values, smart_dates = discretize_smart(sample_values, sample_dates)
	# linear interpolation applied directly to the input
	linear_values, _ = fill_gaps_linear(sample_values, sample_dates, max_gap)
	# linear interpolation applied on top of the smart discretization
	smart_linear_values, _ = fill_gaps_linear(smart_values, smart_dates, max_gap)
	report = [
		"Imputation counts:",
		f"  Original: {len(sample_values)}",
		f"  Discretized: {len(baseline_values)}",
		f"      function: keeps the most recent data point for each day (cuts off the time portion of the datetime object)",
		f"  Smart discretized: {len(smart_values)}",
		f"      function: similar to discretized, but tries to fill the gaps by using the first data point within a day as the value to impute the previous day if missing",
		f"  Linear interpolation: {len(linear_values)}",
		f"      function: fills the gaps by linear interpolation between the two closest points where the gap is smaller or equal to n days",
		f"  Smart discretized + linear interpolation: {len(smart_linear_values)}",
		"",
		f"Explanation: Direct linear interpolation adds the most points,",
		f"however smart discretization + linear interpolation is the best option",
		f"as it adds more plausible points from legitimate data before filling the gaps.",
	]
	print("\n".join(report))

# Print the point counts of every imputation method for the sample series
test_counts()

In [None]:
def plot_differences(values: np.array, dates: np.array) -> go.Figure:
	'''
		Builds a two-row figure comparing the original time series with every imputation method.
		Top row ("Timeline"): one flat line per method, showing only which dates the method covers.
		Bottom row ("Time series"): the actual values per method; colours are pinned to the default
		qualitative palette so they line up with the automatically coloured top row
		(assuming the default plotly template is active).

		values / dates: equally long arrays of data points and their datetimes;
		dates must not be empty because dates[0] anchors the x-axis ticks.
		Returns the figure; showing or saving it is left to the caller.
	'''
	n = 2 # largest gap (in days) that linear interpolation is allowed to fill
	top_row_height = 0.3
	opacity = 1 # lower it (e.g. 0.3) to make overlapping series in the bottom row visible
	one_day_ms = 24 * 60 * 60 * 1000 # plotly expresses dtick on date axes in milliseconds

	# compute every variant of the series up front
	values_discrete, dates_discrete = discretize(values, dates)
	values_discrete_smart, dates_discrete_smart = discretize_smart(values, dates)
	# basic linear interpolation works on the discretized data and is discretized again afterwards
	values_filled, dates_filled = fill_gaps_linear(values_discrete, dates_discrete, n)
	values_filled_discrete, dates_filled_discrete = discretize(values_filled, dates_filled)
	values_filled_smart, dates_filled_smart = fill_gaps_linear(values_discrete_smart, dates_discrete_smart, n)

	# (legend name used in the top row, hover name used in the bottom row, x, y) in drawing order
	series = [
		("Original", "Original", dates, values),
		("Discretized", "Discretized", dates_discrete, values_discrete),
		("Smart discretized", "Smart discretized", dates_discrete_smart, values_discrete_smart),
		(f"Linear interpolation (n = {n})", "Linear interpolation", dates_filled_discrete, values_filled_discrete),
		(f"Smart + linear (n = {n})", f"Smart + linear (n = {n})", dates_filled_smart, values_filled_smart),
	]

	fig = make_subplots(
		rows=2,
		cols=1,
		shared_xaxes=True,
		vertical_spacing=0.025,
		row_heights=[top_row_height, 1 - top_row_height],
	)

	# top row: each method gets its own horizontal lane, the first method on the highest lane
	for index, (legend_name, _, series_dates, series_values) in enumerate(series):
		lane = len(series) - 1 - index
		fig.add_trace(go.Scatter(
			x=series_dates,
			y=[lane] * len(series_values),
			name=legend_name,
			mode="lines+markers"
			),
			row=1,
			col=1,
		)

	# bottom row: the real values, hidden from the legend as the top row already lists every method
	for index, (_, hover_name, series_dates, series_values) in enumerate(series):
		fig.add_trace(go.Scatter(
			x=series_dates,
			y=series_values,
			name=hover_name,
			mode="lines+markers",
			line={
				"color": colors.qualitative.Plotly[index]
			},
			opacity=opacity,
			showlegend=False,
			),
			row=2,
			col=1,
		)

	# one daily tick on both x-axes, starting at the first data point
	daily_ticks = dict(
		tickmode='linear',
		tick0=dates[0],
		dtick=one_day_ms,
	)
	fig.update_layout(
		title_text="Differences between the original time series and different imputation methods",
		width=1000,
		height=320,
		margin=dict(l=20, r=20, b=20, t=50), # tight margins
		xaxis=dict(daily_ticks),
		xaxis2=dict(daily_ticks),
		# bottom row y-axis: a tick on every integer
		yaxis2=dict(
			tickmode='linear',
			tick0=0,
			dtick=1,
		),
	)

	# the lane numbers of the top row carry no meaning, so its y-axis ticks are removed
	fig.update_yaxes(title_text="Timeline", tickvals=[], row=1, col=1)
	fig.update_yaxes(title_text="Time series", row=2, col=1)

	return fig


# Build the comparison figure for a hand-made series that mixes single-point days,
# days with several points and gaps of different sizes
fig = plot_differences(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), np.array([
		datetime(2023, 11, 11),
		datetime(2023, 11, 15, 5),
		datetime(2023, 11, 15, 20),
		datetime(2023, 11, 18, 10),
		datetime(2023, 11, 20, 10),
		datetime(2023, 11, 20, 11),
		datetime(2023, 11, 20, 12),
		datetime(2023, 11, 21, 10),
		datetime(2023, 11, 21, 11)
]))

fig.show()

# Reading notes printed underneath the figure
print(f"Notes:")
print(f"* Discretization assumes we get 'end of day' values, however the chart shows 'start of day' values")
print(f"* Basic linear interpolation uses the discretized data")
print(f"* Smart discretization + linear interpolation is the best option")

In [None]:
# Save the plot; the target folder is created first so the export also works on a fresh checkout
os.makedirs(path_visualizations_root, exist_ok=True)
filepath = os.path.join(path_visualizations_root, "imputation_methods.png")
fig.write_image(filepath, scale=3) # scale makes it bigger / resolution higher

In [None]:
def get_gaps_from_timeseries(dates: np.array) -> np.array:
	'''
		Collects the size (in missing whole days) of every gap between neighbouring dates.
		Neighbours that are at most one day apart produce no entry.
		Assumes the dates are sorted and discretized.
	'''
	missing_days = [
		(later - earlier).days - 1
		for earlier, later in zip(dates[:-1], dates[1:])
	]
	return np.array([gap for gap in missing_days if gap > 0])

def get_gaps_statistics(gaps: np.ndarray) -> dict:
	'''
		Summarises an array of gap sizes (as produced by get_gaps_from_timeseries).
		Returned keys: number_of_gaps, average_gap, median_gap,
		number_of_median_gaps (how many gaps equal the median), max_gap, min_gap.
		A series without any gap yields zeros for every statistic instead of raising.
	'''
	gaps = np.asarray(gaps) # lets plain lists work with the element-wise comparison below
	if gaps.size == 0:
		# np.max / np.min raise on empty input and np.mean / np.median return NaN with a warning
		return {
			"number_of_gaps": 0,
			"average_gap": 0.0,
			"median_gap": 0.0,
			"number_of_median_gaps": 0,
			"max_gap": 0,
			"min_gap": 0,
		}
	median_gap = np.median(gaps)
	return {
		"number_of_gaps": len(gaps),
		"average_gap": np.mean(gaps),
		"median_gap": median_gap,
		"number_of_median_gaps": np.sum(gaps == median_gap),
		"max_gap": np.max(gaps),
		"min_gap": np.min(gaps),
	}

def print_gap_statistics(gaps: np.array):
	'''
		Prints the statistics computed by get_gaps_statistics for the given gap sizes, one per line.
	'''
	statistics = get_gaps_statistics(gaps)
	# (printed label, key in the statistics dict) in output order
	labelled_keys = (
		("Number of gaps", "number_of_gaps"),
		("Average gap", "average_gap"),
		("Median gap", "median_gap"),
		("Number of median gaps", "number_of_median_gaps"),
		("Max gap", "max_gap"),
		("Min gap", "min_gap"),
	)
	print(f"Gap statistics:")
	for label, key in labelled_keys:
		print(f"  {label}: {statistics[key]}")

def test_get_gaps_from_timeseries():
	'''
		Runs get_gaps_from_timeseries on a small discretized series, asserts the detected gap sizes
		and prints their statistics.
	'''
	print("Test get_gaps_from_timeseries")
	dates = np.array([
		datetime(2023, 11, 11),
		datetime(2023, 11, 15), # 3 days gap from previous
		datetime(2023, 11, 18), # 2 days gap from previous
		datetime(2023, 11, 21), # 2 days gap from previous
		datetime(2023, 11, 22), # 0 days gap from previous (not reported)
	])
	gaps = get_gaps_from_timeseries(dates)
	# fail loudly instead of relying on someone reading the printed output
	assert list(gaps) == [3, 2, 2], f"unexpected gaps: {gaps}"
	print_gap_statistics(gaps)

# Run the gap detection example whenever this cell executes
test_get_gaps_from_timeseries()


In [None]:
# Gap statistics of the Amazon series as organized by the first organize_csv (dates live under "AMAZON_time").
# NOTE(review): get_gaps_from_timeseries documents that it assumes discretized dates, while these dates
# are passed straight from the parsed csv — confirm that this is intended.
gaps_organized_csv_amazon = get_gaps_from_timeseries(organized_csv["AMAZON"]["AMAZON_time"])
gaps_organized_csv_amazon_statistics = get_gaps_statistics(gaps_organized_csv_amazon)
print(f"Amazon gaps statistics:")
print_gap_statistics(gaps_organized_csv_amazon)

In [None]:
def get_timeseries_statistics(values: np.ndarray, dates: np.ndarray) -> dict:
	'''
		Returns an object with statistics about the time series.
		Keys:
		* number_of_elements: number of data points (NaNs included)
		* min_value / max_value: extremes of the non-NaN values, NaN when there is no valid value
		* min_date / max_date: first and last date
		* date_range: whole days between min_date and max_date
		* days_in_range_coverage_percentage: data points per calendar day of the inclusive range, in percent
		  (100 for one point on every day; above 100 when days hold several points)
		Raises ValueError when dates is empty.
	'''
	if len(dates) == 0:
		raise ValueError('Cannot compute statistics of an empty time series.')
	values = np.asarray(values)
	timeseries_statistics = {}
	timeseries_statistics["number_of_elements"] = len(values)
	# min and max values need to clear out the NaNs
	values_safe = values[~np.isnan(values)]
	has_valid_values = values_safe.size > 0
	timeseries_statistics["min_value"] = np.min(values_safe) if has_valid_values else np.nan
	timeseries_statistics["max_value"] = np.max(values_safe) if has_valid_values else np.nan
	timeseries_statistics["min_date"] = np.min(dates)
	timeseries_statistics["max_date"] = np.max(dates)
	timeseries_statistics["date_range"] = (timeseries_statistics["max_date"] - timeseries_statistics["min_date"]).days
	# the range covers both its first and its last day, hence + 1; this also keeps the
	# division defined when every data point falls on the same day (date_range == 0)
	days_in_range = timeseries_statistics["date_range"] + 1
	timeseries_statistics["days_in_range_coverage_percentage"] = round(timeseries_statistics["number_of_elements"] / days_in_range * 100, 2)
	return timeseries_statistics

# Statistics of the Amazon series as organized by the first organize_csv (values under "AMAZON", dates under "AMAZON_time");
# default=str lets json.dumps print values it cannot serialize natively (such as the dates)
organized_csv_amazon_statistics = get_timeseries_statistics(organized_csv["AMAZON"]["AMAZON"], organized_csv["AMAZON"]["AMAZON_time"])
print(f"Amazon timeseries statistics:")
print(f"{json.dumps(organized_csv_amazon_statistics, indent=2, default=str)}")

In [None]:
# Get smart discretized data + linear interpolation of the Amazon data
values_discrete_smart, dates_discrete_smart = discretize_smart(organized_csv["AMAZON"]["AMAZON"], organized_csv["AMAZON"]["AMAZON_time"])
# Try to fill the gaps with the median gap size + 1.
# The median is derived from the raw gaps (gaps_organized_csv_amazon) on every run: this cell reassigns
# gaps_organized_csv_amazon_statistics further down, so reading the limit from that variable
# would change the result each time the cell is re-run.
max_gap_to_fill = get_gaps_statistics(gaps_organized_csv_amazon)["median_gap"] + 1
values_filled_smart, dates_filled_smart = fill_gaps_linear(values_discrete_smart, dates_discrete_smart, max_gap_to_fill)
# Statistics of the smart discretized data (before the interpolation)
values_filled_smart_statistics = get_timeseries_statistics(values_discrete_smart, dates_discrete_smart)
gaps_organized_csv_amazon_statistics = get_gaps_statistics(get_gaps_from_timeseries(dates_discrete_smart))
print(f"Amazon smart discretized statistics:")
print(f"{json.dumps(values_filled_smart_statistics, indent=2, default=str)}")
print(f"Amazon smart discretized gaps statistics:")
print_gap_statistics(get_gaps_from_timeseries(dates_discrete_smart))
print("")
# Statistics of the smart discretized + linearly interpolated data
values_filled_smart_statistics = get_timeseries_statistics(values_filled_smart, dates_filled_smart)
print(f"Amazon smart discretized + linear interpolation timeseries statistics:")
gaps_organized_csv_amazon_statistics = get_gaps_statistics(get_gaps_from_timeseries(dates_filled_smart))
print(f"{json.dumps(values_filled_smart_statistics, indent=2, default=str)}")
print(f"Amazon smart discretized + linear interpolation gaps statistics:")
print_gap_statistics(get_gaps_from_timeseries(dates_filled_smart))

In [None]:
# offset the values of the filled data by 50 - for easier visualization
values_filled_smart_offset = values_filled_smart + 50

# Plot both the discretized and the filled data on the same plot
fig1 = go.Figure()
fig1.add_trace(go.Scatter(
	x=dates_discrete_smart,
	y=values_discrete_smart,
	name="Smart disc.",
	mode="lines+markers"
))
fig1.add_trace(go.Scatter(
	x=dates_filled_smart,
	y=values_filled_smart_offset,
	name="Smart disc + linear",
	mode="lines+markers"
))
fig1.show()

# remove NaN values (dates and values together) for the second plot.
# The cleaned series get their own names: the inputs of this cell stay untouched, so re-running it
# does not stack another offset on top of values_filled_smart.
# Boolean masks also keep working when every value is NaN and nothing is left to plot.
mask_discrete_smart = ~np.isnan(values_discrete_smart.astype(float))
dates_discrete_smart_clean = dates_discrete_smart[mask_discrete_smart]
values_discrete_smart_clean = values_discrete_smart[mask_discrete_smart]
mask_filled_smart = ~np.isnan(values_filled_smart_offset.astype(float))
dates_filled_smart_clean = dates_filled_smart[mask_filled_smart]
values_filled_smart_offset_clean = values_filled_smart_offset[mask_filled_smart]

fig2 = go.Figure()
fig2.add_trace(go.Scatter(
	x=dates_discrete_smart_clean,
	y=values_discrete_smart_clean,
	name="Smart disc.",
	mode="lines+markers"
))
fig2.add_trace(go.Scatter(
	x=dates_filled_smart_clean,
	y=values_filled_smart_offset_clean,
	name="Smart disc + linear",
	mode="lines+markers"
))
fig2.show()

# Findings

* Remove NaN values from the dataset right after parsing the csv, before doing any filtering or analysis.
* Rename the keys of every time series parsed from the csv to a uniform `values` / `dates` pair, so all series can be handled the same way.

In [None]:
def get_root_key(key: str) -> str:
	'''
		Maps a parsed csv key to the root key it belongs to.
		A "_time" suffix is stripped if present; otherwise a "df_" prefix is stripped if present;
		any other key is returned unchanged.
		Example: "AMAZON" is the root key of "AMAZON_time"
	'''
	time_suffix = "_time"
	frame_prefix = "df_"
	if key.endswith(time_suffix):
		root = key[:-len(time_suffix)]
	elif key.startswith(frame_prefix):
		root = key[len(frame_prefix):]
	else:
		root = key
	return root

def clean_nan_values(values: np.array, dates: np.array) -> Tuple[np.array, np.array]:
	'''
		Drops every position whose value is NaN from both arrays, so values and dates stay aligned.
		Only the values are inspected; the dates are never checked for NaN.
		Raises ValueError when the two arrays differ in length.
	'''
	if len(values) != len(dates):
		raise ValueError('Length of values and dates must be equal.')
	keep = ~np.isnan(values)
	return values[keep], dates[keep]

def organize_csv(parsed_csv: dict) -> dict:
	'''
		Regroups the parsed csv by root key into uniform "values" / "dates" pairs with NaN values removed.
		Keys ending in "_time" provide the dates, keys starting with "df_" are ignored,
		every other key provides the values.
		Example mapping: "AMAZON": {"values": data_1, "dates": data_2}
	'''
	series_by_root = {}
	# regroup the raw entries
	for key, data in parsed_csv.items():
		entry = series_by_root.setdefault(get_root_key(key), {})
		if key.endswith("_time"):
			entry["dates"] = data
		elif not key.startswith("df_"):
			entry["values"] = data
	# drop NaN values (and their dates) from every series
	for entry in series_by_root.values():
		entry["values"], entry["dates"] = clean_nan_values(entry["values"], entry["dates"])
	return series_by_root

def print_organized_csv(organized_csv: dict):
	'''
		Prints a header line, then one line per root key showing the keys stored under it.
	'''
	print(f"Organized csv:")
	for root_key in organized_csv:
		print(f"{root_key}: {organized_csv[root_key].keys()}")

# Re-organize with the new organize_csv; this replaces the earlier organized_csv,
# so from here on every series is accessed through "values" / "dates"
organized_csv = organize_csv(parsed_csv)
print_organized_csv(organized_csv)

In [None]:
# Statistics of the NaN-free Amazon series at three stages: as parsed, smart discretized, smart discretized + interpolated.
# gaps and gaps_statistics are reassigned at every stage, so the statements below depend on their order.
data_amazon = organized_csv["AMAZON"]
values_amazon = data_amazon["values"]
dates_amazon = data_amazon["dates"]
gaps = get_gaps_from_timeseries(dates_amazon)
gaps_statistics = get_gaps_statistics(gaps)
print(f"Amazon gaps statistics:")
print_gap_statistics(gaps)
print(f"Amazon timeseries statistics:")
print(f"{json.dumps(get_timeseries_statistics(values_amazon, dates_amazon), indent=2, default=str)}")
print("")

# Get smart discretized data
values_discrete_smart, dates_discrete_smart = discretize_smart(values_amazon, dates_amazon)
gaps = get_gaps_from_timeseries(dates_discrete_smart)
gaps_statistics	= get_gaps_statistics(gaps)
print(f"Amazon smart discretized gaps statistics:")
print_gap_statistics(gaps)
print(f"Amazon smart discretized timeseries statistics:")
print(f"{json.dumps(get_timeseries_statistics(values_discrete_smart, dates_discrete_smart), indent=2, default=str)}")
print("")

# Get smart discretized data + linear interpolation
# (the fill limit is the median gap of the smart discretized series + 1, read before gaps_statistics is reassigned)
values_filled_smart, dates_filled_smart = fill_gaps_linear(values_discrete_smart, dates_discrete_smart, gaps_statistics["median_gap"] + 1)
gaps = get_gaps_from_timeseries(dates_filled_smart)
gaps_statistics	= get_gaps_statistics(gaps)
print(f"Amazon smart discretized + linear interpolation gaps statistics:")
print_gap_statistics(gaps)
print(f"Amazon smart discretized + linear interpolation timeseries statistics:")
print(f"{json.dumps(get_timeseries_statistics(values_filled_smart, dates_filled_smart), indent=2, default=str)}")



In [None]:
# Plot the three series in one figure: original, smart discretized, smart discretized + interpolated.
# The second and third series are shifted up by 50 and 100 so the lines do not overlap.
# NOTE: this reuses the name fig, replacing the imputation-methods figure created earlier.

fig = go.Figure()
fig.add_trace(go.Scatter(
	x=dates_amazon,
	y=values_amazon,
	name="Original",
	mode="lines+markers"
))
fig.add_trace(go.Scatter(
	x=dates_discrete_smart,
	y=values_discrete_smart + 50,
	name="Smart disc.",
	mode="lines+markers"
))
fig.add_trace(go.Scatter(
	x=dates_filled_smart,
	y=values_filled_smart + 100,
	name="Smart disc + linear",
	mode="lines+markers"
))
fig.show()