# Investigating the Keepa API

In [None]:
import os
import json
import time
import keepa
from keepa.interface import keepa_minutes_to_time, parse_csv
from datetime import datetime, timedelta
import plotly.graph_objs as go
import numpy as np
from scipy.interpolate import interp1d
from typing import Tuple
import pandas as pd
import requests

In [None]:
# ---------------------------------------------------------------------------
# Constants and global variables
# ---------------------------------------------------------------------------

# Load environment variables
# The Keepa API key is read from the KEEPA_API_KEY environment variable;
# API_KEY is None when that variable is not set.
API_KEY = os.getenv('KEEPA_API_KEY')

In [None]:
# Create the Keepa client used by the following cells, constructed with API_KEY.
api = keepa.Keepa(API_KEY)

In [None]:
# Query Keepa for a single item; 'B07B428M7F' is presumably an Amazon ASIN (verify).
products = api.query('B07B428M7F')

In [None]:
# Take the first entry of the query result as the product inspected below.
product = products[0]
# NOTE(review): `a` is not referenced anywhere in the visible notebook —
# possibly a leftover debugging placeholder; confirm before removing.
a = 0


In [None]:
# Plot the product with keepa's own plotting helper.
keepa.plot_product(product)

In [None]:
# Convert the product's 'lastUpdate' field with keepa_minutes_to_time and print it.
# NOTE(review): presumably 'lastUpdate' is expressed in Keepa minutes and the
# result is a datetime — confirm against the keepa package documentation.
last_update = keepa_minutes_to_time(product['lastUpdate'])
print(last_update)

In [None]:
parsed_csv = parse_csv(product['csv'])
# The grouping below relies on parse_csv() listing its keys in consecutive
# triples, read here as (time key, price key, data-frame key).
keys = list(parsed_csv)
# print(keys)  # uncomment to inspect the flat key list

# One record per series, naming the three parsed_csv keys that belong together.
key_objects = [
	{
		'time': keys[start],
		'price': keys[start + 1],
		'df': keys[start + 2],
	}
	for start in range(0, len(keys), 3)
]
print(json.dumps(key_objects, indent=2))

In [None]:
def get_clean_date(date: datetime) -> datetime:
	'''
		Truncates a datetime to midnight of the same day.
		The input is left untouched; a new datetime with hour, minute, second
		and microsecond set to zero is returned.
	'''
	midnight_fields = {'hour': 0, 'minute': 0, 'second': 0, 'microsecond': 0}
	return datetime.replace(date, **midnight_fields)

def discretize(arr_values: np.array, arr_dates: np.array) -> Tuple[np.array, np.array]:
	'''
		Reduces a time series to day resolution.
		Every date is truncated to midnight with get_clean_date(); within each run of
		consecutive samples that fall on the same day only the last sample is kept.
		Returns (values, dates) as two numpy arrays of equal length.
		Raises ValueError when the two inputs differ in length.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	days = [get_clean_date(stamp) for stamp in arr_dates]
	last_index = len(days) - 1
	kept_values = []
	kept_days = []
	for index, day in enumerate(days):
		# a sample closes its run when it is the final one or the next sample is on another day
		closes_run = index == last_index or day != days[index + 1]
		if closes_run:
			kept_days.append(day)
			kept_values.append(arr_values[index])
	return np.array(kept_values), np.array(kept_days)

def fill_missing_dates(arr_values: np.ndarray, arr_dates: np.ndarray) -> Tuple[np.ndarray, np.ndarray, dict]:
	'''
		Fills the missing dates in a time series with NaN values.
		Assumes that the dates are sorted in ascending order, discrete and without duplicates.
		(running discretize() first is recommended)
		This is useful for preparation for imputation methods.

		Parameters:
			arr_values: the samples of the series.
			arr_dates: the date of each sample; entries must support ordering and `+ timedelta`.

		Returns a 3-tuple (values, dates, metadata):
			values: the input values with np.nan inserted for every added date.
			dates: the input dates plus one entry per missing day.
			metadata: dict with 'first_date', 'last_date', 'missing_dates_count' (number of
				inserted dates) and 'missing_dates_percentage' (inserted dates as a percentage
				of the filled length). For empty input both dates are None and both counts are 0.

		If two neighbouring samples share the same date, the earlier one is dropped.

		Raises:
			ValueError: if the inputs differ in length or a date is earlier than its predecessor.
	'''
	if len(arr_values) != len(arr_dates):
		raise ValueError('Length of arr_values and arr_dates must be equal.')
	if len(arr_dates) == 0:
		# Nothing to fill; returning early avoids indexing an empty series
		# and dividing by zero in the percentage below.
		empty_metadata = {
			'first_date': None,
			'last_date': None,
			'missing_dates_count': 0,
			'missing_dates_percentage': 0.0
		}
		return np.array([]), np.array([]), empty_metadata
	one_day = timedelta(days=1)
	arr_dates_filled = []
	arr_values_filled = []
	missing_dates_count = 0
	last_index = len(arr_dates) - 1
	for i, (date, value) in enumerate(zip(arr_dates, arr_values)):
		if i == last_index:
			arr_dates_filled.append(date)
			arr_values_filled.append(value)
			break
		date_next = arr_dates[i + 1]
		if date == date_next:
			# duplicate date: skip this sample, the later one is kept
			continue
		if date_next < date:
			raise ValueError('arr_dates must be sorted in ascending order.')
		arr_dates_filled.append(date)
		arr_values_filled.append(value)
		# Step forward one day at a time up to (not including) the next sample.
		# Comparing with `<` instead of `!=` guarantees termination even when the
		# gap is not a whole number of days.
		missing_date = date + one_day
		while missing_date < date_next:
			arr_dates_filled.append(missing_date)
			arr_values_filled.append(np.nan)
			missing_dates_count += 1
			missing_date = missing_date + one_day
	metadata = {
		'first_date': arr_dates[0],
		'last_date': arr_dates[-1],
		'missing_dates_count': missing_dates_count,
		'missing_dates_percentage': missing_dates_count / len(arr_dates_filled) * 100
	}
	return np.array(arr_values_filled), np.array(arr_dates_filled), metadata


In [None]:
fig = go.Figure()
for key in key_objects:
	# Imputation alternatives that were tried earlier (kept for reference):
	# parsed_csv[key["price"]] = parsed_csv[key["price"]].replace(-1, np.nan).fillna(method='ffill')
	# interp_func = interp1d(parsed_csv[key["time"]], parsed_csv[key["price"]], kind='linear', fill_value='extrapolate')
	# parsed_csv[key["price"]] = interp_func(parsed_csv[key["time"]])

	# 1) reduce the series to day resolution, 2) insert NaN entries for days without a sample
	price_discrete, date_discrete = discretize(parsed_csv[key["price"]], parsed_csv[key["time"]])
	price_filled, date_filled, metadata = fill_missing_dates(price_discrete, date_discrete)
	# 3) build a date-indexed frame and linearly interpolate the NaN entries in both directions
	df = pd.DataFrame({"values": price_filled}, index=date_filled).interpolate(
		method='linear',
		limit_direction='both',
	)
	# print(df.head(20))

	interpolated_dates = df.index
	interpolated_values = df["values"]
	print(f"Key {key['price']} has {metadata['missing_dates_count']} missing dates ({metadata['missing_dates_percentage']}%)")
	fig.add_trace(go.Scatter(x=interpolated_dates, y=interpolated_values, name=key["price"]))  # optionally: mode='markers'
fig.update_layout(
	title='Price vs Time',
	xaxis_title='Time',
	yaxis_title='Price',
	showlegend=True,
)
# fig.update_layout(width=1600, height=1200)
# Leave only the first three traces drawn (per the original note: AMAZON, NEW, USED);
# every later trace is switched to legend-only.
for hidden_trace in fig.data[3:]:
	hidden_trace.visible = 'legendonly'
fig.show()

In [None]:
# Plot the raw AMAZON series: AMAZON_time on the x-axis, AMAZON values on the y-axis.
fig = go.Figure()
fig.add_trace(go.Scatter(x=parsed_csv['AMAZON_time'], y=parsed_csv['AMAZON'], name='AMAZON'))
# Anchor the legend near the top-left corner of the plot.
# NOTE(review): there is no fig.show() here — presumably the notebook renders the
# figure returned by update_layout as the cell's last expression; confirm.
fig.update_layout(
	legend=dict(
		yanchor="top",
		y=0.99,
		xanchor="left",
		x=0.01
	)
)

In [None]:
# Pretty-print the whole product as JSON; default=str stringifies any value
# that json cannot serialise natively.
print(json.dumps(product, indent=2, default=str))#, sort_keys=True))

In [None]:
# Product search

# Search parameters handed to api.product_finder. Only the first "title" entry is
# active; the commented-out entries are alternatives kept for reference.
product_params = {
	# "title":"ryzen 7 2700x",
  # "categories_include": [
  #     229189, # "CPU Processors"
  #     # 8588809011,
  #     # 13900851
	# ],
	"title": "amd ryzen 7 2700x"
	# "title": "AD4U266638G19-S"
	# "title": "intel core i7 10700k"
}
# NOTE: this rebinds `products`, replacing the api.query() result from the earlier cell.
products = api.product_finder(product_params)



In [None]:
# Report how many entries the product finder returned, then dump them as JSON.
print(f"len: {len(products)}")
print(json.dumps(products, indent=2, default=str))#, sort_keys=True))

In [None]:
# Minimal variant of the search above: title filter only.
product_params = {'title': 'amd ryzen 7 2700x'}
# Run the product finder again and print its raw result.
# NOTE(review): presumably product_finder returns a list of ASINs rather than
# full product records — confirm against the keepa package documentation.
products = api.product_finder(product_params)
print(products)

In [None]:
# Product search - manual request
# https://keepa.com/#!discuss/t/product-searches/109

def make_request(url: str, params: dict, timeout: float = 30.0) -> Tuple[dict, None] | Tuple[None, Exception]:
	'''
		Makes a GET request to the url with the given query parameters.

		Parameters:
			url: the address to request.
			params: query-string parameters passed to requests.get.
			timeout: seconds to wait for the server before giving up, so a stalled
				connection cannot block the caller indefinitely.

		Returns a pair (response, error):
			on success (decoded JSON body, None);
			on failure (None, the exception that was raised).
		Failures cover HTTP error statuses, connection problems, timeouts and a
		response body that is not valid JSON.
	'''
	try:
		response = requests.get(url, params=params, timeout=timeout)
		response.raise_for_status()
		return response.json(), None
	except (requests.exceptions.RequestException, ValueError) as err:
		# RequestException is the base class of requests' errors (HTTPError,
		# ConnectionError, Timeout, ...); ValueError covers JSON decoding failures
		# on requests versions that do not wrap them.
		return None, err

def search_products(term: str, domain: int = 1, page: int = 0) -> dict:
	'''
		Searches for products with the given term.

		Parameters:
			term: the search term.
			domain: value sent as the 'domain' query parameter (defaults to the
				previously hard-coded 1).
			page: value sent as the 'page' query parameter (defaults to the
				previously hard-coded 0).

		Returns the response as a dictionary.

		Raises:
			the exception reported by make_request() when the request fails;
			ValueError when the request succeeds but yields no response body.
	'''
	# /search?key=<yourAccessKey>&domain=<domainId>&type=product&term=<searchTerm>
	base_url = 'https://api.keepa.com/search'
	params = {
		'key': API_KEY,
		'page': page,
		'domain': domain,
		'type': 'product',
		'term': term
	}
	response, err = make_request(base_url, params)
	if err is not None:
		raise err
	if response is None:
		# explicit check instead of `assert`, which is removed when Python runs with -O
		raise ValueError('Keepa search returned an empty response.')
	return response



In [None]:
searched_products_response = search_products('amd ryzen 7 2700x')
# Summarise the response: every entry is printed as-is, except "products",
# for which only the number of products is shown.
for key, value in searched_products_response.items():
	summary = f"{len(value)} products" if key == 'products' else value
	print(f"{key}: {summary}")

In [None]:
# Keep only the "products" entry of the search response and report its size.
searched_products = searched_products_response["products"]
print(f"products count: {len(searched_products)}")

In [None]:
def print_product_ASINs(products: list[dict]) -> None:
	'''
		Prints the number of products, then one "<asin>  -  <title>" line per product.

		Parameters:
			products: sequence of product dictionaries; each must provide the
				'asin' and 'title' keys (a missing key raises KeyError).
	'''
	print(f"len: {len(products)}")
	for product in products:
		print(f"{product['asin']}  -  {product['title']}")

# List the ASIN and title of every product returned by the search.
print_product_ASINs(searched_products)

In [None]:
# How to optimize the search?
# - return only n (10) results at a time
# - only return a list of product ASINs and titles
# - use fuzzy string matching
# - return product data only after filtering the products

In [None]:
# Repeat the key grouping for the first product returned by the search.
single_product = searched_products[0]
parsed_csv = parse_csv(single_product['csv'])
# The grouping below relies on parse_csv() listing its keys in consecutive
# triples, read here as (time key, price key, data-frame key).
keys = list(parsed_csv)
key_objects = [
	{
		'time': keys[start],
		'price': keys[start + 1],
		'df': keys[start + 2],
	}
	for start in range(0, len(keys), 3)
]
print(json.dumps(key_objects, indent=2))

In [None]:
fig = go.Figure()
for key in key_objects:
	# Imputation alternatives that were tried earlier (kept for reference):
	# parsed_csv[key["price"]] = parsed_csv[key["price"]].replace(-1, np.nan).fillna(method='ffill')
	# interp_func = interp1d(parsed_csv[key["time"]], parsed_csv[key["price"]], kind='linear', fill_value='extrapolate')
	# parsed_csv[key["price"]] = interp_func(parsed_csv[key["time"]])

	# 1) reduce the series to day resolution, 2) insert NaN entries for days without a sample
	price_discrete, date_discrete = discretize(parsed_csv[key["price"]], parsed_csv[key["time"]])
	price_filled, date_filled, metadata = fill_missing_dates(price_discrete, date_discrete)
	# 3) build a date-indexed frame and linearly interpolate the NaN entries in both directions
	df = pd.DataFrame({"values": price_filled}, index=date_filled).interpolate(
		method='linear',
		limit_direction='both',
	)
	# print(df.head(20))

	interpolated_dates = df.index
	interpolated_values = df["values"]
	print(f"Key {key['price']} has {metadata['missing_dates_count']} missing dates ({metadata['missing_dates_percentage']}%)")
	fig.add_trace(go.Scatter(x=interpolated_dates, y=interpolated_values, name=key["price"]))  # optionally: mode='markers'
fig.update_layout(
	title='Price vs Time',
	xaxis_title='Time',
	yaxis_title='Price',
	showlegend=True,
)
# fig.update_layout(width=1600, height=1200)
# Leave only the first three traces drawn (per the original note: AMAZON, NEW, USED);
# every later trace is switched to legend-only.
for hidden_trace in fig.data[3:]:
	hidden_trace.visible = 'legendonly'
fig.show()