In [None]:
import os
import sys
import json
import random
import pandas as pd
import numpy as np
from datetime import datetime
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from scipy import stats
from typing import Optional

# Project imports
sys.path.append(os.getcwd())
from src.py.analysis.yahoo.stocks.finance_df_utils import load_stock_csv, df_add_data, get_figure, filter_df_by_date, add_vline_annotation, save_fig, get_safe_filename, get_grouped_df
from src.py.analysis.events import events
# CPI
import importlib
sys.path.append(os.getcwd())
cpi_adjust = importlib.import_module("src.py.scraping.world-bank.cpi_adjust")
cpi_adjust.initialize_cpi(date_cutoff="2018-08-01", jagged=False) # jagged=True)


In [None]:
# Input/output locations, given relative to the current working directory.
path_index = "data/scraped/yahoo/crypto/index.json"
path_csv_root = "data/scraped/yahoo/crypto/csv"
path_output_root = "data/analysis/yahoo/crypto"

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path_output_root, exist_ok=True)

In [None]:
# Load index; the context manager closes the file handle as soon as the JSON is parsed
with open(path_index, "r") as index_file:
	index_urls = json.load(index_file)["urls"] # type: list
print(f"Loaded index with {len(index_urls)} crypto currency links")
print("")
print(f"First entry: '{index_urls[0]}'")

# Parse just the crypto symbol from url: last path segment, without the query string
index = [url.split("/")[-1].split("?")[0] for url in index_urls]
print(f"First entry (parsed): '{index[0]}'")

In [None]:
# Load all CSVs
# dfs_dict:  symbol -> loaded, date-filtered, CPI-adjusted DataFrame
# dfs_fails: symbol -> human-readable reason why the symbol was skipped
dfs_dict = {}
dfs_fails = {}
print(f"Loading {len(index)} crypto CSVs.")
print("")
for i, symbol in enumerate(index):
	print(f"{i+1}/{len(index)} [{symbol}]                  ", end="\r")
	try:
		# Load CSV
		df = load_stock_csv(os.path.join(path_csv_root, f"{symbol}.csv"))
		# Require history that starts on or before 2018-08-01
		if df.index[0] > datetime(2018, 8, 1):
			raise Exception("Data starts after 2018-08-01.")
		# Require history that reaches at least 2023-12-01
		if df.index[-1] < datetime(2023, 12, 1):
			raise Exception("Data ends before 2023-12-01.")
		# Plotting interval is 2019-01-01 to 2023-12-26;
		# start_date is set to 2018-08-01 to leave some lead-in rows for windowed calculations
		df = filter_df_by_date(df, start_date="2018-08-01", end_date="2023-12-26")
		# Adjust for CPI (Volume is adjusted together with the price columns)
		df = cpi_adjust.adjust_for_inflation(df, "USA", columns=["Open", "High", "Low", "Close", "Adj Close", "Volume"])
		# Add to dict
		dfs_dict[symbol] = df
	except Exception as exc:
		# Deliberately broad: any failure only skips this symbol and is reported later.
		reason = str(exc)
		# Collapse "missing file" errors (by type or by message) into one short reason
		if isinstance(exc, FileNotFoundError) or "no such file or directory" in reason.lower():
			reason = "No CSV file."
		dfs_fails[symbol] = reason
	# break # for testing (quicker iteration)
print("")
print(f"Loaded {len(dfs_dict)} crypto CSVs.")

In [None]:
# Daily Bitcoin chart: derived columns are added on the daily rows, then the
# frame is cut to start at 2019-01-01.
df_bitcoin = df_add_data(dfs_dict["BTC-USD"].copy()).loc["2019-01-01":]
options = dict(
	w=1280,
	h=720,
	# traces=["candlestick"],
	margin=dict(r=0, t=60, b=0, l=0),
	labels=["Price ($)", "Volume", "MACD"],
)
fig = get_figure(df_bitcoin, title="Bitcoin (BTC-USD) price trend", options=options)
# Annotate only the events of the "Crypto" group; the "offset" key is removed
# from the copy that is handed to the annotation helper.
for event in events:
	event_copy = {key: value for key, value in event.items() if key != "offset"}
	if event_copy["group"] != "Crypto":
		continue
	add_vline_annotation(fig, event_copy)
fig.show()

output_path = os.path.join(path_output_root, "bitcoin_price_trend_daily.png")
save_fig(fig, output_path, 3)

In [None]:
# Weekly Bitcoin chart: the raw rows are collapsed into weekly candles first,
# derived columns are added on the weekly rows afterwards.
weekly_candle_rules = {
	"Open": "first",
	"High": "max",
	"Low": "min",
	"Close": "last",
	"Adj Close": "last",
	"Volume": "sum",
}
df_bitcoin = dfs_dict["BTC-USD"].copy().resample("W").agg(weekly_candle_rules)
df_bitcoin = df_add_data(df_bitcoin).loc["2019-01-01":]
options = dict(
	w=1280,
	h=720,
	# traces=["candlestick"],
	margin=dict(r=0, t=60, b=0, l=0),
	labels=["Price ($)", "Volume", "MACD"],
)
fig = get_figure(df_bitcoin, title="Bitcoin (BTC-USD) price trend", options=options)
# Annotate only the events of the "Crypto" group; the "offset" key is removed
# from the copy that is handed to the annotation helper.
for event in events:
	event_copy = {key: value for key, value in event.items() if key != "offset"}
	if event_copy["group"] != "Crypto":
		continue
	add_vline_annotation(fig, event_copy)
fig.show()

output_path = os.path.join(path_output_root, "bitcoin_price_trend_weekly.png")
save_fig(fig, output_path, 3)

In [None]:
# "Daily then weekly" Bitcoin chart: derived columns are computed on the daily
# rows and averaged per week, while the six OHLCV columns are replaced by proper
# weekly candles (first / max / min / last / last / sum).
df_bitcoin = df_add_data(dfs_dict["BTC-USD"].copy())
columns = list(df_bitcoin.columns)  # snapshot of the column names; not used again in this cell
weekly_candle_rules = {
	"Open": "first",
	"High": "max",
	"Low": "min",
	"Close": "last",
	"Adj Close": "last",
	"Volume": "sum",
}
df_bitcoin_temp = df_bitcoin.copy().resample("W").agg(weekly_candle_rules)
df_bitcoin = df_bitcoin.resample("W").mean()
# Overwrite the averaged OHLCV columns with the weekly candle values
for candle_column in weekly_candle_rules:
	df_bitcoin[candle_column] = df_bitcoin_temp[candle_column]
df_bitcoin = df_bitcoin.loc["2019-01-01":]
options = dict(
	w=1280,
	h=720,
	# traces=["candlestick"],
	margin=dict(r=0, t=60, b=0, l=0),
	labels=["Price ($)", "Volume", "MACD"],
)
fig = get_figure(df_bitcoin, title="Bitcoin (BTC-USD) price trend", options=options)
# Annotate only the events of the "Crypto" group; the "offset" key is removed
# from the copy that is handed to the annotation helper.
for event in events:
	event_copy = {key: value for key, value in event.items() if key != "offset"}
	if event_copy["group"] != "Crypto":
		continue
	add_vline_annotation(fig, event_copy)
fig.show()

output_path = os.path.join(path_output_root, "bitcoin_price_trend_daily_then_weekly.png")
save_fig(fig, output_path, 3)

In [None]:
# Print fails grouped by error message, largest group first
print(f"Failed to load {len(dfs_fails)} crypto CSVs.")
print("")
# error message -> tickers that failed with it (groups keep first-seen order,
# so ties in the sort below are printed deterministically)
dfs_fails_grouped = {}
for ticker, e in dfs_fails.items():
	dfs_fails_grouped.setdefault(str(e), []).append(ticker)

max_listed = 10  # number of tickers shown per group
for e, tickers in sorted(dfs_fails_grouped.items(), key=lambda x: len(x[1]), reverse=True):
	print(f"{len(tickers)} cryptos: {e}")
	# Only add the ellipsis when the list really was cut short
	ellipsis = "..." if len(tickers) > max_listed else ""
	print(f"{', '.join(tickers[:max_listed])}{ellipsis}")
	print("")


In [None]:
# List the first ten symbols that were loaded, numbered from 1
print("First 10 crypto symbols:")
for position, loaded_symbol in enumerate(list(dfs_dict)[:10], start=1):
	print(f"{position:2d}: {loaded_symbol}")

In [None]:
# Put a "Symbol" column in front of every DataFrame.
# Frames that already carry the column are left alone, so re-running the cell is harmless.
for symbol, df in dfs_dict.items():
	if "Symbol" not in df.columns:
		df["Symbol"] = symbol
		# "Symbol" was appended last; move it to the front
		dfs_dict[symbol] = df[["Symbol", *df.columns[:-1]]]

dfs_dict["BTC-USD"].head()

In [None]:
# Show the last rows of the Bitcoin frame
dfs_dict["BTC-USD"].tail()

In [None]:
# add Volume_value column by multiplying Volume by Close
def add_volume_value(df: pd.DataFrame) -> pd.DataFrame:
	"""Return a copy of ``df`` with an extra ``Volume_value`` column.

	``Volume_value`` is the row-wise product ``Volume * Close``.

	Args:
		df: frame with at least the columns ``Volume`` and ``Close``.

	Returns:
		A new DataFrame; the frame passed in is left untouched, so previewing
		the result does not alter shared data.

	Raises:
		KeyError: if ``Volume`` or ``Close`` is missing.
	"""
	# assign() works on a copy, so the caller's frame is never mutated
	return df.assign(Volume_value=df["Volume"] * df["Close"])

# Compute Volume_value for Bitcoin and preview the first rows
df_volume_value = add_volume_value(dfs_dict["BTC-USD"])
df_volume_value.head()

In [None]:
# add Open_I, High_I, Low_I, Close_I, Adj_Close_I columns using a specified date as I = 100
def add_index_values(df: pd.DataFrame, date: str) -> pd.DataFrame:
	"""Return a copy of ``df`` with index-value columns (base date = 100).

	For each of ``Open``, ``High``, ``Low``, ``Close`` and ``Adj Close`` a
	column ``<name>_I`` is added (``Adj Close`` becomes ``Adj_Close_I``) holding
	``value / value_on_base_date * 100``. ``Volume`` is intentionally not indexed.

	Args:
		df: frame containing the five price columns; ``date`` must be a label
			of its index.
		date: label of the base row whose values map to 100.

	Returns:
		A new DataFrame; the frame passed in is left untouched.

	Raises:
		KeyError: if ``date`` is not in the index or a price column is missing.
	"""
	# NOTE: every column is scaled by its OWN value on the base date, so the
	#       relative order inside one candle is not guaranteed to survive
	#       (High_I can end up lower than Low_I). An earlier attempt to swap
	#       such values back afterwards was abandoned.
	# NOTE: a more complete approach would be to scale Open_I, High_I and Low_I
	#       proportionally to Open, High and Low using Close_I as a base, so
	#       that each candle retains the same proportions and order.
	indexed = df.copy()  # never mutate the caller's frame
	for column in ("Open", "High", "Low", "Close", "Adj Close"):
		base_value = indexed.loc[date, column]
		indexed[f"{column.replace(' ', '_')}_I"] = indexed[column] / base_value * 100
	return indexed

# Index Bitcoin with 2019-01-01 = 100
df_scaled = add_index_values(dfs_dict["BTC-USD"], "2019-01-01")
# Alternative preview: df_scaled.head()
# Show the rows from 2019-01-01 to 2019-01-05 (the base row comes first)
df_scaled.loc["2019-01-01":"2019-01-05"]


In [None]:
# plot Bitcoin's Close data

In [None]:
# Add the index-value columns (2019-01-01 = 100) to every loaded DataFrame
dfs_dict.update({
	symbol: add_index_values(df, "2019-01-01")
	for symbol, df in dfs_dict.items()
})

# Preview the first days of the indexed Bitcoin frame
df_scaled = dfs_dict["BTC-USD"]
df_scaled.loc["2019-01-01":"2019-01-05"]

In [None]:
# Compare raw prices with index values for the first 5 loaded symbols:
# two stacked panels sharing the x axis - Close on top, Close_I below.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

# One palette colour per symbol, reused in both panels so the lines can be matched
palette = px.colors.qualitative.Plotly
first_five = list(dfs_dict.items())[:5]
for i, (symbol, df) in enumerate(first_five):
	line_style = dict(color=palette[i])
	close_trace = go.Scatter(x=df.index, y=df["Close"], name=f"{symbol} Close", line=line_style)
	index_trace = go.Scatter(x=df.index, y=df["Close_I"], name=f"{symbol} Close_I", line=line_style)
	fig.add_trace(close_trace, row=1, col=1)
	fig.add_trace(index_trace, row=2, col=1)

# Axis labels
fig.update_xaxes(title_text="Date", row=2, col=1)
fig.update_yaxes(title_text="Close", row=1, col=1)
fig.update_yaxes(title_text="Close_I", row=2, col=1)
# Title, size and margins
fig.update_layout(
	title="Currency prices versus index values for the first 5 crypto symbols (2019-01-01 = 100)",
	width=1000,
	height=400,
	margin=dict(l=0, r=0, b=0, t=40, pad=0),
)
fig.show()

# Save the figure at 2x scale
path_fig = os.path.join(path_output_root, "crypto-close-vs-close-i-first-5.png")
save_fig(fig, path_fig, 2)

In [None]:
# DONE: move to utils

# events = []

# # COVID-19 market crash
# events.append({
# 	"date": datetime(2020, 2, 20),
# 	"annotation": "MC",
# 	"description": "COVID-19 market crash"
# })

# # COVID-19 market crash end
# events.append({
# 	"date": datetime(2020, 4, 7),
# 	"annotation": "MC end",
# 	"description": "COVID-19 market crash end"
# })

In [None]:
# def fix_candles(df: pd.DataFrame) -> pd.DataFrame:
# 	# Rules:
# 	# 1. Low <= (Open <= Close || Open >= Close) <= High

# 	# Process:
# 	# set the lo

# 	# Old attempt (incomplete)
# 	df_n["temp"] = df_n["High"]
# 	df_n.loc[df_n["High"] < df_n["Low"],
# 	       "High"] = df_n.loc[df_n["High"] < df_n["Low"], "Low"]
# 	df_n.loc[df_n["High"] < df_n["Low"],
# 	       "Low"] = df_n.loc[df_n["High"] < df_n["Low"], "temp"]
# 	# Switch the Close, Open for all rows where their order is different from Close, Open
# 	rows_og = df_n["Close"] < df_n["Open"]
# 	rows = df_n["Close"] < df_n["Open"]
# 	mask = rows_og != rows
# 	count_pre = mask.sum()
# 	df_n.loc[mask, "temp"] = df_n.loc[mask, "Close"]
# 	df_n.loc[mask, "Close"] = df_n.loc[mask, "Open"]
# 	df_n.loc[mask, "Open"] = df_n.loc[mask, "temp"]
# 	rows_og = df_n["Close"] < df_n["Open"]
# 	rows = df_n["Close"] < df_n["Open"]
# 	mask = rows_og != rows
# 	count_post = mask.sum()
# 	# Drop temp column
# 	df_n.drop(columns=["temp"], inplace=True)

# def get_grouped_fig(dfs_dict: dict,
#                     symbols: list,
#                     title: str,
#                     events: list,
#                     options: Optional[dict] = None) -> go.Figure:
# 	dfs_dict_filtered = {symbol: dfs_dict[symbol] for symbol in symbols}
# 	df_n = pd.concat(dfs_dict_filtered.copy().values())
# 	# df_n = df_n.copy()
# 	df_n = get_grouped_df(df_n, start_date="2019-01-01")
# 	# resample to weekly
# 	df_n = df_n.resample("W").mean()
# 	# df_n = df_n.resample("W").agg({
# 	# 	"Open": "mean",
# 	# 	"High": "mean",
# 	# 	"Low": "mean",
# 	# 	"Close": "mean",
# 	# 	"Adj Close": "mean",
# 	# 	"Volume": "sum",
# 	# })
# 	# df_n = df_n.resample("W").agg({
# 	# 	"Open": "first",
# 	# 	"High": "max",
# 	# 	"Low": "min",
# 	# 	"Close": "last",
# 	# 	"Adj Close": "last",
# 	# 	"Volume": "sum",
# 	# })
# 	# df_n = get_grouped_df(df_n, start_date="2019-01-01")

# 	fig = get_figure(df_n, title, options)
# 	for event in events:
# 		add_vline_annotation(fig, event, textangle=-20)
# 	return fig

def get_grouped_fig_alt(dfs_dict: dict,
                    symbols: list,
                    title: str,
                    events: list,
                    options: Optional[dict] = None) -> go.Figure:
	"""Build one figure for a group of symbols: weekly resample first, grouping second.

	The frames of ``symbols`` are stacked, resampled to weekly rows (mean of
	Open/High/Low/Close/Adj Close, sum of Volume), then passed through
	``get_grouped_df`` with ``start_date="2019-01-01"`` before plotting.

	Args:
		dfs_dict: mapping of symbol to its DataFrame.
		symbols: keys of ``dfs_dict`` to include.
		title: figure title handed to ``get_figure``.
		events: every entry is drawn with ``add_vline_annotation``.
		options: forwarded unchanged to ``get_figure``.

	Returns:
		The figure produced by ``get_figure`` with the event annotations added.
	"""
	weekly_rules = {name: "mean" for name in ("Open", "High", "Low", "Close", "Adj Close")}
	weekly_rules["Volume"] = "sum"

	stacked = pd.concat([dfs_dict[symbol] for symbol in symbols]).copy()
	weekly = stacked.resample("W").agg(weekly_rules)
	grouped = get_grouped_df(weekly, start_date="2019-01-01")

	fig = get_figure(grouped, title, options)
	for event in events:
		add_vline_annotation(fig, event, textangle=-20)
	return fig


def get_grouped_fig(dfs_dict: dict,
                    symbols: list,
                    title: str,
                    events: list,
                    options: Optional[dict] = None) -> go.Figure:
	"""Build one figure for a group of symbols: grouping first, weekly resample second.

	The frames of ``symbols`` are stacked and passed through ``get_grouped_df``
	with ``start_date="2019-01-01"``. The result is resampled to weekly means,
	except ``Volume`` and ``SMA_volume``, which are weekly sums.

	Args:
		dfs_dict: mapping of symbol to its DataFrame.
		symbols: keys of ``dfs_dict`` to include.
		title: figure title handed to ``get_figure``.
		events: every entry is drawn with ``add_vline_annotation``.
		options: forwarded unchanged to ``get_figure``.

	Returns:
		The figure produced by ``get_figure`` with the event annotations added.
	"""
	stacked = pd.concat([dfs_dict[symbol] for symbol in symbols])
	grouped = get_grouped_df(stacked, start_date="2019-01-01")

	# Volume-like columns are summed per week; they are computed before the
	# frame is replaced by its weekly means and written back afterwards.
	weekly_sums = {
		column: grouped[column].resample("W").sum()
		for column in ("Volume", "SMA_volume")
	}
	weekly = grouped.resample("W").mean()
	for column, summed in weekly_sums.items():
		weekly[column] = summed

	fig = get_figure(weekly, title, options)
	for event in events:
		add_vline_annotation(fig, event, textangle=-20)
	return fig

# def get_grouped_fig(dfs_dict: dict,
#                     symbols: list,
#                     title: str,
#                     events: list,
#                     options: Optional[dict] = None) -> go.Figure:
# 	dfs_dict_filtered = {symbol: dfs_dict[symbol] for symbol in symbols}
# 	df_n = pd.concat(dfs_dict_filtered.copy().values())
# 	# df_n = df_n.copy()
	
# 	df_n = get_grouped_df(df_n, start_date="2019-01-01")
	
# 	# resample to weekly
# 	volume = df_n["Volume"].resample("W").sum()
# 	volume_sma = df_n["SMA_volume"].resample("W").sum()
# 	df_n = df_n.resample("W").mean()
# 	df_n["Volume"] = volume
# 	df_n["SMA_volume"] = volume_sma
	
# 	# df_n = df_n.resample("W").agg({
# 	# 	"Open": "mean",
# 	# 	"High": "mean",
# 	# 	"Low": "mean",
# 	# 	"Close": "mean",
# 	# 	"Adj Close": "mean",
# 	# 	"Volume": "sum",
# 	# })
# 	# df_n = df_n.resample("W").agg({
# 	# 	"Open": "first",
# 	# 	"High": "max",
# 	# 	"Low": "min",
# 	# 	"Close": "last",
# 	# 	"Adj Close": "last",
# 	# 	"Volume": "sum",
# 	# })
# 	# df_n = get_grouped_df(df_n, start_date="2019-01-01")

# 	fig = get_figure(df_n, title, options)
# 	# for event in events:
# 	# 	add_vline_annotation(fig, event, textangle=-20)
# 	return fig

# Example with a single symbol (BTC-USD).
# The ticker is named explicitly instead of taking "the first key" of dfs_dict,
# so the title and filename (both say Bitcoin) cannot drift from the data plotted.
btc_symbol = "BTC-USD"
options = {
	"w": 1280,
	"h": 720,
	# "traces": ["candlestick"],
	"traces": [],
	"margin": {"r": 0, "t": 60, "b": 0, "l": 0},
	"labels": ["Price ($)", "Volume", "MACD"],
}
fig = get_grouped_fig(dfs_dict, [btc_symbol], f"Bitcoin ({btc_symbol}) price trend", events, options)
fig.show()

# Save the figure
path_fig = os.path.join(path_output_root, "bitcoin-price-trend.png")
save_fig(fig, path_fig, 3)

In [None]:
# Swap the price columns for their index-value counterparts: the original
# Open/High/Low/Close/Adj Close are dropped and the *_I columns take over their
# names. Volume is kept as it is.
index_to_price = {
	"Open_I": "Open",
	"High_I": "High",
	"Low_I": "Low",
	"Close_I": "Close",
	"Adj_Close_I": "Adj Close",
}
for symbol, df in dfs_dict.items():
	# Frames without index columns (e.g. already converted) are left as they are
	if "Open_I" not in df.columns:
		continue
	dfs_dict[symbol] = (
		df.drop(columns=list(index_to_price.values()))
		.rename(columns=index_to_price)
	)

# show btc from 2019-01-01 to 2019-01-05
dfs_dict["BTC-USD"].loc["2019-01-01":"2019-01-05"]

In [None]:
# # Impute extreme values (using a rolling window) by replacing them with the mean of the window
# def impute_extreme_values(df: pd.DataFrame, window: int) -> pd.DataFrame:
# 	df["Open"] = df["Open"].rolling(window=window, min_periods=1).mean()
# 	df["High"] = df["High"].rolling(window=window, min_periods=1).mean()
# 	df["Low"] = df["Low"].rolling(window=window, min_periods=1).mean()
# 	df["Close"] = df["Close"].rolling(window=window, min_periods=1).mean()
# 	df["Adj Close"] = df["Adj Close"].rolling(window=window, min_periods=1).mean()
# 	df["Volume"] = df["Volume"].rolling(window=window, min_periods=1).mean()
# 	return df

# for symbol, df in dfs_dict.items():
# 	df = impute_extreme_values(df, 7)
# 	dfs_dict[symbol] = df
# dfs_dict["BTC-USD"].loc["2019-01-01":"2019-01-05"]

In [None]:
def _impute_window_outliers(window: np.ndarray, threshold: float) -> np.ndarray:
	"""Return ``window`` with IQR outliers replaced by the mean of the other values."""
	median = np.median(window)
	iqr = np.percentile(window, 75) - np.percentile(window, 25)
	outliers = np.abs(window - median) > threshold * iqr
	# Nothing to do without outliers; nothing to average if everything is one
	if not outliers.any() or outliers.all():
		return window
	cleaned = window.copy()
	cleaned[outliers] = window[~outliers].mean()
	return cleaned


def remove_outliers_moving(arr: np.ndarray,
                           window_size: int = 7,
                           threshold: float = 3) -> np.ndarray:
	"""Replace outliers window by window and return a float array of the same length.

	The input is cut into consecutive, non-overlapping windows of
	``window_size`` elements, e.g. ``[1,2,3,4,5]`` with a window of 2 gives
	``[1,2]``, ``[3,4]`` and the remainder ``[5]``. A value is an outlier when
	``|value - median| > threshold * IQR`` of its window; it is replaced by the
	mean of the window's remaining values.

	A trailing remainder shorter than ``window_size`` is judged inside the last
	``window_size`` original values (i.e. padded with the values preceding it),
	but only the remainder itself is written back. An input shorter than one
	window is treated as a single window.

	Args:
		arr: one-dimensional numeric data; it is not modified.
		window_size: number of elements per window, at least 1.
		threshold: multiple of the window IQR beyond which a value is an outlier.

	Returns:
		A new float array with ``len(arr)`` elements.

	Raises:
		ValueError: if ``window_size`` is smaller than 1.
	"""
	if window_size < 1:
		raise ValueError("window_size must be a positive integer")
	values = np.asarray(arr, dtype=float)
	cleaned = values.copy()
	total = len(values)
	if total == 0:
		return cleaned

	# Full windows; statistics always come from the untouched input values
	full_windows = total // window_size
	for k in range(full_windows):
		start = k * window_size
		stop = start + window_size
		cleaned[start:stop] = _impute_window_outliers(values[start:stop], threshold)

	# Trailing remainder: evaluate it inside the last `window_size` values so the
	# statistics rest on a full window whenever enough data exists.
	# (Slicing clamps, so a too-short input simply yields the whole array.)
	remainder = total - full_windows * window_size
	if remainder:
		tail = _impute_window_outliers(values[-window_size:], threshold)
		cleaned[-remainder:] = tail[-remainder:]
	return cleaned

# Small demonstration: the 50 stands out in its window of four values
arr = np.array([1, 2, 3, 4, 5, 50, 7, 8, 9, 10])
window, threshold = 4, 3
print(list(arr))
arr = remove_outliers_moving(arr, window_size=window, threshold=threshold)
print(list(arr))

In [None]:
# # Remove outliers from all DataFrames
# for i, (symbol, df) in enumerate(dfs_dict.items()):
# 	print(f"{i+1}/{len(dfs_dict)} [{symbol}]                  ", end="\r")
# 	# processed = remove_outliers_moving(df["Close"].values,
# 	#                                    window_size=7,
# 	#                                    threshold=3)
# 	# df["Close"] = processed
# 	for col in ["Open", "High", "Low", "Close", "Adj Close", "Volume"]:
# 		processed = remove_outliers_moving(df[col].values, window_size=7, threshold=3)
# 		df[col] = processed
# 	dfs_dict[symbol] = df
# print("")

# # show btc from 2019-01-01 to 2019-01-05
# dfs_dict["BTC-USD"].loc["2019-01-01":"2019-01-05"]

In [None]:
# Faster alternative to the moving-window approach.

# Impute outliers in all DataFrames using the z-score: values whose absolute
# z-score exceeds the threshold are set to NaN and then filled by linear interpolation.
z_threshold = 20 # 3 is the stricter alternative - set depending on specific data
for i, (symbol, df) in enumerate(dfs_dict.items()):
	print(f"{i+1}/{len(dfs_dict)} [{symbol}]                  ", end="\r")
	for col in ["Open", "High", "Low", "Close", "Adj Close", "Volume"]:
		# nan_policy="omit": with the default ("propagate") a single existing NaN
		# turns every z-score of the column into NaN, so no outlier is ever found.
		z = np.abs(stats.zscore(df[col].to_numpy(dtype=float), nan_policy="omit"))
		# NaN z-scores compare as False, so missing values are never flagged here
		mask = np.asarray(z > z_threshold)
		# .loc avoids the "value is trying to be set on a copy of a slice" warning
		df.loc[mask, col] = np.nan
		# linear interpolation to fill NaN
		df[col] = df[col].interpolate()
	dfs_dict[symbol] = df

print("")
# show btc from 2019-01-01 to 2019-01-05
dfs_dict["BTC-USD"].loc["2019-01-01":"2019-01-05"]

In [None]:
# Select the symbols to aggregate
n = 1000  # cap on the number of symbols; set some big number (e.g. 10000) to get all
n = min(n, len(dfs_dict)) # ensure n is not bigger than number of symbols
sampling_type = "first"  # "random" or "first"
sampling_seed = 42  # fixed seed so a "random" selection is the same on every run
if sampling_type == "random":
	symbols = random.Random(sampling_seed).sample(list(dfs_dict.keys()), n)  # random n
elif sampling_type == "first":
	symbols = list(dfs_dict.keys())[:n]  # first n
else:
	raise ValueError("Invalid sampling_type")
# Report the sampling mode that was actually used
print(f"Selected {len(symbols)} {sampling_type} symbols:")
# Preview only the beginning of the selection
for i, symbol in enumerate(symbols):
	print(f"{symbol.replace('-USD', '')}, ", end="")
	if i > 10:
		print("")
		print("...")
		break
print("")
options = {
	"w": 1280,
	"h": 720,
	# "traces": ["candlestick"],
	"traces": [],
	"color_changes": False,
	"margin": {"r": 0, "t": 60, "b": 0, "l": 0},
	"labels": ["Index values", "Volume", "MACD"],
}
# Variant that resamples to weekly rows before grouping (get_grouped_fig_alt)
fig = get_grouped_fig_alt(dfs_dict.copy(), symbols, f"Market trend for {sampling_type} {n} cryptocurrencies | 2019-01-01 = 100", events, options)
fig.show()

# Save figure
filename = f"crypto-market-trends-{sampling_type}-{len(symbols)}-weekly"
path_output = os.path.join(path_output_root, f"{filename}.png")
save_fig(fig, path_output, scale=3)

In [None]:
# Select the symbols to aggregate
n = 1000  # cap on the number of symbols; set some big number (e.g. 10000) to get all
n = min(n, len(dfs_dict)) # ensure n is not bigger than number of symbols
sampling_type = "first"  # "random" or "first"
sampling_seed = 42  # fixed seed so a "random" selection is the same on every run
if sampling_type == "random":
	symbols = random.Random(sampling_seed).sample(list(dfs_dict.keys()), n)  # random n
elif sampling_type == "first":
	symbols = list(dfs_dict.keys())[:n]  # first n
else:
	raise ValueError("Invalid sampling_type")
# Report the sampling mode that was actually used
print(f"Selected {len(symbols)} {sampling_type} symbols:")
# Preview only the beginning of the selection
for i, symbol in enumerate(symbols):
	print(f"{symbol.replace('-USD', '')}, ", end="")
	if i > 10:
		print("")
		print("...")
		break
print("")
options = {
	"w": 1280,
	"h": 720,
	# "traces": ["candlestick"],
	"traces": [],
	"color_changes": False,
	"margin": {"r": 0, "t": 60, "b": 0, "l": 0},
	"labels": ["Index values", "Volume", "MACD"],
}
# Variant that groups first and resamples to weekly rows afterwards (get_grouped_fig)
fig = get_grouped_fig(dfs_dict.copy(), symbols, f"Market trend for {sampling_type} {n} cryptocurrencies | 2019-01-01 = 100", events, options)
fig.show()

# Save figure
filename = f"crypto-market-trends-{sampling_type}-{len(symbols)}-daily"
path_output = os.path.join(path_output_root, f"{filename}.png")
save_fig(fig, path_output, scale=3)

In [None]:
# Select the symbols to aggregate
n = 30  # CLT - Central Limit Theorem minimum number of samples; set some big number (e.g. 10000) to get all
n = min(n, len(dfs_dict)) # ensure n is not bigger than number of symbols
sampling_type = "random"  # "random" or "first"
sampling_seed = 42  # fixed seed so the random selection (and the saved figure) is the same on every run
if sampling_type == "random":
	symbols = random.Random(sampling_seed).sample(list(dfs_dict.keys()), n)  # random n
elif sampling_type == "first":
	symbols = list(dfs_dict.keys())[:n]  # first n
else:
	raise ValueError("Invalid sampling_type")
# Report the sampling mode that was actually used
print(f"Selected {len(symbols)} {sampling_type} symbols:")
# Preview only the beginning of the selection
for i, symbol in enumerate(symbols):
	print(f"{symbol.replace('-USD', '')}, ", end="")
	if i > 10:
		print("")
		print("...")
		break
print("")
options = {
	"w": 1280,
	"h": 720,
	# "traces": ["candlestick"],
	"traces": [],
	"color_changes": False,
	"margin": {"r": 0, "t": 60, "b": 0, "l": 0},
	"labels": ["Index values", "Volume", "MACD"],
}
# Variant that groups first and resamples to weekly rows afterwards (get_grouped_fig)
fig = get_grouped_fig(dfs_dict.copy(), symbols, f"Market trend for {sampling_type} {n} cryptocurrencies | 2019-01-01 = 100", events, options)
fig.show()

# Save figure
filename = f"crypto-market-trends-{sampling_type}-{len(symbols)}-daily"
path_output = os.path.join(path_output_root, f"{filename}.png")
save_fig(fig, path_output, scale=3)