In [None]:
import os
import json
import pandas as pd
import numpy as np
import mplfinance as mpf
import matplotlib as mpl
# import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime as dt
from typing import Union

In [None]:
# Locations used throughout the notebook (relative ones resolve against the working directory).
csv_path_root = 'data/scraped/yahoo/stocks'
index_path = os.path.join('src/py/scraping/yahoo', 'index.json')
# Absolute path of the scraping script folder; figure exports are written there.
script_path = os.path.join(os.path.abspath(""), "src", "py", "scraping", "yahoo")

In [None]:
# Load data (single)
def get_df(csv_path: str) -> pd.DataFrame:
	"""
	Reads one CSV file into a DataFrame.

	The first column becomes the index and pandas is asked to parse it as dates.
	"""
	return pd.read_csv(csv_path, index_col=0, parse_dates=True)

# Ticker to inspect; its data is read from <csv_path_root>/<symbol>.csv
symbol = 'NVDA'
csv_path = os.path.join(csv_path_root, f"{symbol}.csv")
df = get_df(csv_path)
# Print the dtype of the index (the CSV's first column) to check that the dates were parsed
print(df.index.dtype)
df.head()

In [None]:
# Full-history overview as a line chart with a volume panel and mav=(3, 6, 9) moving averages.
# Candlestick alternative: mpf.plot(df, type='candle', volume=True, style='yahoo')
mpf.plot(df, type='line', volume=True, style='yahoo', mav=(3, 6, 9), show_nontrading=True) # use type='candle' for candlesticks

In [None]:
# Last 365 rows of the frame (rows, not calendar days)
df_last_365 = df.iloc[-365:]

mpf.plot(df_last_365, type='candle', volume=True, style='yahoo', mav=(3, 6, 9), show_nontrading=True) # candlesticks + volume panel

In [None]:
# Zoom in on the most recent rows; n_last is a row count, not a number of calendar days
n_last = 30
df_last_n = df.iloc[-n_last:]
mpf.plot(df_last_n, type='candle', volume=True, style='yahoo', mav=(3, 6, 9), show_nontrading=True) # candlesticks + volume panel


In [None]:
def compute_rsi(prices: pd.Series, n: int = 14) -> pd.Series:
	"""
	Computes the Relative Strength Index (RSI) for given prices.

	The first `n` values are filled with the RSI of the seed window (the first
	`n` price changes); every later value uses the smoothed running averages of
	gains and losses.

	Args:
		prices: Price series (any numeric dtype); its index is reused for the result.
		n: Smoothing period, must be greater than 1.

	Returns:
		A float Series named 'RSI' with the same length and index as `prices`.
		The value is 100 while no loss has been observed and NaN while there has
		been neither a gain nor a loss.
	"""
	assert n > 1
	# Work on a float ndarray: indexing is then purely positional (no dependence on
	# the Series' labels) and integer prices can no longer truncate the result.
	values = np.asarray(prices, dtype=float)
	deltas = np.full(values.shape, np.nan)
	deltas[1:] = np.diff(values)  # deltas[i] = values[i] - values[i - 1]; deltas[0] stays NaN
	rsi = np.zeros(len(values), dtype=float)

	# A zero average loss makes up / down evaluate to +/-inf (RSI 100) or NaN;
	# that is the intended result, so silence the floating-point warnings.
	with np.errstate(divide='ignore', invalid='ignore'):
		seed = deltas[:n + 1]
		up = seed[seed >= 0].sum() / n
		down = -seed[seed < 0].sum() / n
		rsi[:n] = 100. - 100. / (1. + up / down)

		for i in range(n, len(values)):
			delta = deltas[i]
			if delta > 0:
				upval = delta
				downval = 0.
			else:
				upval = 0.
				downval = -delta

			# Smoothed running averages of gains and losses
			up = (up * (n - 1) + upval) / n
			down = (down * (n - 1) + downval) / n
			rsi[i] = 100. - 100. / (1. + up / down)

	index = prices.index if isinstance(prices, pd.Series) else None
	return pd.Series(rsi, index=index, name='RSI')

def add_columns(df: pd.DataFrame, window: int = 30, num_std: float = 2.0) -> pd.DataFrame:
	"""
	Adds technical-indicator columns to `df` IN PLACE and returns the same frame.

	Columns written (in this order): 'SMA', 'Upper', 'Lower' (Bollinger Bands on
	'Close'), 'SMA_volume', 'MA3', 'MA6', 'MA9', 'SMA_50', 'SMA_200', 'EMA_50',
	'EMA_200', 'EMA_12', 'EMA_26', 'MACD', 'SIGNAL' and 'RSI'.

	Args:
		df: Frame with at least the 'Close' and 'Volume' columns. It is mutated;
			pass a copy if the original must stay untouched.
		window: Rolling window used for the Bollinger Bands and the volume average.
		num_std: Band half-width in rolling standard deviations.

	Returns:
		The same `df` object, for convenience.
	"""
	close = df['Close']

	# Bollinger Bands: rolling mean +/- num_std rolling standard deviations.
	# Rolling values are NaN until a full window of rows is available.
	sma = close.rolling(window=window).mean()
	std = close.rolling(window=window).std()
	df.loc[:, 'SMA'] = sma
	df.loc[:, 'Upper'] = sma + num_std * std
	df.loc[:, 'Lower'] = sma - num_std * std
	df.loc[:, 'SMA_volume'] = df['Volume'].rolling(window=window).mean()

	# Simple moving averages of the close
	for period in (3, 6, 9):
		df.loc[:, f'MA{period}'] = close.rolling(period).mean()
	for period in (50, 200):
		df.loc[:, f'SMA_{period}'] = close.rolling(period).mean()

	# Exponential moving averages of the close
	for span in (50, 200, 12, 26):
		df.loc[:, f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

	# MACD line and its 9-span EMA signal line
	df.loc[:, 'MACD'] = df['EMA_12'] - df['EMA_26']
	df.loc[:, 'SIGNAL'] = df['MACD'].ewm(span=9, adjust=False).mean()
	df.loc[:, 'RSI'] = compute_rsi(close)

	return df

last_n = 90
# add_columns writes into the frame it receives, so work on an explicit copy:
# mutating a bare iloc slice of df raises SettingWithCopyWarning and leaves it
# ambiguous whether df itself is modified.
df_last_n = df.iloc[-last_n:].copy()
# NOTE: indicators are computed from these rows only, so rolling columns whose
# window exceeds the rows available so far are NaN.
add_columns(df_last_n)
df_last_n.head(15)

In [None]:
# fig, axes = mpf.plot(df_last_n, type='candle', mav=(3, 6, 9), volume=True, style='yahoo', title=symbol, figratio=(16, 9), figscale=1.5, returnfig=True)


# # axes[0].plot(df_last_n['SMA_50'], color='orange', label='SMA_50')
# # axes[0].plot(df_last_n['SMA_200'], color='red', label='SMA_200')
# # axes[0].plot(df_last_n['EMA_50'], color='blue', label='EMA_50')
# # axes[0].plot(df_last_n['EMA_200'], color='green', label='EMA_200')
# # axes[0].legend(loc='upper left')

# # axes[1].plot(df_last_n['RSI'], color='orange', label='RSI')
# # axes[1].legend(loc='upper left')

# # axes[2].plot(df_last_n['MACD'], color='orange', label='MACD')
# # axes[2].plot(df_last_n['SIGNAL'], color='red', label='SIGNAL')

# # axes[2].legend(loc='upper left')

# fig.show()

In [None]:
def custom_figure(df: pd.DataFrame, title: str, currency: str, width: int = 1280, height: int = 720) -> go.Figure:
	"""
	Builds a two-row Plotly figure: candlesticks with Bollinger Bands on top,
	colored volume bars with their moving average below. Both rows share the x axis.

	Args:
		df: Frame whose index is used as the x axis. Must contain 'Open', 'High',
			'Low', 'Close', 'Volume', 'SMA', 'Upper', 'Lower' and 'SMA_volume'
			(the last four are written by add_columns).
		title: Figure title.
		currency: Currency label shown in the price axis title.
		width: Figure width in pixels.
		height: Figure height in pixels.

	Returns:
		The assembled figure (not shown or saved here).

	Raises:
		KeyError: If any required column is missing from `df`.
	"""
	required_columns = ("Open", "High", "Low", "Close", "Volume", "SMA", "Upper", "Lower", "SMA_volume")
	missing_columns = [column for column in required_columns if column not in df.columns]
	if missing_columns:
		raise KeyError(f"custom_figure: missing columns {missing_columns}; run add_columns(df) first")

	overlay_color = 'rgba(0,0,0,0.5)'
	overlay_opacity = 0.3
	increasing_color = '#428561'
	decreasing_color = '#D53C36'

	# Price row gets 80% of the height, volume row the remaining 20%
	fig = make_subplots(
		rows=2,
		cols=1,
		shared_xaxes=True,
		vertical_spacing=0.01,
		row_heights=[0.8, 0.2],
	)

	# Bollinger Bands (top row)
	sma_band_trace = go.Scatter(
		x=df.index,
		y=df["SMA"],
		mode='lines',
		name='SMA',
		line={'color': overlay_color},
		opacity=overlay_opacity,
	)
	upper_band_trace = go.Scatter(
		x=df.index,
		y=df["Upper"],
		mode='lines',
		name='Upper Bollinger Band',
		opacity=overlay_opacity,
		line={'dash': 'dash', 'color': overlay_color},
	)
	# fill='tonexty' shades up to the trace added immediately before this one,
	# so the lower band must be added right after the upper band.
	lower_band_trace = go.Scatter(
		x=df.index,
		y=df["Lower"],
		mode='lines',
		name='Lower Bollinger Band',
		opacity=overlay_opacity,
		line={'dash': 'dash', 'color': overlay_color},
		fill='tonexty',
		fillcolor='rgba(0,0,0,0.125)',
	)

	# Candlesticks (top row)
	candlestick = go.Candlestick(
		x=df.index,
		open=df["Open"],
		high=df["High"],
		low=df["Low"],
		close=df["Close"],
		name='Candlestick',
	)

	# Volume bars (bottom row): one color per bar, "increasing" only when close > open
	colors = [
		increasing_color if close_price > open_price else decreasing_color
		for close_price, open_price in zip(df["Close"], df["Open"])
	]
	volume_bar = go.Bar(
		x=df.index,
		y=df["Volume"],
		name='Volume',
		marker={'color': colors},
		opacity=1,
	)

	# Moving average for volume (bottom row)
	volume_sma_trace = go.Scatter(
		x=df.index,
		y=df["SMA_volume"],
		mode='lines',
		name='Volume SMA',
		line={'color': overlay_color},
		opacity=overlay_opacity,
	)

	# Add traces to figure (order matters for the band fill, see above).
	# NOTE (original author): the Bollinger Bands still end up drawn on top of everything else.
	fig.add_trace(upper_band_trace, row=1, col=1)
	fig.add_trace(lower_band_trace, row=1, col=1)
	fig.add_trace(sma_band_trace, row=1, col=1)
	fig.add_trace(candlestick, row=1, col=1)
	fig.add_trace(volume_bar, row=2, col=1)
	fig.add_trace(volume_sma_trace, row=2, col=1)

	# Axis titles
	fig.update_xaxes(title_text='Date', row=2, col=1)
	fig.update_yaxes(title_text=f"Price ({currency})", row=1, col=1)
	fig.update_yaxes(title_text='Volume', row=2, col=1)

	fig.update_layout(
		title_text=title,
		autosize=False,
		width=width,
		height=height,
		showlegend=True,
		# Hide the x-axis range slider that candlestick traces add by default
		xaxis_rangeslider_visible=False,
	)

	return fig

def add_event(fig: go.Figure, event_date: Union[dt, str], event_name: str, line_color: str = "purple", line_width: int = 1) -> go.Figure:
	"""
	Marks an event on the figure's first subplot and lists it in the legend.

	A dash-dot vertical line is drawn at `event_date` in row 1 / col 1. An extra
	point-less scatter trace with the same color and dash style is added so that
	`event_name` appears in the legend. Returns the same figure object.
	"""
	marker_style = dict(line_width=line_width, line_dash="dashdot", line_color=line_color)
	fig.add_vline(x=event_date, row=1, col=1, **marker_style)

	# Legend proxy: no data points, only the line style (drawn at a fixed width of 2)
	legend_line = dict(color=line_color, width=2, dash="dashdot")
	fig.add_scatter(x=[None], y=[None], mode='lines', line=legend_line, name=event_name)
	return fig

# Compute the indicator columns that custom_figure reads (add_columns mutates df in place)
add_columns(df)

fig = custom_figure(df, f"Stock data for {symbol}", "USD")

# Events to mark on the chart
event_covid_lockdown = dict(
	event_name="COVID lockdown (US)",
	event_date=dt(2020, 3, 19),
	line_color="rgba(255,0,255,0.5)",
)

# https://news.microsoft.com/2019/07/22/openai-forms-exclusive-computing-partnership-with-microsoft-to-build-new-azure-ai-supercomputing-technologies/
event_microsoft_openai = dict(
	event_name="Microsoft OpenAI exclusive",
	event_date=dt(2019, 7, 22),
	line_color="orange",
)

# https://blogs.microsoft.com/blog/2023/01/23/microsoftandopenaiextendpartnership/
event_microsoft_openai_extended = dict(
	event_name="Microsoft OpenAI extended",
	event_date=dt(2023, 1, 23),
	line_color="lime",
)

# Same order as the legend entries should appear
for event in (event_covid_lockdown, event_microsoft_openai, event_microsoft_openai_extended):
	add_event(fig, event["event_date"], event["event_name"], event["line_color"])

fig.show()
# fig.write_html(os.path.join(script_path, 'visualization-stock-test.html'))
fig.write_image(os.path.join(script_path, 'visualization-stock-test.png'), scale=2)

In [None]:
def get_count_object_template() -> dict:
	"""
	Returns a fresh per-date accumulator with every field set to 0.

	Keys: 'Count' plus 'Open', 'Close', 'High', 'Low', 'Volume'.
	"""
	fields = ("Count", "Open", "Close", "High", "Low", "Volume")
	return dict.fromkeys(fields, 0)

def dict_to_df(d: dict) -> pd.DataFrame:
	"""
	Turns a {key: {column: value}} mapping into a DataFrame with one row per key.

	The index is named 'Date' and the rows are sorted by index.
	"""
	return (
		pd.DataFrame.from_dict(d, orient='index')
		.rename_axis('Date')
		.sort_index()
	)

def remove_incomplete_dates(df: pd.DataFrame, count_total: int) -> pd.DataFrame:
	"""
	Keeps only the rows whose 'Count' equals `count_total`, i.e. the dates
	that are present in all stocks.
	"""
	is_complete = df["Count"].eq(count_total)
	return df[is_complete]

def remove_sparse_dates_n(df: pd.DataFrame, count_total: int, n: int = 5) -> pd.DataFrame:
	"""
	Keeps only the rows whose 'Count' is at least `n`, i.e. drops dates that
	are present in fewer than `n` of the stocks.

	`count_total` is accepted but not used by this filter.
	"""
	has_enough = df["Count"].ge(n)
	return df[has_enough]

def remove_sparse_dates_perc(df: pd.DataFrame, count_total: int, n: float = 0.5) -> pd.DataFrame:
	"""
	Drops dates that are present in less than the fraction `n` of the stocks.

	Args:
		df: Frame with a 'Count' column (number of stocks that have the date).
		count_total: Total number of stocks.
		n: Required share of stocks as a FRACTION (0.5 means 50%), not a
			percentage; rows with Count >= count_total * n are kept.

	Returns:
		The filtered frame; `df` itself is not modified.
	"""
	min_count = count_total * n
	return df[df["Count"] >= min_count]

def merge_stocks(csv_path_root: str, index_path: str, min_stocks: int = 5) -> pd.DataFrame:
	'''
		Loads every .csv file in `csv_path_root` and averages them per date.

		Symbols can cover different ranges, and even inside a common range not
		every day is shared, so each date keeps a 'Count' of contributing stocks.

		Args:
			csv_path_root: Folder containing one CSV per symbol.
			index_path: Currently unused; kept so existing calls keep working.
			min_stocks: Dates present in fewer than this many stocks are dropped.

		Returns:
			Frame indexed by date (DatetimeIndex named 'Date') with the columns
			'Count', 'Open', 'Close', 'High', 'Low', 'Volume', 'Volume_average'.
			The price columns are per-date averages over the contributing
			stocks; 'Volume' and 'Volume_average' both hold the average volume.

		Raises:
			FileNotFoundError: If the folder contains no .csv file.
	'''
	value_columns = ["Open", "Close", "High", "Low", "Volume"]

	# Sorted so the processing order (and the floating-point summation order) is reproducible
	files = sorted(file for file in os.listdir(csv_path_root) if file.endswith('.csv'))
	if not files:
		raise FileNotFoundError(f"No .csv files found in '{csv_path_root}'")

	frames = []
	for i, file in enumerate(files):
		# splitext keeps dotted tickers intact ("BRK.B.csv" -> "BRK.B")
		symbol = os.path.splitext(file)[0]
		print(f"Processing '{symbol}' ({i+1}/{len(files)})")
		df_new = get_df(os.path.join(csv_path_root, file))
		daily = df_new[value_columns].copy()
		# Key rows by calendar day so that stocks line up on the same date
		daily.index = [date.strftime("%Y-%m-%d") for date in df_new.index]
		daily.insert(0, "Count", 1)
		frames.append(daily)

	# One vectorized group-by instead of accumulating row by row
	combined = pd.concat(frames)
	df = combined.groupby(level=0).sum()

	# A missing value in any contributing stock invalidates the whole date
	# (a running sum that includes NaN would be NaN as well)
	has_missing = combined[value_columns].isna().any(axis=1).groupby(level=0).any()
	df = df[~has_missing]
	df = remove_sparse_dates_n(df, len(files), n=min_stocks)
	# Keep only dates whose summed values are all strictly positive
	df = df[(df[value_columns] > 0).all(axis=1)].copy()

	# Turn the sums into averages
	df[value_columns] = df[value_columns].div(df["Count"], axis=0)
	df["Volume_average"] = df["Volume"] # 'Volume' now holds the average as well

	# Convert Date to datetime
	df.index = pd.to_datetime(df.index)
	df.index.name = 'Date'
	return df

df_merged = merge_stocks(csv_path_root, index_path)
print(df_merged.head())
# Print info about dataframe.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df_merged.info()

In [None]:
# print entire dataframe
# pd.set_option('display.max_rows', None)
# print(df_merged)

In [None]:
# Compute the indicator columns on the averaged frame (add_columns mutates it in place)
add_columns(df_merged)

# TODO: edit this if you decide to use Index for filtering specific stocks by industry / sector
num_stocks = sum(1 for file in os.listdir(csv_path_root) if file.endswith('.csv'))

fig = custom_figure(df_merged, f"Averaged stock data for {num_stocks} stocks", "USD")

# Events to mark on the chart
event_covid_lockdown = dict(
	event_name="COVID lockdown (US)",
	event_date=dt(2020, 3, 19),
	line_color="rgba(255,0,255,0.5)",
)

# https://news.microsoft.com/2019/07/22/openai-forms-exclusive-computing-partnership-with-microsoft-to-build-new-azure-ai-supercomputing-technologies/
event_microsoft_openai = dict(
	event_name="Microsoft OpenAI exclusive",
	event_date=dt(2019, 7, 22),
	line_color="orange",
)

# https://blogs.microsoft.com/blog/2023/01/23/microsoftandopenaiextendpartnership/
event_microsoft_openai_extended = dict(
	event_name="Microsoft OpenAI extended",
	event_date=dt(2023, 1, 23),
	line_color="lime",
)

# Same order as the legend entries should appear
for event in (event_covid_lockdown, event_microsoft_openai, event_microsoft_openai_extended):
	add_event(fig, event["event_date"], event["event_name"], event["line_color"])

fig.show()
# fig.write_html(os.path.join(script_path, 'visualization-stock-test.html'))
fig.write_image(os.path.join(script_path, 'visualization-stock-merged.png'), scale=2)