### NOTE

Check sectors_industries.ipynb for a more up-to-date and optimized version of this notebook.

### TODO

- only load and then process before plotting aggregate data (change, bollinger bands, etc.)
- when loading data, keep roughly 30 extra days before the start date (warm-up for the Bollinger Bands), then cut them off when plotting
- filter out companies with fewer than X% of days with data (or all companies with first day less than 2019-01-01)
- count number of companies used to construct an aggregate
- aggregate with the mean for open, high, low, close and adjusted close; the sum for volume; similar for the other columns - only then calculate Bollinger Bands and other derived values, before plotting
- create utility functions for processing and plotting so they can be used with crypto data as well (same format)
- exclude industries with fewer than X companies
- port plotting logic from visualization.ipynb (from src/py/scraping/yahoo/visualization.ipynb)

In [None]:
import os
import json
import time
import pandas as pd
from plotly import subplots
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
from typing import Tuple

In [None]:
# Input locations of the scraped Yahoo data and the output folder for generated figures
path_stocks_csv_root = "data/scraped/yahoo/stocks/csv"
path_companies_index = "data/scraped/yahoo/sectors/index_stocks.json"
path_output_root = "data/analysis/yahoo/stocks"

# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(path_output_root, exist_ok=True)

In [None]:
def load_stock_csv(ticker: str) -> pd.DataFrame:
	'''
		Reads the CSV file of a ticker from path_stocks_csv_root and returns it unprocessed.
	'''
	file_name = ticker + ".csv"
	return pd.read_csv(os.path.join(path_stocks_csv_root, file_name))

def process_df(df: pd.DataFrame, options: dict) -> pd.DataFrame:
	'''
		Returns a processed copy of a raw stock dataframe, indexed by Date.

		The input dataframe is left untouched. It must contain the columns
		Date, Open, High, Low, Close, Adj Close and Volume. Added columns:
		- "Change" and "Change %": close minus open of the same row (absolute / percent of open)
		- "Change d-1" and "Change % d-1": close minus the previous row's close (absolute / percent of the previous close)

		options may contain "date_start" and/or "date_end" (both inclusive). The date
		filter is applied after the derived columns are computed, so the first kept
		row still has a "Change d-1" value whenever an earlier row exists.

		Conversion errors raised by pandas are not caught here (e.g. a missing
		Volume value cannot be cast to int).
	'''
	price_cols = ["Open", "High", "Low", "Close", "Adj Close"]
	# Work on a copy: assigning the converted Date column below would otherwise modify the caller's frame
	df = df.copy()
	# Convert Date to datetime and set as index
	df["Date"] = pd.to_datetime(df["Date"])
	df = df.set_index("Date")
	# Set Open, High, Low, Close, Adj Close to float and Volume to int
	df[price_cols] = df[price_cols].astype(float)
	df["Volume"] = df["Volume"].astype(int)
	# Intraday change (close vs. open), absolute and in percent of the open
	df["Change"] = df["Close"] - df["Open"]
	df["Change %"] = df["Change"] / df["Open"] * 100
	# Change from the previous row's close, absolute and in percent of that close
	df["Change d-1"] = df["Close"].diff()
	df["Change % d-1"] = df["Change d-1"] / df["Close"].shift(1) * 100
	# Filter by date (inclusive on both ends)
	if "date_start" in options:
		df = df[df.index >= options["date_start"]]
	if "date_end" in options:
		df = df[df.index <= options["date_end"]]
	return df

# Quick check on a single ticker: MSFT restricted to the first days of May 2020
df_msft = process_df(
	load_stock_csv("MSFT"),
	{"date_start": "2020-05-01", "date_end": "2020-05-05"},
)
df_msft.head(10)
# print(df_msft.dtypes)

In [None]:
# Reload MSFT with all rows from 2019-01-01 onwards
df_msft = load_stock_csv("MSFT")
df_msft = process_df(df_msft, {"date_start": "2019-01-01"})
df_msft.head()

In [None]:
# Plot MSFT close price (top) and % change from previous day (bottom)

# # Plot close price
# fig = px.line(df_msft, x=df_msft.index, y="Close", title="MSFT: Close Price")

# # With bars - positive change in green, negative change in red
# fig = px.bar(df_msft, x=df_msft.index, y="Change % d-1", title="MSFT: Daily Percent Change from Previous Day")
# fig.update_traces(marker_color=["green" if x >= 0 else "red" for x in df_msft["Change % d-1"]])

# Use graph objects to create two subfigures with shared x-axis
fig = subplots.make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

# Add traces
fig.add_trace(go.Scatter(x=df_msft.index, y=df_msft["Close"], name="Close"), row=1, col=1)

# Green for non-negative changes, red otherwise (a missing value compares False and ends up red).
# The colours are set on the Bar trace itself: a selector-less fig.update_traces(marker_color=...)
# would also overwrite the marker colour of the Close scatter trace.
bar_colors = ["green" if x >= 0 else "red" for x in df_msft["Change % d-1"]]
fig.add_trace(go.Bar(x=df_msft.index, y=df_msft["Change % d-1"], name="Daily Percent Change from Previous Day", marker_color=bar_colors), row=2, col=1)

# Add a dashed vertical line at 2020-01-30 with a "WHO Covid" annotation
d = datetime(2020, 1, 30)
y_shift = 140 # pixel offset that moves the label up from y=0.5
fig.add_vline(x=d, line_width=1, line_dash="dash", line_color="black")
fig.add_annotation(x=d, y=0.5, text="WHO Covid", showarrow=False, yshift=y_shift, font=dict(size=10))
# NOTE: use a short label such as "A1" if there is too much text and explain it in the legend or thesis text

# Add figure title
fig.update_layout(title_text="MSFT: Close Price and Daily Percent Change from Previous Day")

# Set x-axis title (only under the bottom subplot)
fig.update_xaxes(title_text="Date", row=2, col=1)

# Set y-axes titles
fig.update_yaxes(title_text="Close Price (USD)", row=1, col=1)
fig.update_yaxes(title_text="Daily % change d-1", row=2, col=1)


fig.show()

In [None]:
# Load the companies index and key it by ticker symbol
with open(path_companies_index, "r") as index_file: # context manager closes the file handle again
	companies = json.load(index_file)
companies = {company["symbol"]: company for company in companies}
companies = {k: v for k, v in companies.items() if "profile" in v} # remove companies without profile
print(f"Loaded {len(companies)} companies.")

In [None]:
def get_dfs(companies: dict, options: dict) -> Tuple[dict, list]:
	'''
		Loads and processes the stock dataframe of every company.

		companies maps ticker -> company record (only the tickers are used here);
		options is passed through to process_df.
		Returns (dfs, fails): a dict ticker -> processed dataframe and the list of
		tickers whose data could not be loaded or processed.
	'''
	dfs = {}
	fails = []
	total = len(companies)
	for i, ticker in enumerate(companies, start=1):
		# "\r" keeps the progress output on a single line
		print(f"Loading {ticker} ({i}/{total})", end="\r")
		try:
			dfs[ticker] = process_df(load_stock_csv(ticker), options)
		# Best effort: a ticker that cannot be loaded is recorded and skipped.
		# Exception instead of a bare except, so an interrupt still stops the loop.
		except Exception:
			fails.append(ticker)
	print("")
	return dfs, fails

# Load every company from 2019-01-01 onwards and report how long it took
time_start = time.time()
dfs, dfs_fails = get_dfs(companies=companies, options={"date_start": "2019-01-01"})
time_end = time.time()

print(f"Loaded {len(dfs)} dataframes in {time_end - time_start:.2f} seconds.")
print(f"Failed to load {len(dfs_fails)} dataframes...")


In [None]:
# List every ticker that get_dfs could not load, together with its company name from the index
print(f"Failed symbols:")
for ticker in dfs_fails:
	# The name comes from the companies index, so a failed ticker is still identifiable
	company_name = companies[ticker]["name"]
	print(f"{ticker}  ({company_name})")

In [None]:
# Add the columns Symbol, Sector, Industry to the start of each dataframe and concatenate them into one big dataframe
label_cols = ["Symbol", "Sector", "Industry"]
for symbol, df in dfs.items():
	if df.columns[:3].tolist() == label_cols: # ensure idempotence, manually reload if needed
		continue
	profile = companies[symbol]["profile"]
	# Look up all values before building the new frame, so a missing profile key cannot leave a half-labelled frame behind
	labels = {"Symbol": symbol, "Sector": profile["sector"], "Industry": profile["industry"]}
	# assign() returns a new frame instead of modifying the one stored in dfs
	labelled = df.assign(**labels)
	# Reorder by name rather than by position, so label columns that already exist cannot end up in the wrong place
	other_cols = [col for col in labelled.columns if col not in label_cols]
	dfs[symbol] = labelled[label_cols + other_cols]
# dfs["MSFT"].head()

df_all = pd.concat(dfs.values())
df_all.head()

In [None]:
# Print number of rows and columns of the combined dataframe
print(f"Number of rows: {df_all.shape[0]}")
print(f"Number of columns: {df_all.shape[1]}")
# Print size in memory (memory_usage() reports bytes; 1024**2 converts to MB)
print(f"Size in memory: {df_all.memory_usage().sum() / 1024**2:.2f} MB")

In [None]:
def get_dfs_grouped_sectors(df: pd.DataFrame) -> dict[str, pd.DataFrame]:
	'''
		Returns a dict of dataframes grouped by sector.

		df must be indexed by Date and contain a Sector column. For every sector the
		numeric columns are averaged per date across all rows of that sector.
		Sectors appear in the dict in order of first appearance; rows without a
		sector value are skipped.
	'''
	dfs_grouped_sectors = {}
	# A single pass over the frame; sort=False keeps the sectors in order of first appearance
	for sector, df_sector in df.groupby("Sector", sort=False):
		# Drop the label columns by name instead of by position, so the column order of df does not matter
		df_values = df_sector.drop(columns=["Symbol", "Sector", "Industry"], errors="ignore")
		# Mean of every remaining numeric column for each date
		dfs_grouped_sectors[sector] = df_values.groupby("Date").mean(numeric_only=True)
	return dfs_grouped_sectors

# Aggregate per sector and peek at the Technology sector
dfs_grouped_sectors = get_dfs_grouped_sectors(df=df_all)
df_sector_technology = dfs_grouped_sectors["Technology"]
df_sector_technology.head(5)

In [None]:
# Build the industry -> sector mapping and report every industry that shows up under more than one sector
industry_sector = {} # industry: sector mapping
unique_sectors = df_all["Sector"].unique().tolist()
for sector in unique_sectors:
	unique_industries = df_all.loc[df_all["Sector"] == sector, "Industry"].unique().tolist()
	for industry in unique_industries:
		if industry not in industry_sector:
			# First sector seen for this industry wins
			industry_sector[industry] = sector
			continue
		print(f"Industry {industry} is in multiple sectors: {industry_sector[industry]} and {sector}")

# Alternatively could use sectors index

# Summary: all sectors, then all industries with the sector they were first seen in
print(f"Number of sectors: {len(unique_sectors)}")
for sector in unique_sectors:
	print(f"  {sector}")
print("")
print(f"Number of industries: {len(industry_sector)}")
for industry in industry_sector:
	print(f"  {industry} ({industry_sector[industry]})")

In [None]:
def plot_df(df: pd.DataFrame, title: str, yaxis_title: str, yaxis2_title: str, yaxis2_col: str, yaxis2_color: str, yaxis2_showgrid: bool, yaxis2_range: list[float]) -> go.Figure:
	'''
		Plot a dataframe with two y-axes.

		The "Close" column is drawn as a line on the primary axis and yaxis2_col as
		bars on the secondary axis (green for values >= 0, red otherwise). A dashed
		vertical line with a "WHO Covid" annotation marks 2020-01-30.

		df: dataframe whose index is used for the x-axis; needs "Close" and yaxis2_col
		title: figure title
		yaxis_title: title of the primary y-axis
		yaxis2_title: title of the secondary y-axis, also used as the bar trace name
		yaxis2_col: column plotted as bars on the secondary y-axis
		yaxis2_color: currently unused (the bars are always green/red); kept so existing calls keep working
		yaxis2_showgrid: whether to draw the grid of the secondary y-axis
		yaxis2_range: [min, max] of the secondary y-axis

		Returns the figure without showing it.
	'''
	# Create figure with secondary y-axis
	fig = subplots.make_subplots(specs=[[{"secondary_y": True}]])
	# Add traces
	fig.add_trace(go.Scatter(x=df.index, y=df["Close"], name="Close"), secondary_y=False)
	# fig.add_trace(go.Scatter(x=df.index, y=df[yaxis2_col], name=yaxis2_title, line_color=yaxis2_color), secondary_y=True)
	# make it bars instead of line and color them with green where positive and red where negative
	fig.add_trace(go.Bar(x=df.index, y=df[yaxis2_col], name=yaxis2_title), secondary_y=True)
	# secondary_y=True limits the recolouring to the bar trace
	fig.update_traces(marker_color=["green" if x >= 0 else "red" for x in df[yaxis2_col]], secondary_y=True)
	
	# # draw a gray line plot connecting all consecutive positive bar values and fill the blanks with the last value if there isn't a consecutive positive value
	# positive_values = df[yaxis2_col].copy()
	# positive_values[positive_values < 0] = None
	# positive_values = positive_values.fillna(method="ffill")
	# fig.add_trace(go.Scatter(x=df.index, y=positive_values, name=f"{yaxis2_title} positive hull", line_color="gray"), secondary_y=True)
	# # draw a gray line plot connecting all consecutive negative bar values and fill the blanks with the last value if there isn't a consecutive negative value
	# negative_values = df[yaxis2_col].copy()
	# negative_values[negative_values > 0] = None
	# negative_values = negative_values.fillna(method="ffill")
	# fig.add_trace(go.Scatter(x=df.index, y=negative_values, name=f"{yaxis2_title} negative hull", line_color="gray"), secondary_y=True)
	
	# Add a dashed vertical line and an annotation at the Covid reference date
	d = datetime(2020, 1, 30) # WHO declares Covid-19 a public health emergency of international concern
	# d = datetime(2020, 3, 11) # WHO declares Covid-19 a pandemic
	y_shift = 280 # pixel offset that moves the label up from y=0.5
	fig.add_vline(x=d, line_width=1, line_dash="dash", line_color="black")
	fig.add_annotation(x=d, y=0.5, text="WHO Covid", showarrow=False, yshift=y_shift, font=dict(size=10))
	# NOTE: use a short label such as "A1" if there is too much text and explain it in the legend or thesis text
	# Add figure title
	fig.update_layout(title_text=title)
	# Set x-axis title
	fig.update_xaxes(title_text="Date")
	# Set y-axes titles (the primary title comes from yaxis_title instead of a hard-coded string)
	fig.update_yaxes(title_text=yaxis_title, secondary_y=False)
	fig.update_yaxes(title_text=yaxis2_title, secondary_y=True, showgrid=yaxis2_showgrid, range=yaxis2_range)
	# Show figure
	# fig.show()
	return fig

# Technology sector aggregate: close price line with daily % change bars on the secondary axis
plot_df(
	df=df_sector_technology,
	title="Technology Sector: Close Price and Daily Percent Change from Previous Day",
	yaxis_title="Close Price (USD)",
	yaxis2_title="Daily % change d-1",
	yaxis2_col="Change % d-1",
	yaxis2_color="red",
	yaxis2_showgrid=True,
	yaxis2_range=[-10, 10],
)


In [None]:
def get_dfs_grouped_industries(df: pd.DataFrame) -> dict[str, pd.DataFrame]:
	'''
		Returns a dict of dataframes grouped by industry.

		df must be indexed by Date and contain an Industry column. For every industry
		the numeric columns are averaged per date across all rows of that industry.
		Industries appear in the dict in order of first appearance; rows without an
		industry value are skipped.
	'''
	dfs_grouped_industries = {}
	# A single pass over the frame; sort=False keeps the industries in order of first appearance
	for industry, df_industry in df.groupby("Industry", sort=False):
		# Drop the label columns by name instead of by position, so the column order of df does not matter
		df_values = df_industry.drop(columns=["Symbol", "Sector", "Industry"], errors="ignore")
		# Mean of every remaining numeric column for each date
		dfs_grouped_industries[industry] = df_values.groupby("Date").mean(numeric_only=True)
	return dfs_grouped_industries

# Aggregate per industry and peek at the Software—Application industry
dfs_grouped_industries = get_dfs_grouped_industries(df=df_all)
df_industry_software = dfs_grouped_industries["Software—Application"]
df_industry_software.head(5)

In [None]:
# Software—Application industry aggregate: close price line with daily % change bars on the secondary axis
plot_df(
	df=df_industry_software,
	title="Software—Application Industry: Close Price and Daily Percent Change from Previous Day",
	yaxis_title="Close Price (USD)",
	yaxis2_title="Daily % change d-1",
	yaxis2_col="Change % d-1",
	yaxis2_color="red",
	yaxis2_showgrid=True,
	yaxis2_range=[-10, 10],
)

In [None]:
def get_df_grouped_all(df: pd.DataFrame) -> pd.DataFrame:
	'''
		Returns a dataframe with all stocks grouped by date.

		df must be indexed by Date. The numeric columns are averaged per date
		across all rows.
	'''
	# Drop the label columns by name instead of by position, so the column order of df does not matter
	df_values = df.drop(columns=["Symbol", "Sector", "Industry"], errors="ignore")
	# Mean of every remaining numeric column for each date
	return df_values.groupby("Date").mean(numeric_only=True)

# Aggregate over all stocks and plot the result
df_grouped_all = get_df_grouped_all(df=df_all)

plot_df(
	df=df_grouped_all,
	title="All Stocks: Close Price and Daily Percent Change from Previous Day",
	yaxis_title="Close Price (USD)",
	yaxis2_title="Daily % change d-1",
	yaxis2_col="Change % d-1",
	yaxis2_color="red",
	yaxis2_showgrid=True,
	yaxis2_range=[-10, 10],
)

In [None]:
def save_fig(fig: go.Figure, name: str) -> None:
	'''
		Save a figure as a png file in path_output_root.

		Spaces and the characters / ' , ( ) in name are replaced with "-" to build the file name.
	'''
	to_dash = str.maketrans({char: "-" for char in " /',()"})
	file_name = name.translate(to_dash) + ".png"
	fig.write_image(os.path.join(path_output_root, file_name))



In [None]:
# Create and save one figure per sector
for sector, df in dfs_grouped_sectors.items():
	print(f"Creating figure for sector {sector}...", end="\r", flush=True)
	fig = plot_df(
		df,
		f"{sector} Sector: Close Price and Daily Percent Change from Previous Day",
		yaxis_title="Close Price (USD)",
		yaxis2_title="Daily % change d-1",
		yaxis2_col="Change % d-1",
		yaxis2_color="red",
		yaxis2_showgrid=True,
		yaxis2_range=[-10, 10],
	)
	save_fig(fig, f"sector-{sector}")

# Create and save one figure per industry; the file name also carries the sector of the industry
for industry, df in dfs_grouped_industries.items():
	print(f"Creating figure for industry {industry}...", end="\r", flush=True)
	fig = plot_df(
		df,
		f"{industry} Industry: Close Price and Daily Percent Change from Previous Day",
		yaxis_title="Close Price (USD)",
		yaxis2_title="Daily % change d-1",
		yaxis2_col="Change % d-1",
		yaxis2_color="red",
		yaxis2_showgrid=True,
		yaxis2_range=[-10, 10],
	)
	save_fig(fig, f"sector-{industry_sector[industry]}-industry-{industry}")

# Create and save the figure for all stocks
print(f"Creating figure for all stocks...", flush=True)
fig = plot_df(
	df_grouped_all,
	"All Stocks: Close Price and Daily Percent Change from Previous Day",
	yaxis_title="Close Price (USD)",
	yaxis2_title="Daily % change d-1",
	yaxis2_col="Change % d-1",
	yaxis2_color="red",
	yaxis2_showgrid=True,
	yaxis2_range=[-10, 10],
)
save_fig(fig, "all-stocks")

print("")
print("Done.")