In [519]:
import pandas as pd
import yfinance as yf
from yahoofinancials import YahooFinancials
import plotly.express as px
import plotly.io as pio
# Make "notebook_connected" the default Plotly renderer for every figure shown below.
pio.renderers.default = "notebook_connected"


In [518]:
def download_data(symbol: str, start_year: str, end_year: str):
    '''Download the price history of `symbol` for whole calendar years.

    Args:
        symbol: ticker symbol passed straight to `yf.download`.
        start_year: first year to fetch (e.g. '2001'); fetched from January 1st.
        end_year: last year to fetch (e.g. '2020'); fetched through December 31st.

    Returns:
        Whatever `yf.download` returns for that date window.
    '''
    first_day = f'{int(start_year)}-01-01'
    # yfinance treats `end` as an exclusive bound, so asking for '<end_year>-12-31'
    # would drop the December 31st session. Request up to January 1st of the
    # following year to keep the whole of `end_year`.
    day_after_last = f'{int(end_year) + 1}-01-01'
    return yf.download(
        symbol,
        start=first_day,
        end=day_after_last,
        progress=False,
    )

In [520]:
# In-memory cache keyed by symbol; each entry is a (data, start_year, end_year) tuple.
cache = {}

def relates(section: tuple, range: tuple):
    '''Classify how the interval `section` is positioned relative to `range`.

    Both arguments are (start, end) pairs. Returns one of:
        'contained'     - section lies inside range (bounds may coincide)
        'contains'      - section strictly exceeds range on both sides
        'extends_left'  - section starts before range and ends strictly inside it
                          or exactly at its end
        'extends_right' - section starts inside range (before its end) and ends after it
        'differ'        - anything else (disjoint, or only touching at a boundary)
    '''
    sec_start, sec_end = section[0], section[1]
    rng_start, rng_end = range[0], range[1]

    starts_before = sec_start < rng_start
    ends_after = sec_end > rng_end

    if not starts_before and not ends_after:
        return 'contained'
    if starts_before and ends_after:
        return 'contains'
    if starts_before:
        # Ends at or before the range end: only an overlap counts as an extension.
        return 'extends_left' if sec_end > rng_start else 'differ'
    # Starts at or after the range start and ends after the range end.
    return 'extends_right' if sec_start < rng_end else 'differ'

def from_cache(symbol, start, end):
    '''Return cached data for `symbol` covering the years `start`..`end`, or None.

    A hit requires the requested years to be 'contained' (per `relates`) in the
    cached year range. On a hit only the rows of the requested years are
    returned, so the result does not depend on how wide the cached range
    happens to be.

    Args:
        symbol: cache key.
        start, end: first and last requested year (anything `int()` accepts).

    Returns:
        The requested slice of the cached frame, or None on a miss.
    '''
    start = int(start); end = int(end)
    cached, range_start, range_end = cache.get(symbol, (None, None, None))
    if cached is None:
        return None
    if relates((start, end), (range_start, range_end)) != 'contained':
        return None
    # Year-string slicing assumes the cached frame has a sorted DatetimeIndex
    # (pandas partial-string indexing); both bounds are inclusive whole years.
    return cached.loc[str(start):str(end)]

def save_to_cache(data, symbol, start, end):
    '''Store `data` (covering years `start`..`end`) in the cache for `symbol`.

    The cache keeps one entry per symbol. Depending on how the new year range
    relates to the cached one (see `relates`):
        - no entry yet:                 store the new data
        - 'contains':                   replace the entry with the bigger section
        - 'extends_left'/'extends_right': combine both frames and widen the range
        - 'contained' / 'differ':       leave the existing entry untouched

    Args:
        data: DataFrame to cache.
        symbol: cache key.
        start, end: first and last year covered by `data` (anything `int()` accepts).
    '''
    start = int(start); end = int(end)
    cached, range_start, range_end = cache.get(symbol, (None, None, None))
    if cached is None:
        cache[symbol] = (data, start, end)
        return

    relation = relates((start, end), (range_start, range_end))
    if relation == 'contains':
        cache[symbol] = (data, start, end)  # keep in cache the bigger section
    elif relation in ('extends_left', 'extends_right'):
        # Stack both frames, keep a single row per index label (the cached one
        # wins on overlap) and restore chronological order. The result must be
        # stored: the combined frame is a new object, not an in-place update.
        combined = pd.concat([cached, data])
        combined = combined[~combined.index.duplicated(keep='first')].sort_index()
        cache[symbol] = (combined, min(start, range_start), max(end, range_end))
        
def get_data(symbol, start, end):
    '''Return data for `symbol` over the years `start`..`end`.

    The cache is consulted first; on a miss the data is downloaded. Either
    way the result is offered to `save_to_cache` before being returned.
    '''
    cached_hit = from_cache(symbol, start, end)
    prices = download_data(symbol, start, end) if cached_hit is None else cached_hit
    save_to_cache(prices, symbol, start, end)
    return prices

def clear_cache():
    '''Remove every entry from the module-level `cache`.

    The dict is emptied in place: assigning `cache = dict()` inside the
    function would only bind a new local name and leave the shared cache
    untouched.
    '''
    cache.clear()

In [None]:
# Cache tests
# Reference: fetch the full 2001-2020 range once.
d1 = get_data('^GSPC', '2001', '2020')
# Ask for a cache reset, then replay a sequence of partially overlapping
# year ranges; the order of these calls matters for the cache state.
clear_cache()
get_data('^GSPC', '2007', '2011')
get_data('^GSPC', '2011', '2015')
get_data('^GSPC', '2014', '2020')
get_data('^GSPC', '2001', '2015')
# Requesting the full range again must yield exactly the reference data.
d2 = get_data('^GSPC', '2001', '2020')
assert d1.equals(d2)

In [532]:
def get_close_prices_of(prices, month, year):
    '''Return the 'Close' values of the rows of `prices` labelled '<year>-<month>'.'''
    month_label = f'{year}-{month}'
    rows_of_month = prices.loc[month_label]
    return rows_of_month['Close']

def deviation_of(data, month, year):
    '''Build a per-day deviation table for one month.

    Returns a new DataFrame, indexed by day of month ('day'), holding the
    close price, its distance from the monthly mean ('deviation') and that
    distance as a percentage of the mean ('deviation_norm'). The month and
    year are recorded in the frame's `attrs`.
    '''
    table = pd.DataFrame(get_close_prices_of(data, month, year))

    # Both values are derived before any column is added, so they only
    # reflect the close prices.
    monthly_mean = table.mean()
    offset = table - monthly_mean

    table['deviation'] = offset
    table['deviation_norm'] = offset / monthly_mean * 100

    table.attrs['month'] = month
    table.attrs['year'] = year

    # Replace the date labels by their day-of-month number.
    table.index = table.index.map(lambda stamp: stamp.day).rename('day')
    return table

In [454]:
# Create deviations DF over all years
def deviation_over_period(prices, start_year, end_year) -> pd.DataFrame:
    '''Collect the normalised daily deviations of every month in a year range.

    Args:
        prices: price data accepted by `deviation_of`.
        start_year, end_year: first and last year, inclusive (anything `int()` accepts).

    Returns:
        A DataFrame indexed by day of month (1..31, index name 'day') with one
        column per month titled '<month>-<year>', holding that month's
        `deviation_norm` values (NaN for days without a value).
    '''
    start_year = int(start_year); end_year = int(end_year)
    days = pd.Index(range(1, 32), name='day')

    # Gather all columns first and build the frame once: concatenating inside
    # the loop copies the growing frame on every iteration. The title is built
    # from the loop variables rather than read back from `attrs`.
    columns = {}
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            columns[f'{month}-{year}'] = deviation_of(prices, month, year).deviation_norm

    # Passing `index` aligns every month on days 1..31 and keeps the index name.
    return pd.DataFrame(columns, index=days)

In [455]:
import numpy as np

# Remembers the `square_error` flag of the most recent add_statistics() call.
SQUARE_ERROR = False

def add_statistics(devs: pd.DataFrame, square_error=SQUARE_ERROR):
    '''Append per-row statistics columns to `devs` and return it.

    The statistics are computed across all columns present when the function
    is called, and `devs` itself is modified. Added columns: 'mean',
    'variance', 'std_dev' (population, ddof=0), plus hand-rolled variants
    'my_mean', 'sum_dev_sqr', 'my_variance' and 'my_std_dev'.

    Args:
        devs: frame whose columns are the samples; NaNs are ignored.
        square_error: when true the deviations from the row mean are squared
            before being summed; otherwise they are summed as they are. The
            default is the value SQUARE_ERROR had when this function was
            defined.

    Side effects:
        Stores `square_error` in the module-level SQUARE_ERROR.

    NOTE(review): with `square_error` false the signed deviations from the
    row mean are summed un-squared, so they cancel out to (about) zero -
    confirm this is the intended statistic.
    '''
    global SQUARE_ERROR
    SQUARE_ERROR = square_error  # remember the flag of the last call

    # Snapshot of the sample columns, taken before any statistics column is added.
    prices = devs.iloc[:, :len(devs.columns)]

    # Library-provided statistics
    devs['mean'] = prices.mean(axis=1)
    devs['variance'] = np.nanvar(prices, axis=1)
    devs['std_dev'] = prices.std(axis=1, ddof=0, numeric_only=True)

    # Hand-rolled equivalents, kept so the statistics can be tweaked
    devs['my_mean'] = prices.apply(lambda row: row.sum() / row.count(), axis=1)

    exponent = 2 if square_error else 1
    row_means = devs['mean']

    def deviation_sum(row):
        # Sum of (value - row mean) ** exponent over the non-missing values.
        return sum(row.dropna().apply(lambda x: (x - row_means[row.name]) ** exponent))

    devs['sum_dev_sqr'] = prices.apply(deviation_sum, axis=1, result_type='reduce')
    devs['my_variance'] = devs['sum_dev_sqr'] / prices.count(axis=1)
    devs['my_std_dev'] = np.sqrt(abs(devs['my_variance']))
    return devs

In [458]:
def plot(deviations: pd.DataFrame, symbol, start_year, end_year):
    '''Show scatter plots of the per-day mean deviation.

    Two figures are always shown: the 'mean' column alone, and the same with
    'std_dev' as error bars. A third one, using 'my_std_dev' as error bars,
    is shown only while the module-level SQUARE_ERROR flag is false.
    The index of `deviations` is renamed to 'day' as a side effect.
    '''
    deviations.index.name = 'day'  # the index name may have been lost upstream

    def show_scatter(title, error_bars=None):
        # One scatter of the 'mean' column against the index, optionally with error bars.
        figure = px.scatter(deviations, x=deviations.index, y=deviations['mean'],
                            error_y=error_bars, title=title)
        figure.show()

    show_scatter(f'{symbol} Mean price per day of month between {start_year}-{end_year}')
    show_scatter(f'{symbol} Mean price per day of month + std dev between {start_year}-{end_year}',
                 deviations.std_dev)
    if SQUARE_ERROR:
        return
    show_scatter(f'{symbol} Mean price per day of month + non mean squared error between {start_year}-{end_year}',
                 deviations.my_std_dev)


In [521]:

def full_run(symbol, start, end, add_non_squared_error=False):
    '''Run the whole pipeline for one symbol: fetch, compute deviations, add statistics, plot.

    Args:
        symbol: ticker symbol handed to `get_data`.
        start, end: first and last year of the period.
        add_non_squared_error: when true, `add_statistics` is called with
            `square_error` false (and vice versa).
    '''
    use_squared_error = not add_non_squared_error
    price_history = get_data(symbol, start, end)
    monthly_deviations = deviation_over_period(price_history, start, end)
    with_statistics = add_statistics(monthly_deviations, use_squared_error)
    plot(with_statistics, symbol, start, end)


In [533]:
# Run the complete pipeline for the '^GSPC' symbol over the years 2000-2021.
full_run('^GSPC', '2000', '2021')