In [450]:
import pandas as pd
import yfinance as yf
from yahoofinancials import YahooFinancials
import plotly.express as px
import plotly.io as pio
# Make "notebook_connected" the default plotly renderer used by every fig.show() call below.
# NOTE(review): presumably this renderer loads plotly.js from a CDN, so viewing the
# figures would need network access — confirm against the plotly renderer docs.
pio.renderers.default = "notebook_connected"


In [451]:
def download_data(symbol: str, start_year: str, end_year: str):
    """Download the price history of `symbol` for a span of whole calendar years.

    Parameters
    ----------
    symbol : str
        Ticker symbol passed straight to ``yf.download``.
    start_year, end_year : str or int
        First and last calendar year to fetch (both inclusive), e.g. ``'2000'``.

    Returns
    -------
    Whatever ``yf.download`` returns for the requested range.

    Raises
    ------
    ValueError
        If a year cannot be converted to an integer.
    """
    first_day = f'{int(start_year)}-01-01'
    # yfinance treats `end` as exclusive, so stopping at 'YYYY-12-31' silently
    # drops the last day of the year; ask for everything before the following
    # 1 January instead.
    day_after_last = f'{int(end_year) + 1}-01-01'
    return yf.download(
        symbol,
        start=first_day,
        end=day_after_last,
        progress=False,
    )

In [452]:
def get_close_prices_of(prices, month, year):
    """Return the 'Close' values of `prices` for one calendar month.

    The rows are selected by label with the string '<year>-<month>', so
    `prices` is expected to be indexed by dates.
    """
    period_label = f'{year}-{month}'
    rows_in_month = prices.loc[period_label]
    return rows_in_month['Close']

def deviation_of(data, month, year):
    """Build a per-day table of how far each close price sits from its monthly mean.

    The returned DataFrame holds the close prices of the requested month plus
    two extra columns:

    * ``deviation``      - close price minus the mean close of that month
    * ``deviation_norm`` - that difference divided by the monthly mean

    The frame is indexed by day of month (index name ``'day'``) and carries
    the requested ``month`` and ``year`` in its ``attrs``.
    """
    monthly = pd.DataFrame(get_close_prices_of(data, month, year))
    monthly_mean = monthly.mean()
    offset = monthly - monthly_mean

    monthly['deviation'] = offset
    monthly['deviation_norm'] = offset / monthly_mean

    # Remember which month this table describes.
    monthly.attrs.update(month=month, year=year)

    # Replace the full dates by the day of the month.
    monthly.index = monthly.index.map(lambda stamp: stamp.day)
    monthly.index.name = 'day'
    return monthly

In [454]:
# Create deviations DF over all years
def deviation_over_period(prices, start_year, end_year) -> pd.DataFrame:
    """Collect the normalised monthly deviations of every month in a range of years.

    Parameters
    ----------
    prices
        Price table handed to ``deviation_of`` for each month.
    start_year, end_year : str or int
        First and last year to include (both inclusive).

    Returns
    -------
    pd.DataFrame
        One row per day of month (index 1..31, named ``'day'``) and one column
        per month, titled ``'<month>-<year>'``, holding that month's
        ``deviation_norm`` values. Days that do not occur in a month are NaN.
    """
    start_year = int(start_year)
    end_year = int(end_year)

    # Start from an empty frame indexed 1..31 so the result always has a row
    # for every possible day of month.
    pieces = [pd.DataFrame(index=pd.Index(range(1, 32), name='day'))]
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            month_dev = deviation_of(prices, month, year).deviation_norm
            # Title the column from the loop variables rather than relying on
            # `attrs` surviving the column selection above.
            pieces.append(month_dev.rename(f'{month}-{year}'))

    # Concatenate once at the end: concatenating inside the loop re-copies the
    # whole accumulated frame for every month.
    return pd.concat(pieces, axis=1)

In [455]:
import numpy as np

# Mode used by the most recent add_statistics() call; plot() reads it to decide
# whether the extra "non squared error" figure should be drawn.
SQUARE_ERROR = False

def add_statistics(devs: pd.DataFrame, square_error=SQUARE_ERROR):
    """Append row-wise summary statistics to a table of deviations, in place.

    Every column present in `devs` on entry is treated as a data column; the
    statistics are computed across those columns for each row, ignoring NaN.

    Added columns
    -------------
    mean, variance, std_dev
        Mean, population variance and population standard deviation.
    my_mean
        Sum of the row divided by its number of non-NaN values.
    sum_dev_sqr
        Sum over the row of ``|x - mean| ** power`` with ``power`` 2 when
        `square_error` is true, otherwise 1.
    my_variance
        ``sum_dev_sqr`` divided by the number of non-NaN values, i.e. the
        variance (squared mode) or the mean absolute deviation (linear mode).
    my_std_dev
        Square root of ``my_variance`` in squared mode; in linear mode the
        mean absolute deviation itself, which is already in data units.

    Parameters
    ----------
    devs : pd.DataFrame
        Table to extend. It is modified in place and also returned.
    square_error : bool
        Selects squared (True) or absolute (False) deviations. The default is
        the value SQUARE_ERROR had when this function was defined; it does not
        follow later calls.

    Returns
    -------
    pd.DataFrame
        The same `devs` object, with the columns above added.
    """
    global SQUARE_ERROR
    # Record the mode of this call so plot() knows which figures make sense.
    SQUARE_ERROR = square_error

    # Snapshot the data columns before any statistic column is appended.
    prices = devs.iloc[:, :len(devs.columns)]
    observed = prices.count(axis=1)

    devs['mean'] = prices.mean(axis=1)
    devs['variance'] = np.nanvar(prices, axis=1)
    devs['std_dev'] = prices.std(axis=1, ddof=0, numeric_only=True)

    # Hand-rolled versions of the statistics, kept so they can be tweaked.
    devs['my_mean'] = prices.sum(axis=1) / observed

    power = 2 if square_error else 1
    # Use the absolute deviation: signed deviations from the mean always sum
    # to (numerically) zero, which made the linear mode meaningless. For
    # power 2 the result is unchanged.
    abs_dev = prices.sub(devs['mean'], axis=0).abs()
    devs['sum_dev_sqr'] = (abs_dev ** power).sum(axis=1)
    devs['my_variance'] = devs['sum_dev_sqr'] / observed

    if square_error:
        devs['my_std_dev'] = np.sqrt(devs['my_variance'])
    else:
        # The mean absolute deviation needs no root to be in data units.
        devs['my_std_dev'] = devs['my_variance']
    return devs

In [456]:
def plot(deviations: pd.DataFrame, symbol, start_year, end_year):
    """Show scatter plots of the per-day ``mean`` column of `deviations`.

    Up to three figures are shown, all with the day index on the x axis:

    1. the plain mean,
    2. the mean with ``std_dev`` as error bars,
    3. the mean with ``my_std_dev`` as error bars - only when the module-level
       SQUARE_ERROR flag is false.

    `symbol`, `start_year` and `end_year` are only used in the figure titles.
    The index of `deviations` is renamed to ``'day'`` as a side effect.
    """
    deviations.index.name = 'day'  # make sure the index carries its name after the concat

    def show_scatter(title, error_y=None):
        # All figures share the same x/y data; only title and error bars vary.
        figure = px.scatter(
            deviations,
            x=deviations.index,
            y=deviations['mean'],
            error_y=error_y,
            title=title,
        )
        figure.show()

    show_scatter(f'{symbol} Mean price per day of month between {start_year}-{end_year}')
    show_scatter(
        f'{symbol} Mean price per day of month + std dev between {start_year}-{end_year}',
        error_y=deviations.std_dev,
    )

    if SQUARE_ERROR:
        return
    show_scatter(
        f'{symbol} Mean price per day of month + non mean squared error between {start_year}-{end_year}',
        error_y=deviations.my_std_dev,
    )


In [457]:
# Module-level dictionary; nothing in this part of the notebook reads or writes it.
downloads = {}
def full_run(symbol, start, end, add_non_squared_error=False):
    """Run the whole pipeline for one ticker: download, compute, plot.

    `symbol` is fetched for the years `start`..`end`, the per-month deviations
    are assembled and extended with statistics, and the figures are shown.
    Statistics use squared errors unless `add_non_squared_error` is true.
    Nothing is returned.
    """
    use_square_error = not add_non_squared_error
    raw_prices = download_data(symbol, start, end)
    monthly_devs = deviation_over_period(raw_prices, start, end)
    with_stats = add_statistics(monthly_devs, use_square_error)
    plot(with_stats, symbol, start, end)

# Run the complete pipeline once per ticker / year range of interest.
full_run(symbol='^GSPC', start='2000', end='2005')
full_run(symbol='WIX', start='2018', end='2020')