In [360]:
import pandas as pd
import yfinance as yf
from yahoofinancials import YahooFinancials
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"


# ARGUMENTS
SYMBOL = '^GSPC'
START_YEAR = '2000'
END_YEAR = '2021'

In [361]:
df = yf.download(SYMBOL, 
                      start= START_YEAR + '-01-01', 
                      end=END_YEAR + '-12-31', 
                      progress=False,
)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-12-31,1464.469971,1472.420044,1458.189941,1469.25,1469.25,374050000
2000-01-03,1469.25,1478.0,1438.359985,1455.219971,1455.219971,931800000
2000-01-04,1455.219971,1455.219971,1397.430054,1399.420044,1399.420044,1009000000
2000-01-05,1399.420044,1413.27002,1377.680054,1402.109985,1402.109985,1085500000
2000-01-06,1402.109985,1411.900024,1392.099976,1403.449951,1403.449951,1092300000


In [362]:
def get_prices_of(month, year):
    return df.loc[str(year)+'-'+str(month)]['Close'] 

In [363]:
def deviation_of(month, year):
    '''  Return a new dataframe with the distance between the daily price 
         to the monthly mean ('deviation') and normaled value of it'''
    prices = get_prices_of(month, year)
    prices = pd.DataFrame(prices)
    mean = prices.mean()
    dist = (prices - mean)
    prices['deviation'] = dist
    prices['deviation_norm'] = dist/mean
    prices.attrs['month'] = month
    prices.attrs['year'] = year
    prices.index = prices.index.map(lambda date: date.day)
    prices.index.name = 'day'
    return prices

In [366]:
prices = deviation_of(1, 2000)
fig = px.scatter(x=prices.index, y=prices['deviation_norm'], title='Deviation of close prices per day of Jan 2000', labels={'x': 'day', 'y':'deviation'})
fig.show()

In [370]:
# Create deviations DF over all years
def deviation_over_period(start_year=int(START_YEAR), end_year=int(END_YEAR)) -> pd.DataFrame:
    dev_df = pd.DataFrame(index=pd.Index(range(1,32),name='day'))
    for year in range(start_year, end_year+1):
        for month in range(1,13):
            month_dev = deviation_of(month, year).deviation_norm
            title = str(month_dev.attrs['month']) + '-' + str(month_dev.attrs['year'])
            month_dev.name = title
            month_dev = pd.DataFrame(month_dev)
            dev_df = pd.concat([dev_df, month_dev], axis=1)    
    return dev_df

In [371]:
import numpy as np
devs = deviation_over_period()
prices_len = len(devs.columns)

# FOR DEBUGGING comment out
# prices_len = 3## for debugging len(devs.columns)
# devs = devs.iloc[:,:prices_len] # for debugging on smaller df

# Add statistics
prices = devs.iloc[:,:prices_len]
devs['mean'] = prices.mean(axis=1)
devs['variance'] = np.nanvar(prices, axis=1)
devs['std_dev'] = prices.std(axis=1, ddof=0, numeric_only=True)

# This section is to be able to manipulate a bit the statistics
devs['my_mean'] = prices.apply(axis=1, 
                               func=lambda row: row.sum() / row.count())
devs['sum_dev_sqr'] = prices.apply(
    lambda row, means:
        sum(row.dropna().apply(lambda x: (x - means[row.name])**2)),
    axis=1, result_type='reduce', means=devs['mean'])
devs['my_variance'] = devs['sum_dev_sqr'] / prices.count(axis=1)
devs['my_std_dev'] = np.sqrt(abs(devs['variance']))
devs

Unnamed: 0_level_0,1-2000,2-2000,3-2000,4-2000,5-2000,6-2000,7-2000,8-2000,9-2000,10-2000,...,11-2021,12-2021,mean,variance,std_dev,my_mean,sum_dev_sqr,my_variance,my_std_dev,std_dev_from_variance
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,0.014692,-0.043699,,0.035087,-0.008994,,-0.031881,0.035909,,...,-0.011509,-0.033697,-0.003119,0.000844,0.02905,-0.003119,0.142619,0.000844,0.02905,0.02905
2,,0.014577,-0.041917,,0.019606,0.010466,,-0.031477,,0.033152,...,-0.007871,-0.019981,-0.00079,0.000909,0.030154,-0.00079,0.162759,0.000909,0.030154,0.030154
3,0.020788,0.025989,-0.022911,0.03053,-0.002383,,-0.002351,-0.022147,,0.026124,...,-0.001461,-0.028261,0.00022,0.000888,0.029807,0.00022,0.159921,0.000888,0.029807,0.029807
4,-0.018354,0.025557,,0.022838,-0.006281,,,-0.015166,,0.031778,...,0.002715,,-0.000228,0.000759,0.027553,-0.000228,0.12906,0.000759,0.027553,0.027553
5,-0.016467,,,0.017802,0.009976,0.003879,-0.018175,,0.026583,0.033188,...,0.006458,,0.001239,0.000652,0.025536,0.001239,0.118676,0.000652,0.025536,0.025536
6,-0.015527,,-0.035316,0.027361,,-0.002818,-0.011088,,0.016482,0.013557,...,,-0.016861,-6.7e-05,0.000569,0.023846,-6.7e-05,0.104629,0.000569,0.023846,0.023846
7,0.011142,0.025463,-0.060041,0.037633,,0.00643,0.004004,-0.004132,0.023471,,...,,0.003496,-0.001602,0.000361,0.018988,-0.001602,0.067063,0.000361,0.018988,0.018988
8,,0.038049,-0.052359,,0.004012,-0.000198,,-0.001789,0.018014,,...,0.007352,0.006593,-0.000144,0.000367,0.01915,-0.000144,0.068943,0.000367,0.01915,0.01915
9,,0.016442,-0.028098,,-0.004469,-0.003426,,-0.008474,,0.00855,...,0.003827,-0.000636,-0.00201,0.000374,0.019341,-0.00201,0.069949,0.000374,0.019341,0.019341
10,0.022457,0.020128,-0.032688,0.029496,-0.024977,,0.001777,-0.01697,,-0.002247,...,-0.00443,0.008907,-0.001441,0.000337,0.018355,-0.001441,0.062663,0.000337,0.018355,0.018355


In [380]:
devs.index.name = 'day' #  the concat prob destroyed the name of the index
fig = px.scatter(devs, x=devs.index, y=devs['mean'], title='Mean price per day of month')
fig.show()
fig = px.scatter(devs, x=devs.index, y=devs['mean'], error_y=devs.std_dev, title='Mean price per day of month + std dev')
fig.show()
