In [1]:
import pandas as pd
from pandas_datareader import data as pdr
import yfinance as yf
#patch pandas_datareader so that pdr.get_data_yahoo() downloads through yfinance
yf.pdr_override()

In [2]:
import scipy.stats as sst
import numpy as np

In [3]:
#cumulative distribution function (CDF), returns a probability
#probability of everything to the left of 1.2 in a standard normal distribution
#to get everything to the right, compute (1 - cdf)
sst.norm.cdf(1.2)

0.8849303297782918

In [4]:
#percent point function (PPF), returns the critical value
#returns the outcome for a given cumulative probability (inverse of cdf)
#0.975 leaves 2.5% in the upper tail, i.e. the two-sided 5% critical value
sst.norm.ppf(0.975)

1.959963984540054

In [5]:
#left-tail probability at 1.2 under Student's t distribution with 10 degrees of freedom
sst.t.cdf(1.2, df=10)

0.8711018496378471

In [6]:
# Tickers to compare; each one becomes a column of adjusted closing prices.
tickers = ['MSFT', 'AAPL']
sec_data = pd.DataFrame()

for ticker in tickers:
    # Download the price history from 1997 onward and keep only 'Adj Close'.
    history = pdr.get_data_yahoo(ticker, start='1997-1-1')
    sec_data[ticker] = history['Adj Close']

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [7]:
#preview the first five rows to check that both price columns loaded
sec_data.head()

Unnamed: 0_level_0,MSFT,AAPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-02,6.365713,0.159585
1997-01-03,6.599675,0.165284
1997-01-06,6.580177,0.135837
1997-01-07,6.628918,0.132987
1997-01-08,6.502192,0.133937


In [8]:
# Simple daily return: today's price divided by the previous row's price, minus one.
# The first row has no previous price, so its return is NaN.
prev_close = sec_data.shift(1)
sec_returns = sec_data.div(prev_close).sub(1)
sec_returns

Unnamed: 0_level_0,MSFT,AAPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-02,,
1997-01-03,0.036753,0.035712
1997-01-06,-0.002954,-0.178160
1997-01-07,0.007407,-0.020978
1997-01-08,-0.019117,0.007142
...,...,...
2023-02-17,-0.015602,-0.007547
2023-02-21,-0.020887,-0.026680
2023-02-22,-0.004591,0.002896
2023-02-23,0.012962,0.003291


In [9]:
#drop rows with a missing return (the first date has no previous price to compare with)
sec_returns=sec_returns.dropna()
sec_returns

Unnamed: 0_level_0,MSFT,AAPL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1997-01-03,0.036753,0.035712
1997-01-06,-0.002954,-0.178160
1997-01-07,0.007407,-0.020978
1997-01-08,-0.019117,0.007142
1997-01-09,-0.011994,0.007092
...,...,...
2023-02-17,-0.015602,-0.007547
2023-02-21,-0.020887,-0.026680
2023-02-22,-0.004591,0.002896
2023-02-23,0.012962,0.003291


In [10]:
#average daily return for each ticker
sec_returns.mean()

MSFT    0.000754
AAPL    0.001413
dtype: float64

In [11]:
#sample standard deviation of daily returns for each ticker
sec_returns.std()

MSFT    0.019859
AAPL    0.027066
dtype: float64

In [12]:
#Null hypothesis: MSFT's average daily return is 0
#for large samples the t-test and z-test give nearly the same result; if the sample is < 30, the t-test is better
sst.ttest_1samp(a=sec_returns['MSFT'], popmean=0)

Ttest_1sampResult(statistic=3.080308605597323, pvalue=0.0020763961470224653)

5% significance level (95% confidence), two-sided CV=1.96
test statistic = 3.08
test statistic > CV, so reject the null hypothesis that Microsoft's average daily return is 0.
if the p-value is less than 5%, also reject the null hypothesis

In [13]:
#Null hypothesis: AAPL's average daily return equals MSFT's average daily return
#the benchmark is computed from the data rather than hard-coded as 0.000754,
#so it stays in sync when the downloaded price history changes
#NOTE(review): this treats MSFT's sample mean as a known constant
sst.ttest_1samp(a=sec_returns['AAPL'], popmean=sec_returns['MSFT'].mean())

Ttest_1sampResult(statistic=1.9740440383834499, pvalue=0.04841850246328851)

In [14]:
#Null hypothesis: MSFT's average return is the same as AAPL's average return
#equal_var=False: do not assume the two return series have equal variances (Welch's t-test)
sst.ttest_ind(a=sec_returns['MSFT'],b=sec_returns['AAPL'],equal_var=False)

Ttest_indResult(statistic=-1.5912539880189944, pvalue=0.11157861310464375)

In [15]:
#paired (related-samples) t-test: compares the MSFT and AAPL returns observation by observation
sst.ttest_rel(a=sec_returns['MSFT'],b=sec_returns['AAPL'])

Ttest_relResult(statistic=-2.094420189514634, pvalue=0.03626072586912278)

In [16]:
#testing a single variance
hyp_std=0.02 #hypothesized std under the null; its square is the denominator of the chi-squared statistic

In [17]:
#degrees of freedom: number of MSFT return observations minus one
df_MSFT=sec_returns['MSFT'].count()-1

In [18]:
#sample standard deviation of MSFT daily returns
MSFT_std=sec_returns['MSFT'].std()

In [19]:
#chi-squared test statistic: (n - 1) * sample variance / hypothesized variance
chi_squared_stat=df_MSFT*MSFT_std**2/hyp_std**2
chi_squared_stat

6486.845304224805

In [20]:
#95th percentile of the chi-squared distribution with df_MSFT degrees of freedom,
#i.e. the critical value for a right-tailed test at the 5% level
critical_value=sst.chi2.ppf(q=0.95, df=df_MSFT)
critical_value

6768.808573865782

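chi_squared_stat < critical_value, so we fail to reject the null hypothesis at the 5% level
There is no evidence that MSFT variance is greater than 0.02^2 (this is weaker than showing it is less than 0.02^2).

What is the null we're "accepting"? The critical value is the upper 5% point of the chi-squared distribution, so this is a right-tailed test: H0: variance <= 0.02^2 against H1: variance > 0.02^2.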

In [21]:
# F statistic: put the larger sample variance over the smaller one, so the ratio is >= 1.
var_msft = sec_returns['MSFT'].var()
var_aapl = sec_returns['AAPL'].var()
F_stat = max(var_msft/var_aapl, var_aapl/var_msft)
F_stat

1.8574819380088639

In [22]:
#95th percentile of the F distribution, used as the critical value for F_stat
#both columns have the same count after dropna(), so the dfn/dfd order does not change the value here
#NOTE(review): F_stat is the larger of the two variance ratios, which makes the comparison two-sided;
#confirm whether the 0.975 quantile was intended for a 5% test
CV = sst.f.ppf(0.95, dfn=sec_returns['MSFT'].count()-1, dfd=sec_returns['AAPL'].count()-1)
CV

1.0413948369714034

In [23]:
#Pearson correlation test
#null hypothesis: correlation=0

In [24]:
sst.pearsonr(sec_returns['MSFT'],sec_returns['AAPL'])
#(correlation coefficient (-1 to 1), p-value from the test of zero correlation)

PearsonRResult(statistic=0.44319205697837205, pvalue=9.732206375e-315)

In [25]:
#F-test: CV < F-stat, reject the null hypothesis of equal variances
#Pearson test: p-value is smaller than 5%, reject the null hypothesis; the correlation is not 0

In [26]:
#rank correlation between the two columns of sec_returns (MSFT and AAPL)
sst.spearmanr(sec_returns)

SpearmanrResult(correlation=0.48068938501799785, pvalue=0.0)

In [27]:
#Spearman works on the ranks of the observations rather than their numerical values, so it is much less influenced by outliers.
#Use it to check whether the results are driven by crisis periods or outliers