<a href="https://colab.research.google.com/github/SamanvayMS/FIN-554-Algo-trading-finalproject/blob/main/VWAP_hypothesis_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install yfinance

In [None]:
pip install update pandas_datareader

In [92]:
import pandas as pd
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [93]:
yf.pdr_override()

In [94]:
start='2000-01-01'
end='2010-01-01'
df = yf.download('SPY',start,end)

[*********************100%***********************]  1 of 1 completed


In [95]:
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [96]:
def data_prep_VWAP(symbol,lookback,start,end):
  # Calculate the cumulative sum of price times volume
  df = yf.download(symbol,start,end)
  df['PV'] = df['Close'] * df['Volume']
  df['CumulativePV'] = df['PV'].rolling(lookback).sum()

  # Calculate the cumulative sum of volume
  df['CumulativeVolume'] = df['Volume'].rolling(lookback).sum()

  # Calculate the rolling VWAP
  df['RollingVWAP'] = df['CumulativePV'] / df['CumulativeVolume']
  df['signal']=np.where(df['Open']>df['RollingVWAP'].shift(1),1,0)
  df['returns']=(df['Close']-df['Open'])/df['Open']
  df['shifted_returns']=df['returns'].shift(-1)
  df['direction'] = [1 if x>0 else 0 for x in df['returns']]
  df=df.dropna(axis=0)
  return df

In [97]:
# here we look at stcoks from all the sectors of the market
sectors = ['Tech','Healthcare','Financial','Consumer_D','Consumer_S','Industrial','Energy','Utility','Market_ETF']
Tech_stocks=['AAPL','MSFT','NVDA','ADBE','CRM','GOOGL','AMZN','IBM','INTC','CSCO']
Healthcare_stocks=['JNJ','PFE','MRK','GILD','AMGN','BMY','LLY','REGN','UNH']
Financial_stocks=["JPM", "BAC", "WFC", "C", "GS", "MS", "V", "MA", "AXP", "SCHW"]
Consumer_D_stocks = ["HD", "MCD", "NKE", "SBUX", "DIS", "CMCSA", "LOW", "GM", "F", "BKNG"] # Consumer discretionary
Consumer_S_stocks= ["PG", "KO", "PEP", "WMT", "CL", "KMB", "CLX", "MDLZ", "EL"] #Consumer Staples
Industrial_stocks = ["BA", "GE", "MMM", "HON", "CAT", "RTX", "LMT", "GD", "UNP", "FDX"]
Energy_stocks = ["XOM", "CVX", "BP", "COP", "EOG", "OXY", "SLB", "KMI", "WMB"]
Utility_stocks = ["NEE", "DUK", "D", "SO", "EXC", "AEP", "SRE", "XEL", "WEC", "AWK"]
Market_ETF_stocks = ['SPY','QQQ']

### Asset selection for tests
Selected a basket of stocks from each sector for running and analysing the stocks and their sector wise dependence.
pros- can identify the sector wise dependence of stocks on indicators 
cons - 
survivorship bias as some stocks might delisted or just added 
overfitting bias as we are cherry picking stocks based on p value 


In [98]:
lookbacks=[9,20,50,100,200]

In [99]:
start='2000-01-01'
end='2015-01-01'

# Hypothesis Tests 


chi_squared test

In [100]:
from scipy.stats import chi2_contingency

H0 :- null hypothesis assumes no significant relationship between opening price and the previous rolling VWAP value

H1 :- significant relationship with the previous VWAP value bullish when above VWAP and bearish when below VWAP

In [101]:
def chi_2(Symbol,lookback,start,end,print_results=False):
  df = data_prep_VWAP(Symbol,lookback,start=start,end=end)
  observed=pd.crosstab(df['direction'],df['signal'])
  result = chi2_contingency(observed)
  if print_results:
    print(f"p value for {Symbol} is {result[1]}")
    if result[1]>0.01:
      print("null hypothesis cannot be rejected")
    else:
      print("null hypothesis is rejected")
  return result[1]

In [None]:
chi_2_hypothesis_rejected=dict.fromkeys(sectors)
for sector in sectors:
  chi_2_hypothesis_rejected[sector]={}
  for stock in eval(sector+'_stocks'):
    chi_2_hypothesis_rejected[sector][stock]={}
    for lookback in lookbacks:
      p_value = chi_2(stock,lookback,start,end)
      if p_value<0.01:
        chi_2_hypothesis_rejected[sector][stock][lookback]=p_value


The stocks from each sector and the p values for the lookback period that was significant


In [103]:
chi_2_hypothesis_rejected

{'Tech': {'AAPL': {},
  'MSFT': {},
  'NVDA': {},
  'ADBE': {},
  'CRM': {},
  'GOOGL': {},
  'AMZN': {},
  'IBM': {20: 0.00022509312052244968, 50: 0.003404876069126242},
  'INTC': {20: 0.00195397880491183},
  'CSCO': {}},
 'Healthcare': {'JNJ': {},
  'PFE': {},
  'MRK': {},
  'GILD': {9: 0.009967699673768692},
  'AMGN': {9: 0.007729930807287963},
  'BMY': {},
  'LLY': {},
  'REGN': {},
  'UNH': {}},
 'Financial': {'JPM': {},
  'BAC': {50: 0.0004545738831357769,
   100: 0.0006559967461908341,
   200: 0.000354632984565625},
  'WFC': {9: 1.1413091044536635e-05, 20: 0.0002469740678344058},
  'C': {},
  'GS': {},
  'MS': {},
  'V': {},
  'MA': {},
  'AXP': {},
  'SCHW': {}},
 'Consumer_D': {'HD': {},
  'MCD': {},
  'NKE': {},
  'SBUX': {9: 0.0031921507465243015},
  'DIS': {},
  'CMCSA': {},
  'LOW': {},
  'GM': {},
  'F': {},
  'BKNG': {100: 0.0078052955394651725, 200: 0.00534061471338179}},
 'Consumer_S': {'PG': {},
  'KO': {200: 0.0019015133954092792},
  'PEP': {},
  'WMT': {},
  'CL': {

Looking at this we dont see too many stocks having clear relationship and the relationship is only visible in certain assets so this hypothesis can be rejected on the whole

wilcox rank test

In [104]:
from scipy.stats import ranksums
from scipy.stats import shapiro
from scipy.stats import ttest_ind

H0:-There is no significant relationship between the returns and the price's position relative to the VWAP.

H1:-The returns are significantly positive when the price is above the VWAP and significantly negative when the price is below the VWAP.

In [105]:
def tests(Symbol,lookback,start,end,print_results=False):
  df = data_prep_VWAP(Symbol,lookback,start=start,end=end)
  above_vwap = df[df['Open'] > df['RollingVWAP'].shift(1)]['returns']
  below_vwap = df[df['Open'] < df['RollingVWAP'].shift(1)]['returns']

  #plt.hist(above_vwap,bins=30,color="green",alpha=0.5)
  #plt.hist(below_vwap,bins=30,color="red",alpha=0.5)
  #plt.show()
  _, p_value1 = shapiro(above_vwap)
  _, p_value2 = shapiro(below_vwap)
  if p_value1 > 0.01 and p_value2 > 0.01:
    if print_results:
      print("returns are normally distributed")
      print("we can run a t-test")
    stat, p_value = ttest_ind(above_vwap, below_vwap, equal_var=True)
  else:
    if print_results:
      print("returns are not normally distributed")
      print("we can run a Wilcox Rank Sum test")    
    stat, p_value = ranksums(above_vwap, below_vwap)
  if print_results:
    print(f"p value for {Symbol} is {p_value}")
    if p_value>0.01:
      print("null hypothesis cannot be rejected")
    else:
      print("null hypothesis is rejected")
  return p_value

In [None]:
WRT_hypothesis_rejected=dict.fromkeys(sectors)
for sector in sectors:
  WRT_hypothesis_rejected[sector]={}
  for stock in eval(sector+'_stocks'):
    WRT_hypothesis_rejected[sector][stock]={}
    for lookback in lookbacks:
      p_value = tests(stock,lookback,start,end)
      if p_value<0.01:
        WRT_hypothesis_rejected[sector][stock][lookback]=p_value

In [107]:
WRT_hypothesis_rejected

{'Tech': {'AAPL': {},
  'MSFT': {},
  'NVDA': {},
  'ADBE': {},
  'CRM': {},
  'GOOGL': {},
  'AMZN': {},
  'IBM': {20: 0.000627479671296681, 50: 0.001604488958014522},
  'INTC': {20: 0.0005059508507632124},
  'CSCO': {}},
 'Healthcare': {'JNJ': {},
  'PFE': {9: 0.005513933052505112},
  'MRK': {},
  'GILD': {9: 0.0041164552045779975},
  'AMGN': {9: 0.0011220472487111985},
  'BMY': {},
  'LLY': {},
  'REGN': {},
  'UNH': {}},
 'Financial': {'JPM': {},
  'BAC': {50: 0.00048498939807548643,
   100: 0.0016343801666512479,
   200: 0.0023914319763511317},
  'WFC': {9: 3.5535242307535772e-06, 20: 0.00018701641559694396},
  'C': {50: 0.0018418161026729648},
  'GS': {50: 0.009359279135939147},
  'MS': {},
  'V': {},
  'MA': {},
  'AXP': {},
  'SCHW': {}},
 'Consumer_D': {'HD': {},
  'MCD': {},
  'NKE': {},
  'SBUX': {9: 0.0010403601726963478},
  'DIS': {},
  'CMCSA': {},
  'LOW': {},
  'GM': {},
  'F': {50: 0.0002775309973352924,
   100: 0.00017668389625593537,
   200: 0.007896240391609532},
  

Looking at this we dont see too many stocks having clear relationship and the relationship is only visible in certain assets such as 'F', 'BP', 'BKNG' so this hypothesis can be rejected on the whole

# Taking moving averages


In [108]:
def data_prep_MA(symbol,lookback,start,end,smoothing_choice="s"):
  # Calculate the cumulative sum of price times volume
  df = yf.download(symbol,start,end)
  # Calculate the rolling MAs
  if smoothing_choice=='s':
    df['SMA']=df['Close'].rolling(window=lookback).mean()
  elif smoothing_choice=='e':
    df['EMA']=df['Close'].ewm(span=lookback, adjust=False).mean()
  df['returns']=(df['Close']-df['Open'])/df['Open']
  df=df.dropna(axis=0)
  return df

In [109]:
data_prep_MA('AAPL',lookback,start,end,'s')

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA,returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-03,0.936384,1.004464,0.907924,0.999442,0.850644,535796800,,0.067342
2000-01-04,0.966518,0.987723,0.903460,0.915179,0.778926,512377600,,-0.053117
2000-01-05,0.926339,0.987165,0.919643,0.928571,0.790324,778321600,,0.002410
2000-01-06,0.947545,0.955357,0.848214,0.848214,0.721931,767972800,,-0.104830
2000-01-07,0.861607,0.901786,0.852679,0.888393,0.756127,460734400,,0.031088
...,...,...,...,...,...,...,...,...
2014-12-24,28.145000,28.177500,28.002501,28.002501,25.167875,57918400,23.937804,-0.005063
2014-12-26,28.025000,28.629999,28.002501,28.497499,25.612764,134884000,23.985532,0.016860
2014-12-29,28.447500,28.692499,28.424999,28.477501,25.594790,110395600,24.034225,0.001055
2014-12-30,28.410000,28.480000,28.027500,28.129999,25.282467,119526000,24.080814,-0.009856


In [110]:
def tests_MA(symbol,lookback,start,end,smoothing_choice,print_results=False,plot_dist=False):
  if smoothing_choice=='s':
    column='SMA'
  elif smoothing_choice=='e':
    column='EMA'
  else:
    print("invalid MA choice")
  df = data_prep_MA(symbol,lookback,start,end,smoothing_choice)
  above_ma = df[df['Open'] > df[column].shift(1)]['returns']
  below_ma = df[df['Open'] < df[column].shift(1)]['returns']
  if plot_dist:
    plt.hist(above_ma,bins=30,color="green",alpha=0.5)
    plt.hist(below_ma,bins=30,color="red",alpha=0.5)
    plt.show()
  
  _, p_value1 = shapiro(above_ma)
  _, p_value2 = shapiro(below_ma)
  if p_value1 > 0.01 and p_value2 > 0.01:
    if print_results:
      print("returns are normally distributed")
      print("we can run a t-test")
    stat, p_value = ttest_ind(above_ma, below_ma, equal_var=True)
  else:
    if print_results:
      print("returns are not normally distributed")
      print("we can run a Wilcox Rank Sum test")    
    stat, p_value = ranksums(above_ma, below_ma)
  if print_results:
    print(f"p value for {symbol} is {p_value}")
    if p_value>0.01:
      print("null hypothesis cannot be rejected")
    else:
      print("null hypothesis is rejected")
  return p_value

In [None]:
EMA_hypothesis_rejected=dict.fromkeys(sectors)
for sector in sectors:
  EMA_hypothesis_rejected[sector]={}
  for stock in eval(sector+'_stocks'):
    EMA_hypothesis_rejected[sector][stock]={}
    for lookback in lookbacks:
      EMA_hypothesis_rejected[sector][stock][lookback]={}
      for ma_type in ['s','e']:
        p_value = tests(stock,lookback,start,end)
        if p_value<0.01:
          EMA_hypothesis_rejected[sector][stock][lookback][ma_type]=p_value