In [1]:
import yfinance as yf
import pandas as pd

# 1. Define the tickers and date range from the paper
tickers = ['AAPL', 'MSFT', 'GOOG', 'AMZN']
start_date = '2018-01-01'
end_date = '2023-12-31' # yfinance incaludes the end date

# 2. Download the data
# This downloads all tickers at once into a multi-index DataFrame
try:
    data = yf.download(tickers, start=start_date, end=end_date)
    
    if data.empty:
        raise Exception("No data downloaded. Check tickers, date range, and internet connection.")

    print("Data downloaded successfully. Inspecting columns...")
    # print(data.columns) # Uncomment this for deep debugging

    # 3. We only care about the 'Adj Close' price
    # 'Adj Close' is better than 'Close' as it accounts 
    # for dividends and stock splits.
    
    # When downloading multiple tickers, yfinance returns a MultiIndex
    # The top level is the measure ('Adj Close', 'Close', etc.)
    # The KeyError means 'Adj Close' was not found as a top-level column.
    
    # Check if 'Adj Close' is in the top level of the columns
    if 'Adj Close' not in data.columns.get_level_values(0):
        print("Warning: 'Adj Close' not found in downloaded data columns.")
        print("Available top-level columns:", data.columns.get_level_values(0).unique())
        
        # As a fallback, try to use 'Close'
        if 'Close' in data.columns.get_level_values(0):
            print("Falling back to using 'Close' price.")
            df_close = data['Close']
        else:
            # If neither is present, we can't proceed
            raise Exception("'Adj Close' and 'Close' not found. Cannot proceed.")
    else:
        # This is the normal, expected path
        df_close = data['Adj Close']
    
    # 4. Save to a CSV to use in your project
    df_close.to_csv('stock_prices.csv')
    
    print("\nData processed and saved successfully!")
    print(df_close.head())

except Exception as e:
    print(f"An error occurred: {e}")

  data = yf.download(tickers, start=start_date, end=end_date)
[*********************100%***********************]  4 of 4 completed

Data downloaded successfully. Inspecting columns...
Available top-level columns: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Falling back to using 'Close' price.

Data processed and saved successfully!
Ticker           AAPL       AMZN       GOOG       MSFT
Date                                                  
2018-01-02  40.341881  59.450500  52.888073  79.198318
2018-01-03  40.334858  60.209999  53.756134  79.566902
2018-01-04  40.522217  60.479500  53.950798  80.267242
2018-01-05  40.983578  61.457001  54.736919  81.262375
2018-01-08  40.831356  62.343498  54.970818  81.345314





In [2]:
#Calculate finite difference

df_diff = df_close - df_close.shift(1)

df_diff = df_diff.dropna().reset_index(drop=True)

print(df_diff.head())

Ticker      AAPL      AMZN      GOOG      MSFT
0      -0.007023  0.759499  0.868061  0.368584
1       0.187359  0.269501  0.194664  0.700340
2       0.461361  0.977501  0.786121  0.995132
3      -0.152222  0.886497  0.233898  0.082939
4      -0.004688  0.291500 -0.033772 -0.055275


In [3]:
# Perform moving average

# TODO: "Moving Average" over each week (window = 5), stride length = 1 day
WINDOW_MA = 5
df_diff_smoothed = df_diff.rolling(window=WINDOW_MA).mean().dropna()
print(f"\nApplied {WINDOW_MA}-day Moving Average for smoothing.")

print(df_diff_smoothed.head())

# Save to CSV
df_diff_smoothed.to_csv('smoothed_stock_differences.csv')


Applied 5-day Moving Average for smoothing.
Ticker      AAPL    AMZN      GOOG      MSFT
4       0.096957  0.6369  0.409795  0.418344
5       0.096486  0.5013  0.199932  0.270903
6       0.105386  0.6709  0.189901  0.178755
7       0.097893  0.7606  0.198939  0.259845
8       0.086182  0.5799  0.147193  0.012900


In [7]:
# convert to binary (which is what we will be using as our input) and check if it is predictive with ACF/PACF

df_diff_binary = (df_diff_smoothed > 0).astype(int)

print(df_diff_binary.head(15))

# Check how many are 1 vs. 0
for col in df_diff_binary.columns:
    print(df_diff_binary[col].value_counts(normalize=True))

df_diff_binary.to_csv('binary_stock_differences.csv', index=False)

Ticker  AAPL  AMZN  GOOG  MSFT
4          1     1     1     1
5          1     1     1     1
6          1     1     1     1
7          1     1     1     1
8          1     1     1     1
9          1     1     1     1
10         1     1     1     1
11         1     1     1     1
12         0     1     1     1
13         1     1     1     1
14         0     1     1     1
15         0     1     1     1
16         0     1     1     1
17         0     1     1     1
18         0     1     0     1
AAPL
1    0.59375
0    0.40625
Name: proportion, dtype: float64
AMZN
1    0.550532
0    0.449468
Name: proportion, dtype: float64
GOOG
1    0.573138
0    0.426862
Name: proportion, dtype: float64
MSFT
1    0.598404
0    0.401596
Name: proportion, dtype: float64
