In [1]:
import yfinance as yf
import pandas as pd

# 1. Define the tickers and date range from the paper
tickers = ['AAPL', 'MSFT', 'GOOG', 'AMZN']
start_date = '2018-01-01'
end_date = '2023-12-31' # yfinance includes the end date

# 2. Download the data
# This downloads all tickers at once into a multi-index DataFrame
try:
    data = yf.download(tickers, start=start_date, end=end_date)
    
    if data.empty:
        raise Exception("No data downloaded. Check tickers, date range, and internet connection.")

    print("Data downloaded successfully. Inspecting columns...")
    # print(data.columns) # Uncomment this for deep debugging

    # 3. We only care about the 'Adj Close' price
    # 'Adj Close' is better than 'Close' as it accounts 
    # for dividends and stock splits.
    
    # When downloading multiple tickers, yfinance returns a MultiIndex
    # The top level is the measure ('Adj Close', 'Close', etc.)
    # The KeyError means 'Adj Close' was not found as a top-level column.
    
    # Check if 'Adj Close' is in the top level of the columns
    if 'Adj Close' not in data.columns.get_level_values(0):
        print("Warning: 'Adj Close' not found in downloaded data columns.")
        print("Available top-level columns:", data.columns.get_level_values(0).unique())
        
        # As a fallback, try to use 'Close'
        if 'Close' in data.columns.get_level_values(0):
            print("Falling back to using 'Close' price.")
            adj_close_data = data['Close']
        else:
            # If neither is present, we can't proceed
            raise Exception("'Adj Close' and 'Close' not found. Cannot proceed.")
    else:
        # This is the normal, expected path
        adj_close_data = data['Adj Close']
    
    # 4. Save to a CSV to use in your project
    adj_close_data.to_csv('stock_prices.csv')
    
    print("\nData processed and saved successfully!")
    print(adj_close_data.head())

except Exception as e:
    print(f"An error occurred: {e}")

  data = yf.download(tickers, start=start_date, end=end_date)
[*********************100%***********************]  4 of 4 completed

Data downloaded successfully. Inspecting columns...
Available top-level columns: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Falling back to using 'Close' price.

Data processed and saved successfully!
Ticker           AAPL       AMZN       GOOG       MSFT
Date                                                  
2018-01-02  40.380985  59.450500  52.888073  79.198334
2018-01-03  40.373959  60.209999  53.756138  79.566917
2018-01-04  40.561497  60.479500  53.950798  80.267220
2018-01-05  41.023315  61.457001  54.736919  81.262390
2018-01-08  40.870930  62.343498  54.970818  81.345306





In [None]:
import numpy as np

# Compute log10(a_n / a_{n-1}) for all columns except the first
import numpy as np

# Compute ratios and take log10; this updates only the selected columns
df_returns = adj_close_data - adj_close_data.shift(1)

# Drop the first row which will be NaN due to the shift and reset the index
df_returns = df_returns.dropna().reset_index(drop=True)

# TODO: "Moving Average" over weeks, stride length = 1 day
WINDOW_MA = 1
df_returns_smoothed = df_returns.rolling(window=WINDOW_MA, min_periods=1).mean().dropna()
print(f"\nApplied {WINDOW_MA}-day Moving Average for smoothing.")

# Show the first few transformed rows
# From QCNN_encoding.py:
lowp, highp = np.percentile(df_returns, clip_percentile=(0.5, 99.5))
df_returns = np.clip(df_returns, lowp, highp)
print(df_returns.head())

Ticker      AAPL      AMZN      GOOG      MSFT
0      -0.007027  0.759499  0.868065  0.368584
1       0.187538  0.269501  0.194660  0.700302
2       0.461819  0.977501  0.786121  0.995171
3      -0.152386  0.886497  0.233898  0.082916
4      -0.004681  0.291500 -0.033764 -0.055275


In [None]:
#Save squashed log returns

df_returns.to_csv('finite_difference_returns.csv', index=False)