In [1]:
import yfinance as yf
import pandas as pd

# 1. Define the tickers and date range from the paper
tickers = ['AAPL', 'MSFT', 'GOOG', 'AMZN']
start_date = '2018-01-01'
# NOTE: yfinance treats `end` as exclusive, so the last session that can be
# returned is the trading day before this date.
end_date = '2023-12-31'

# 2. Download the data
# Requesting several tickers at once yields a DataFrame with MultiIndex
# columns: the price field ('Close', 'Open', ...) on the top level and the
# ticker symbol on the second level.
#
# auto_adjust is passed explicitly instead of relying on the library default.
# With auto_adjust=True there is no 'Adj Close' column; 'Close' itself is
# already adjusted for splits and dividends. The recorded run of this cell
# showed exactly that column layout, so this keeps the same data while making
# the behaviour independent of the installed yfinance version.
data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=True)

if data.empty:
    raise ValueError("No data downloaded. Check tickers, date range, and internet connection.")

print("Data downloaded successfully.")

# 3. Keep only the adjusted closing price, one column per ticker.
available_fields = data.columns.get_level_values(0).unique()
if 'Close' not in available_fields:
    # Fail loudly: later cells need `adj_close_data`, and continuing without
    # it would only surface as a confusing NameError further down.
    raise KeyError(f"'Close' not found in downloaded data. Available top-level columns: {list(available_fields)}")

adj_close_data = data['Close']

# A ticker whose download failed can come back as a column with no values;
# stop here rather than letting it wipe out every row in a later dropna().
empty_tickers = adj_close_data.columns[adj_close_data.isna().all()].tolist()
if empty_tickers:
    raise ValueError(f"No price data returned for: {empty_tickers}")

# 4. Save to a CSV to use in your project
adj_close_data.to_csv('stock_prices.csv')

print("\nData processed and saved successfully!")
print(adj_close_data.head())

  data = yf.download(tickers, start=start_date, end=end_date)
[*********************100%***********************]  4 of 4 completed

Data downloaded successfully. Inspecting columns...
Available top-level columns: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Falling back to using 'Close' price.

Data processed and saved successfully!
Ticker           AAPL       AMZN       GOOG       MSFT
Date                                                  
2018-01-02  40.381001  59.450500  52.888073  79.198349
2018-01-03  40.373962  60.209999  53.756134  79.566902
2018-01-04  40.561489  60.479500  53.950798  80.267212
2018-01-05  41.023300  61.457001  54.736919  81.262375
2018-01-08  40.870930  62.343498  54.970818  81.345314





In [2]:
import numpy as np

# Squashed daily log returns, computed for every ticker column at once:
#   1. ratio of each price to the previous row's price (shift by one row),
#   2. base-10 logarithm of that ratio,
#   3. drop rows containing NaN (the first row has no predecessor) and
#      replace the index with a fresh 0..n-1 range,
#   4. element-wise arctan as the "squashing" step.
log_returns = np.arctan(
    np.log10(adj_close_data.div(adj_close_data.shift(1)))
    .dropna()
    .reset_index(drop=True)
)

# Preview the first few transformed rows
print(log_returns.head())

Ticker      AAPL      AMZN      GOOG      MSFT
0      -0.000076  0.005513  0.007070  0.002016
1       0.002013  0.001940  0.001570  0.003806
2       0.004917  0.006963  0.006282  0.005351
3      -0.001616  0.006220  0.001852  0.000443
4      -0.000050  0.002026 -0.000267 -0.000295


In [3]:
# Persist the squashed log returns to disk; the row index is left out of the file.
log_returns.to_csv(path_or_buf='squashed_log_returns.csv', index=False)