In [8]:
import yfinance as yf
import pandas as pd

In [9]:
# Download S&P 500 ticker symbols
snp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
data = pd.read_html(snp500_url)
sp500_tickers = data[0]['Symbol'].str.replace('.', '-').tolist()

In [10]:
def cleanData(dataframe):
    df = dataframe.reset_index()
    # Format the data as per the specified format
    df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
    df = df.rename(columns={'Date':'timestamp', 'Open':'open', 'High':'high', 
                            'Low':'low','Close':'close', 'Volume':'volume'})
    
    df["timestamp"] = df["timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S.%f')
    # Round numeric columns to 2 decimal places
    numeric_cols = ["open", "high", "low", "close", "volume"]
    df[numeric_cols] = df[numeric_cols].round(2) 
    df['open'] = df['open'].astype(float)
    df['high'] = df['high'].astype(float)
    df['low'] = df['low'].astype(float)
    df['close'] = df['close'].astype(float)
    df['volume'] = df['volume'].astype(float)
    df = df.dropna()
    df = df.reindex(columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'], index=df.index[::-1])
    df.reset_index(drop=True, inplace=True)
    # df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')
    df.set_index(keys='timestamp', inplace=True)
    
    # df = df.reindex()
    # df = df.sort_values(by='timestamp', ascending=True)
    # df = df.reset_index(drop=True)
    # df = df.groupby('timestamp').last().reset_index()
    return df

In [11]:
# Create an empty dictionary to store ticker symbols and their volumes
volumes = {}

# Defining Start & End Date
start_date = "2009-01-01"
end_date = "2022-07-01"

In [12]:
ticker = 'AAPL'
data = yf.download(ticker, start=start_date, end=end_date, progress=False)     
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-02,3.067143,3.251429,3.041429,3.241071,2.747389,746015200
2009-01-05,3.327500,3.435000,3.311071,3.377857,2.863340,1181608400
2009-01-06,3.426786,3.470357,3.299643,3.322143,2.816113,1289310400
2009-01-07,3.278929,3.303571,3.223571,3.250357,2.755261,753048800
2009-01-08,3.229643,3.326786,3.215714,3.310714,2.806425,673500800
...,...,...,...,...,...,...
2022-06-24,139.899994,141.910004,139.770004,141.660004,140.444214,89116800
2022-06-27,142.699997,143.490005,140.970001,141.660004,140.444214,70207900
2022-06-28,142.130005,143.419998,137.320007,137.440002,136.260422,67083400
2022-06-29,137.460007,140.669998,136.669998,139.229996,138.035049,66242400


In [13]:
data = cleanData(data) 
data

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-06-30 00:00:00.000000,137.25,138.37,133.77,136.72,9.896450e+07
2022-06-29 00:00:00.000000,137.46,140.67,136.67,139.23,6.624240e+07
2022-06-28 00:00:00.000000,142.13,143.42,137.32,137.44,6.708340e+07
2022-06-27 00:00:00.000000,142.70,143.49,140.97,141.66,7.020790e+07
2022-06-24 00:00:00.000000,139.90,141.91,139.77,141.66,8.911680e+07
...,...,...,...,...,...
2009-01-08 00:00:00.000000,3.23,3.33,3.22,3.31,6.735008e+08
2009-01-07 00:00:00.000000,3.28,3.30,3.22,3.25,7.530488e+08
2009-01-06 00:00:00.000000,3.43,3.47,3.30,3.32,1.289310e+09
2009-01-05 00:00:00.000000,3.33,3.43,3.31,3.38,1.181608e+09


In [14]:
def get_mean_volume(ticker, start_date, end_date):
    try:
        data = yf.download(ticker, start=start_date, end=end_date, progress=False)     
        # Use mean volume for simplicity
        mean_volume = data['Volume'].mean()
        return mean_volume
    except Exception as e:
        print(f"Error downloading data for {ticker}: {e}")
        return None

In [15]:
for ticker in sp500_tickers:
    mean_volume = get_mean_volume(ticker, start_date, end_date)
    # print(f"Downloaded data for {ticker}")
    if mean_volume is not None:
        volumes[ticker] = mean_volume


1 Failed download:
['GEHC']: Exception("%ticker%: Data doesn't exist for startDate = 1230786000, endDate = 1656648000")

1 Failed download:
['KVUE']: Exception("%ticker%: Data doesn't exist for startDate = 1230786000, endDate = 1656648000")

1 Failed download:
['VLTO']: Exception("%ticker%: Data doesn't exist for startDate = 1230786000, endDate = 1656648000")


In [16]:
# Sort the dictionary by volume in descending order
sorted_volumes = sorted(volumes.items(), key=lambda x: x[1], reverse=True)
# Take the top 100 stocks
top_100_tickers = [ticker[0] for ticker in sorted_volumes[:100]]

In [17]:
# Download historical data for the top 100 stocks
for ticker in top_100_tickers:
    try:
        data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        data = cleanData(data)
        data.to_csv(f"/home/mhmunem/finapps/snp100-data-backtest/data/snp100/daily/{ticker}.csv")
        # Do something with the data, e.g., save it to a file or process it further
        print(f"Downloaded data for {ticker}")
    except Exception as e:
        print(f"Error downloading data for {ticker}: {e}")


Downloaded data for AAPL
Downloaded data for BAC
Downloaded data for AMZN
Downloaded data for TSLA
Downloaded data for GOOGL
Downloaded data for GOOG
Downloaded data for F
Downloaded data for AMD
Downloaded data for T
Downloaded data for CSCO
Downloaded data for C
Downloaded data for CMCSA
Downloaded data for AAL
Downloaded data for KO
Downloaded data for AMAT
Downloaded data for BSX
Downloaded data for CCL
Downloaded data for BMY
Downloaded data for MO
Downloaded data for SCHW
Downloaded data for ABT
Downloaded data for COP
Downloaded data for AIG
Downloaded data for CVX
Downloaded data for ABBV
Downloaded data for BA
Downloaded data for ABNB
Downloaded data for BK
Downloaded data for AXP
Downloaded data for CAT
Downloaded data for AES
Downloaded data for CARR
Downloaded data for CF
Downloaded data for AFL
Downloaded data for BKR
Downloaded data for BBY
Downloaded data for BAX
Downloaded data for APA
Downloaded data for CFG
Downloaded data for BBWI
Downloaded data for CTSH
Downloaded 

In [None]:
!zipline bundles

In [None]:
!zipline ingest -b snp100bundles

In [None]:
!zipline ingest --help