**download SP500 stock data using yfinance**

In [1]:
from src.utils import sp500_symbols

symbols = sp500_symbols()
print(f'there are {len(symbols)} stocks in SP500 list.')

there are 503 stocks in SP500 list.


In [2]:
print(f'some SP500 stock symbol: {symbols[:5]}')

some SP500 stock symbol: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN']


In [None]:
# for more info:  https://github.com/ranaroussi/yfinance/issues/2325

import time 

import yfinance as yf
from requests import Session
from requests_cache import CacheMixin, SQLiteCache
from requests_ratelimiter import LimiterMixin, MemoryQueueBucket
from pyrate_limiter import Duration, RequestRate, Limiter


class CachedLimiterSession(CacheMixin, LimiterMixin, Session):  #type:ignore
    pass

# https://help.yahooinc.com/dsp-api/docs/rate-limits
# Define multiple rate limits
yf_limiter = Limiter(
    RequestRate(60, Duration.MINUTE),      # Max 60 requests per minute
    RequestRate(360, Duration.HOUR),       # Max 360 requests per hour
    RequestRate(8000, Duration.DAY)         # Max 8000 requests per day
)

yf_session = CachedLimiterSession(
    limiter=yf_limiter,
    bucket_class=MemoryQueueBucket,
    backend=SQLiteCache("yfinance.cache"),
)

period = 'max'  # '5d'  '1y'  'max'

for symbol in symbols[:10]:
    tickers = [symbol]
    data = yf.download(
                        tickers=tickers,
                        period=period,
                        rounding = True,
                        group_by='ticker',
                        auto_adjust=True,
                        threads=1,  #type:ignore
                        session=yf_session
                    )
    
    time.sleep(7)

In [None]:
import time


for ticker in symbols[:10]:
    try:
        data = yf.Ticker(ticker).history(period="1mo")
        time.sleep(5)  # Add delay
    except Exception as e:
        print(f"Rate limit hit. Waiting before retrying...")
        time.sleep(60)  # Wait longer before retrying


In [6]:
import yfinance as yf


period = 'max'  # '5d'  '1y'  'max'

stocks_df_raw = yf.download(tickers=symbols, group_by='Ticker', period=period, rounding=True)

[*********************100%***********************]  503 of 503 completed


In [11]:
assert stocks_df_raw is not None
stocks_df_raw.head()

Ticker,CTVA,CTVA,CTVA,CTVA,CTVA,PPL,PPL,PPL,PPL,PPL,...,AIG,AIG,AIG,AIG,AIG,GL,GL,GL,GL,GL
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,
1962-01-04,,,,,,,,,,,...,,,,,,,,,,
1962-01-05,,,,,,,,,,,...,,,,,,,,,,
1962-01-08,,,,,,,,,,,...,,,,,,,,,,


In [12]:
assert stocks_df_raw is not None
stocks_df_raw.tail()

Ticker,CTVA,CTVA,CTVA,CTVA,CTVA,PPL,PPL,PPL,PPL,PPL,...,AIG,AIG,AIG,AIG,AIG,GL,GL,GL,GL,GL
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-05-05,62.34,62.98,61.91,62.41,2275900.0,36.2,36.38,35.84,36.28,3842500.0,...,83.41,84.05,82.65,83.05,2619000.0,121.19,122.65,121.14,121.49,524400.0
2025-05-06,62.34,62.84,62.0,62.45,2680800.0,36.24,36.41,36.07,36.12,3957100.0,...,82.51,83.31,82.32,82.53,2482300.0,120.74,122.24,119.84,120.34,657100.0
2025-05-07,62.67,62.82,62.02,62.48,3290700.0,36.05,36.58,36.05,36.33,4791600.0,...,82.71,83.18,81.52,81.69,3325100.0,121.08,122.39,120.66,120.96,768400.0
2025-05-08,64.63,67.25,63.82,66.86,6172800.0,36.18,36.23,35.54,35.62,4920000.0,...,82.22,83.0,81.7,81.82,3710000.0,121.9,122.87,120.6,121.23,604700.0
2025-05-09,66.71,67.93,66.57,67.79,4289100.0,35.73,35.74,35.21,35.51,4788100.0,...,82.13,82.91,81.68,82.41,4926000.0,121.59,122.54,121.28,121.94,436100.0


In [13]:
assert stocks_df_raw is not None
stocks_df_raw.columns

MultiIndex([('CTVA',   'Open'),
            ('CTVA',   'High'),
            ('CTVA',    'Low'),
            ('CTVA',  'Close'),
            ('CTVA', 'Volume'),
            ( 'PPL',   'Open'),
            ( 'PPL',   'High'),
            ( 'PPL',    'Low'),
            ( 'PPL',  'Close'),
            ( 'PPL', 'Volume'),
            ...
            ( 'AIG',   'Open'),
            ( 'AIG',   'High'),
            ( 'AIG',    'Low'),
            ( 'AIG',  'Close'),
            ( 'AIG', 'Volume'),
            (  'GL',   'Open'),
            (  'GL',   'High'),
            (  'GL',    'Low'),
            (  'GL',  'Close'),
            (  'GL', 'Volume')],
           names=['Ticker', 'Price'], length=2515)

**organize the multi-level column names**

In [14]:
assert stocks_df_raw is not None
stocks_df = stocks_df_raw.stack(level=0, future_stack=True).rename_axis(['Date', 'Ticker']).reset_index(level=1)

In [15]:
stocks_df.head()

Price,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,CTVA,,,,,
1962-01-02,PPL,,,,,
1962-01-02,JBL,,,,,
1962-01-02,CARR,,,,,
1962-01-02,GILD,,,,,


In [16]:
print(f'there are {stocks_df.shape[0]:,} rows of stock records.')

there are 8,020,838 rows of stock records.


In [17]:
df = stocks_df.dropna()
print(f'there are {df.shape[0]:,} valid records.')

there are 4,308,245 valid records.


In [18]:
df.head()

Price,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,HON,0.0,1.05,1.05,1.05,40740.0
1962-01-02,MO,0.0,0.0,0.0,0.0,345600.0
1962-01-02,IP,0.0,0.89,0.87,0.87,51552.0
1962-01-02,DIS,0.06,0.06,0.06,0.06,841958.0
1962-01-02,XOM,0.0,0.09,0.09,0.09,902400.0


In [19]:
df.tail()

Price,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-05-09,LVS,39.78,40.12,39.62,39.79,5363700.0
2025-05-09,CEG,272.07,274.95,266.06,271.37,3244500.0
2025-05-09,SNPS,487.11,487.84,479.04,482.9,571500.0
2025-05-09,AIG,82.13,82.91,81.68,82.41,4926000.0
2025-05-09,GL,121.59,122.54,121.28,121.94,436100.0


In [20]:
df.describe()

Price,Open,High,Low,Close,Volume
count,4308245.0,4308245.0,4308245.0,4308245.0,4308245.0
mean,49.77447,50.44368,49.26039,49.86493,6380172.0
std,157.5594,159.4162,155.6959,157.5684,40273780.0
min,0.0,0.0,0.0,0.0,0.0
25%,4.19,4.34,4.22,4.28,497600.0
50%,17.31,17.56,17.08,17.33,1479700.0
75%,48.0,48.53,47.45,48.01,3926703.0
max,9914.17,9964.77,9794.0,9924.4,9230856000.0


**clean the data**

In [21]:
df['Volume'] = df['Volume'].fillna(0).astype(int)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Volume'] = df['Volume'].fillna(0).astype(int)


Price,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,HON,0.0,1.05,1.05,1.05,40740
1962-01-02,MO,0.0,0.0,0.0,0.0,345600
1962-01-02,IP,0.0,0.89,0.87,0.87,51552
1962-01-02,DIS,0.06,0.06,0.06,0.06,841958
1962-01-02,XOM,0.0,0.09,0.09,0.09,902400


**set 'Date' as a column**

In [22]:
df.reset_index(inplace=True)
df.head()

Price,Date,Ticker,Open,High,Low,Close,Volume
0,1962-01-02,HON,0.0,1.05,1.05,1.05,40740
1,1962-01-02,MO,0.0,0.0,0.0,0.0,345600
2,1962-01-02,IP,0.0,0.89,0.87,0.87,51552
3,1962-01-02,DIS,0.06,0.06,0.06,0.06,841958
4,1962-01-02,XOM,0.0,0.09,0.09,0.09,902400


**check db table info**

In [23]:
from src.create_tables import StockPrice

column_info = {c.name: c.type for c in StockPrice.__table__.columns}
column_info

{'id': Integer(),
 'ticker': String(length=10),
 'trade_date': Date(),
 'open_price': Numeric(precision=10, scale=2),
 'high_price': Numeric(precision=10, scale=2),
 'low_price': Numeric(precision=10, scale=2),
 'close_price': Numeric(precision=10, scale=2),
 'volume': BigInteger()}

change dataframe column name to match db table column name in order to use to_sql

In [24]:
df_renamed = df.rename(columns={'Date': 'trade_date',
                                'Ticker': 'ticker',
                                'Open': 'open_price',
                                'High': 'high_price',
                                'Low': 'low_price',
                                'Close': 'close_price',
                                'Volume': 'volume'
                                })

In [25]:
df_renamed.head()

Price,trade_date,ticker,open_price,high_price,low_price,close_price,volume
0,1962-01-02,HON,0.0,1.05,1.05,1.05,40740
1,1962-01-02,MO,0.0,0.0,0.0,0.0,345600
2,1962-01-02,IP,0.0,0.89,0.87,0.87,51552
3,1962-01-02,DIS,0.06,0.06,0.06,0.06,841958
4,1962-01-02,XOM,0.0,0.09,0.09,0.09,902400


In [None]:
from importlib import reload
from src import utils
reload(utils)

from src.utils import get_db_engine


engine = get_db_engine()

df_renamed.to_sql(name='stock_prices', con=engine, if_exists='append', index=False, chunksize=10_000)

-431

In [27]:
from sqlalchemy.orm import Session

session = Session(engine)

n_rows = session.query(StockPrice).count()

assert df_renamed.shape[0] == n_rows
print(f'There are {n_rows:,} rows in table {StockPrice.__tablename__}')

There are 4,308,245 rows in table stock_prices
