In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import bs4 as bs
import datetime as dt
import os
import pandas as pd
#import pandas_datareader.data as web
import yfinance as yf
import pickle
import requests


def save_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        tickers.append(ticker)
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers



def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    start = dt.datetime(2010, 1, 1)
    end = dt.datetime(2024, 6, 19)
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            #df = web.DataReader(ticker, 'yahoo', start, end)
            try: # this will try to download the data and save it to a file
                df = yf.download(ticker, start, end)
                df.reset_index(inplace=True)
                df.set_index("Date", inplace=True)
                #df = df.drop("Symbol", axis=1)
                df.to_csv('stock_dfs/{}.csv'.format(ticker))
            except: # if the download fails, print the ticker and continue
                print(f"Could not download {ticker}")
                continue
        else:
            print('Already have {}'.format(ticker))


def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        try: # this will try to read the data from the file and add it to the main dataframe
            df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
            df.set_index('Date', inplace=True)
            print(ticker)
            df.rename(columns={'Adj Close': ticker, 'Volume': ticker+'_VOL'}, inplace=True)
            df.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True)

            if main_df.empty:
                main_df = df
            else:
                main_df = main_df.join(df, how='outer')

            if count % 10 == 0:
                print(count)
        except: # if the file does not exist, print the ticker and continue
            print(f"Could not read {ticker}")
            continue
    main_df.columns = [x.replace("\n", " ") for x in main_df.columns.to_list()]
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')



save_sp500_tickers()
get_data_from_yahoo(reload_sp500=False)
compile_data()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

$BF.B: possibly delisted; No price data found  (1d 2010-01-01 00:00:00 -> 2024-06-19 00:00:00)



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******

MMM

0
AOS

ABT

ABBV

ACN

ADBE

AMD

AES

AFL

A

APD

10
ABNB

AKAM

ALB

ARE

ALGN

ALLE

LNT

ALL

GOOGL

GOOG

20
MO

AMZN

AMCR

AEE

AAL

AEP

AXP

AIG

AMT

AWK

30
AMP

AME

AMGN

APH

ADI

ANSS

AON

APA

AAPL

AMAT

40
APTV

ACGL

ADM

ANET

AJG

AIZ

T

ATO

ADSK

ADP

50
AZO

AVB

AVY

AXON

BKR

BALL

BAC

BK

BBWI

BAX

60
BDX

BRK.B

BBY

BIO

TECH

BIIB

BLK

BX

BA

BKNG

70
BWA

BSX

BMY

AVGO

BR

BRO

BF.B

BLDR

BG

BXP

80
CDNS

CZR

CPT

CPB

COF

CAH

KMX

CCL

CARR

CTLT

90
CAT

CBOE

CBRE

CDW

CE

COR

CNC

CNP

CF

CHRW

100
CRL

SCHW

CHTR

CVX

CMG

CB

CHD

CI

CINF

CTAS

110
CSCO

C

CFG

CLX

CME

CMS

KO

CTSH

CL

CMCSA

120
CAG

COP

ED

STZ

CEG

COO

CPRT

GLW

CPAY

CTVA

130
CSGP

COST

CTRA

CRWD

CCI

CSX

CMI

CVS

DHR

DRI

140
DVA

DAY

DECK

DE

DAL

DVN

DXCM

FANG

DLR

DFS

150
DG

DLTR

D

DPZ

DOV

DOW

DHI

DTE

DUK

DD

160
EMN

ETN

EBAY

ECL

EIX

EW

EA

ELV

EMR

ENPH

170
ETR

EOG

EPAM

EQT

EFX

EQIX

EQR

ESS

EL

ETSY

1

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import bs4 as bs
import datetime as dt
import os
import pandas as pd
#import pandas_datareader.data as web
import yfinance as yf
import pickle
import requests

def get_data_from_yahoo_AAPL():
    tickers = ['AAPL']
    directory = 'stock_dfs'
    if not os.path.exists(directory):
        try:
            os.makedirs(directory, exist_ok=True)
        except FileExistsError:
            pass

    start = dt.datetime(2010, 1, 1)
    end = dt.datetime(2024, 6, 19)
    for ticker in tickers:
        # just in case your connection breaks, we'd like to save our progress!
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            df = yf.download(ticker, start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv('stock_dfs/{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))


def compile_data_AAPL():
    tickers = ["AAPL"]

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)
        print(ticker)
        df.rename(columns={'Adj Close': ticker, 'Volume': ticker+'_VOL'}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True)

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')

        if count % 10 == 0:
            print(count)
    main_df.columns = [x.replace("\n", " ") for x in main_df.columns.to_list()]
    print(main_df.head())
    main_df.to_csv('AAPL.csv')



get_data_from_yahoo_AAPL()
compile_data_AAPL()


[*********************100%%**********************]  1 of 1 completed


AAPL
0
                AAPL   AAPL_VOL
Date                           
2010-01-04  6.461977  493729600
2010-01-05  6.473146  601904800
2010-01-06  6.370185  552160000
2010-01-07  6.358407  477131200
2010-01-08  6.400680  447610800


In [3]:
get_data_from_yahoo_AAPL()

[*********************100%%**********************]  1 of 1 completed


In [6]:
df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
df.head()

Unnamed: 0_level_0,MMM,MMM _VOL,AOS,AOS _VOL,ABT,ABT _VOL,ABBV,ABBV _VOL,ACN,ACN _VOL,...,XYL,XYL _VOL,YUM,YUM _VOL,ZBRA,ZBRA _VOL,ZBH,ZBH _VOL,ZTS,ZTS _VOL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,44.254013,3640265,5.9864,1104600,18.852148,10829095,,,32.071804,3650100,...,,,19.064411,2962274,28.67,168800,52.470428,805872,,
2010-01-05,43.976822,3405012,5.909908,1207200,18.699837,10562109,,,32.27002,2613000,...,,,18.999205,3298757,28.620001,168800,54.131435,1769643,,
2010-01-06,44.600498,6301126,5.912591,663000,18.803686,11401417,,,32.613075,5772100,...,,,18.863394,4178981,28.4,385300,54.113968,1315619,,
2010-01-07,44.632473,5346240,5.930039,564000,18.959454,12857232,,,32.582581,4022900,...,,,18.857958,2452472,27.690001,183600,55.355339,1734005,,
2010-01-08,44.946983,4073337,6.017264,504600,19.056381,12148604,,,32.452988,5069700,...,,,18.863394,3772392,27.6,266500,54.192635,2213985,,
