In [1]:
import yfinance as yf
from datetime import datetime
import pandas as pd

### In our case we will use the S&P 500 index (via the SPY ETF, which tracks it)

In [2]:
# Create the yfinance handle for the 'SPY' symbol used throughout this notebook
sp_500 = yf.Ticker('SPY')

In [3]:
# Display the metadata dictionary that yfinance exposes for this ticker
sp_500.info

{'longBusinessSummary': 'The Trust seeks to achieve its investment objective by holding a portfolio of the common stocks that are included in the index (the “Portfolio”), with the weight of each stock in the Portfolio substantially corresponding to the weight of such stock in the index.',
 'maxAge': 86400,
 'priceHint': 2,
 'previousClose': 569.2345,
 'open': 567.84,
 'dayLow': 565.17,
 'dayHigh': 568.6455,
 'regularMarketPreviousClose': 569.2345,
 'regularMarketOpen': 567.84,
 'regularMarketDayLow': 565.17,
 'regularMarketDayHigh': 568.6455,
 'trailingPE': 28.544195,
 'volume': 45300109,
 'regularMarketVolume': 45300109,
 'averageVolume': 51019140,
 'averageVolume10days': 53201870,
 'averageDailyVolume10Day': 53201870,
 'bid': 566.71,
 'ask': 566.64,
 'bidSize': 900,
 'askSize': 1100,
 'yield': 0.0121,
 'totalAssets': 566341992448,
 'fiftyTwoWeekLow': 409.21,
 'fiftyTwoWeekHigh': 572.88,
 'fiftyDayAverage': 550.391,
 'twoHundredDayAverage': 518.624,
 'trailingAnnualDividendRate': 5.66

In [4]:
# Define the download window and the bar size for the price history.
# NOTE: in yfinance `period` is a look-back window ('1d', '1mo', 'max', ...),
# not the sampling frequency. It conflicts with an explicit start/end pair:
# older releases silently ignore it, and newer ones are expected to reject
# period+start+end together. Daily bars are requested through `interval`.
data_params = {
    'start': datetime(year=2019, month=1, day=1),
    'end': datetime(year=2024, month=9, day=1),
    'interval': '1d'
}

In [5]:
# Get historical market data
# Every entry of data_params is forwarded to Ticker.history as a keyword argument.
data = sp_500.history(**data_params)

In [6]:
# Display the downloaded frame to inspect its columns and date range
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-02 00:00:00-05:00,225.258024,230.047445,225.230552,229.104202,126925200,0.0,0.0,0.0
2019-01-03 00:00:00-05:00,227.318481,227.629849,223.142629,223.637146,144140700,0.0,0.0,0.0
2019-01-04 00:00:00-05:00,226.732398,231.787383,226.347781,231.128036,142628800,0.0,0.0,0.0
2019-01-07 00:00:00-05:00,231.402731,234.388095,230.486974,232.950363,103139100,0.0,0.0,0.0
2019-01-08 00:00:00-05:00,235.184827,235.633540,232.602385,235.139023,102512600,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,563.179993,563.909973,559.049988,560.789978,35788600,0.0,0.0,0.0
2024-08-27 00:00:00-04:00,559.489990,562.059998,558.320007,561.559998,32693900,0.0,0.0,0.0
2024-08-28 00:00:00-04:00,561.210022,561.650024,555.039978,558.299988,41066000,0.0,0.0,0.0
2024-08-29 00:00:00-04:00,560.309998,563.679993,557.179993,558.349976,38715200,0.0,0.0,0.0


In [7]:
# Collect every column that holds exactly one distinct value.
# dropna=False makes NaN count as a value, mirroring len(Series.unique()).
excluded_cols = [
    name for name in data.columns
    if data[name].nunique(dropna=False) == 1
]
excluded_cols

['Stock Splits', 'Capital Gains']

In [8]:
# Inspect how often each distinct value occurs in the "Dividends" column
data['Dividends'].value_counts()

Dividends
0.000    1404
1.366       2
1.633       1
1.595       1
1.906       1
1.583       1
1.638       1
1.506       1
1.781       1
1.596       1
1.577       1
1.428       1
1.233       1
1.376       1
1.278       1
1.580       1
1.339       1
1.406       1
1.570       1
1.384       1
1.432       1
1.759       1
Name: count, dtype: int64

In [9]:
# Most "Dividends" values are zero, so mark that column for exclusion as well
excluded_cols.extend(['Dividends'])

In [10]:
# Drop every column listed in excluded_cols, then display the trimmed frame
data = data.drop(columns=excluded_cols)
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02 00:00:00-05:00,225.258024,230.047445,225.230552,229.104202,126925200
2019-01-03 00:00:00-05:00,227.318481,227.629849,223.142629,223.637146,144140700
2019-01-04 00:00:00-05:00,226.732398,231.787383,226.347781,231.128036,142628800
2019-01-07 00:00:00-05:00,231.402731,234.388095,230.486974,232.950363,103139100
2019-01-08 00:00:00-05:00,235.184827,235.633540,232.602385,235.139023,102512600
...,...,...,...,...,...
2024-08-26 00:00:00-04:00,563.179993,563.909973,559.049988,560.789978,35788600
2024-08-27 00:00:00-04:00,559.489990,562.059998,558.320007,561.559998,32693900
2024-08-28 00:00:00-04:00,561.210022,561.650024,555.039978,558.299988,41066000
2024-08-29 00:00:00-04:00,560.309998,563.679993,557.179993,558.349976,38715200


In [12]:
# Write the cleaned frame to a CSV file in the current working directory
data.to_csv(path_or_buf="s_p_500_data.csv")