In [None]:
import locale                                                                       # Importing packages from my Python venv
import sqlite3
from io import StringIO

import altair as alt
import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup
from vega_datasets import data

Data Scraping 

In [None]:
# Wikipedia page listing the S&P 500 constituents, requested over HTTPS.
sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# requests has no default timeout; without one a stalled server would hang this cell indefinitely.
response = requests.get(sp500_url, timeout=30)

In [None]:
# Report whether the request for 'sp500_url' came back with HTTP status 200.
print('Request successful' if response.status_code == 200 else 'Request not successful')

In [None]:
# Parse the downloaded page body, then keep the first <table> element found in it.
soup = BeautifulSoup(response.content, features='html.parser')
table = soup.find('table')

In [None]:
# Convert the scraped <table> element into a DataFrame; read_html returns a list of frames, so take the first.
# The markup is wrapped in StringIO because newer pandas deprecates passing a literal HTML string to read_html.
data_table = pd.read_html(StringIO(str(table)))[0]

In [None]:
# Print a heading followed by up to the first 503 rows of the scraped table.
print("Resulting DataFrame:", data_table.head(503), sep="\n")

Data Formatting: Converting Columns to Lists with .tolist()

In [None]:
# Pull the 'Symbol' column out of 'data_table' as a plain Python list.
tickers = data_table['Symbol'].to_list()

# Rewrite the two dotted class-share symbols (BF.B, BRK.B) to their dashed form;
# 'tickers_list' is what later gets passed to yf.download.
tickers_list = list(
    map(lambda symbol: symbol.replace('BF.B', 'BF-B').replace('BRK.B', 'BRK-B'), tickers)
)

print('Tickers list', tickers_list, sep='\n')

In [None]:
# 'GICS Sector' column of 'data_table' as a plain list.
sectors_list = data_table['GICS Sector'].to_list()

print('Sectors list', sectors_list, sep='\n')

In [None]:
# 'Security' (company name) column of 'data_table' as a plain list.
stocks_list = data_table['Security'].to_list()

print('Stocks list', stocks_list, sep='\n')

In [None]:
# 'GICS Sub-Industry' column of 'data_table' as a plain list.
Industry_list = data_table['GICS Sub-Industry'].to_list()

print('Industry list', Industry_list, sep='\n')

In [None]:
# 'Headquarters Location' column of 'data_table' as a plain list.
Location_list = data_table['Headquarters Location'].to_list()

print('Headquarters Location list', Location_list, sep='\n')

DataFrame Creation

In [None]:

# Assemble the five lists built above into one DataFrame, one column per list.
Sp500_data_table_columns = pd.DataFrame(
    dict(
        Tickers=tickers_list,
        Stocks=stocks_list,
        Sectors=sectors_list,
        Industry=Industry_list,
        Location=Location_list,
    )
)

print(Sp500_data_table_columns)

Importing Data From Library

In [None]:
# Download one year of daily history for every symbol in 'tickers_list' in a single call
# and keep only the 'Volume' columns in 'tickers_volume'.
tickers_volume = yf.download(tickers_list, interval="1d", period="1y")[['Volume']]

Calculate The Volume For Each Ticker

In [None]:
# Total volume per calendar day.
daily_volume = (
    tickers_volume['Volume']
    .resample('D')
    .sum()
)

# Mean volume within each weekly ('W') bin.
weekly_average_volume = (
    tickers_volume['Volume']
    .resample('W')
    .mean()
)

# Mean volume within each yearly ('Y') bin.
yearly_average_volume = (
    tickers_volume['Volume']
    .resample('Y')
    .mean()
)

In [None]:
print("Daily Volume for Each Ticker:")

# Newest dates first; as the cell's last expression the sorted frame is displayed.
daily_volume.sort_values(ascending=False, by='Date')

In [None]:
print("Weekly Average Volume for Each Ticker:")

# Newest weeks first; as the cell's last expression the sorted frame is displayed.
weekly_average_volume.sort_values(ascending=False, by='Date')

In [None]:
# Heading (preceded by a blank line) followed by the yearly averages.
print("\nYearly Average Volume for Each Ticker:", yearly_average_volume, sep="\n")

Data Scraping (Market Cap)

In [None]:
# Page that provides the market-cap table used further down.
market_cap_url = 'https://www.liberatedstocktrader.com/sp-500-companies/'
# requests has no default timeout; without one a stalled server would hang this cell indefinitely.
response_2 = requests.get(market_cap_url, timeout=30)

In [None]:
# Report whether the request for 'market_cap_url' came back with HTTP status 200.
print('Request successful' if response_2.status_code == 200 else 'Request not successful')

In [None]:
# Parse the downloaded page and collect every <table> element on it.
soup_2 = BeautifulSoup(response_2.content, 'html.parser')
market_table = soup_2.find_all('table')
# Read the first table into a DataFrame. The markup is wrapped in StringIO because newer pandas
# deprecates passing a literal HTML string to read_html.
Sp500_columns = pd.read_html(StringIO(str(market_table[0])))[0]

Data Formatting (Market Cap) & Function Creation 

In [None]:
# Overwrite the first ROW of the table with these labels. The column labels themselves
# are not changed here: later cells still address the columns as 0 and 3.
Sp500_columns.iloc[0] = [
    'Tickers',
    'Names',
    'Sectors',
    'Market Cap',
]

def format_market_cap(market_cap):
    """Normalise a market-cap label such as '$55.2B' to the form '$55.20 B'.

    The '$' sign is stripped and the letters 'B', 'M' and 'T' are rewritten as
    the exponents e9, e6 and e12 so the text can be parsed as a float. The
    number is then re-rendered with two decimals and a 'T', 'B' or 'M' suffix
    chosen by magnitude (anything below 1e9 is shown in millions).

    Args:
        market_cap: Raw cell value from the market-cap column.

    Returns:
        The formatted string, or ``market_cap`` unchanged when it is not a
        string (e.g. NaN/None for a missing cell) or cannot be parsed as a
        number (e.g. a header label).
    """
    if not isinstance(market_cap, str):
        # Missing cells are not strings; calling .replace() on them would raise
        # AttributeError, which the ValueError handler below does not catch.
        return market_cap

    try:
        value = float(market_cap.replace('$', '').replace('B', 'e9').replace('M', 'e6').replace('T', 'e12'))
    except ValueError:
        # Not a recognisable amount: hand the original text back untouched.
        return market_cap

    if value >= 1e12:
        return "${:.2f} T".format(value / 1e12)
    if value >= 1e9:
        return "${:.2f} B".format(value / 1e9)
    return "${:.2f} M".format(value / 1e6)

# Run every value of column 3 (the market-cap column) through format_market_cap.
Sp500_columns[3] = Sp500_columns[3].apply(format_market_cap)

print("Resulting DataFrame:", Sp500_columns, sep="\n")

Data Merging (JOIN), Dropping, and Renaming Columns

In [None]:
# Left-join the market-cap table (its columns 0 = ticker, 3 = market cap) onto the
# constituents frame by ticker, then drop the helper/unused columns and give the
# market-cap column a readable name. Written as one chain so the cell is re-runnable.
Merged_Sp500_columns = (
    Sp500_data_table_columns
    .merge(Sp500_columns[[0, 3]], how='left', left_on='Tickers', right_on=0)
    .drop(columns=[0, 'Location', 'Industry'])
    .rename(columns={3: 'Market_Cap'})
)

Data Mapping: Updating Market Cap Values within the DataFrame

In [None]:
# Manual market-cap values keyed by ticker.
# NOTE(review): these strings ('$85.698B') are not in the '$55.20 B' style that
# format_market_cap produces - confirm whether the mixed formats are intended.
market_cap_mapping = {
    'ABNB': '$85.698B',
    'BRK-B': '$773.209B',
    'BX': '$139.435B',
    'BF-B': '$29.172B',
    'COR': '$40.92B',
    'HUBB': '$16.077B',
    'KVUE': '$38.623B',
    'LULU': '$57.82B',
    'VLTO': '$18.609B',
}

# For each mapped ticker, write its value into 'Market_Cap' on the matching row(s).
for ticker, market_cap_value in market_cap_mapping.items():
    Merged_Sp500_columns.loc[Merged_Sp500_columns['Tickers'].eq(ticker), 'Market_Cap'] = market_cap_value

Function Creation (Average Volume per Ticker)

In [None]:
def calculate_monthly_average(ticker):
    """Return the mean of a ticker's per-month average daily volume.

    One year of daily data is downloaded with yf.download, 'Volume' is averaged
    within each month-end ('M') resample bin, and those monthly averages are
    averaged again into a single number.

    Args:
        ticker: Symbol to download.

    Returns:
        The mean of the monthly mean volumes.
    """
    ticker_monthly_data = yf.download(ticker, period="1y", interval="1d")
    # This line was indented 3 spaces in the original, which is an IndentationError.
    monthly_average = ticker_monthly_data['Volume'].resample('M').mean()
    return monthly_average.mean()


# One download per row: compute the figure for every ticker in the merged frame.
Merged_Sp500_columns['Average_Monthly_Volume'] = Merged_Sp500_columns['Tickers'].apply(calculate_monthly_average)

In [None]:
def calculate_weekly_average(ticker):
    """Return the mean of a ticker's per-week average daily volume.

    One year of daily data is downloaded with yf.download, 'Volume' is averaged
    within each weekly ('W') resample bin, and those weekly averages are
    averaged again into a single number.
    """
    history = yf.download(ticker, period="1y", interval="1d")
    per_week_mean = history['Volume'].resample('W').mean()
    return per_week_mean.mean()


# One download per row: compute the figure for every ticker in the merged frame.
Merged_Sp500_columns['Average_Weekly_Volume'] = Merged_Sp500_columns['Tickers'].apply(calculate_weekly_average)

Data Formatting (Volume) 

In [None]:
# Use the user's default locale so grouping=True inserts that locale's thousands separator.
locale.setlocale(locale.LC_ALL, '')

# Render 'Average_Monthly_Volume' as whole numbers with thousands separators.
# Values that are already strings (cell re-run) or missing (NaN) are left as they are,
# because "%d" raises on both.
Merged_Sp500_columns['Average_Monthly_Volume'] = Merged_Sp500_columns['Average_Monthly_Volume'].apply(
    lambda volume: volume
    if isinstance(volume, str) or pd.isna(volume)
    else locale.format_string("%d", volume, grouping=True)
)

In [None]:
# Use the user's default locale so grouping=True inserts that locale's thousands separator.
locale.setlocale(locale.LC_ALL, '')

# Render 'Average_Weekly_Volume' as whole numbers with thousands separators.
# Values that are already strings (cell re-run) or missing (NaN) are left as they are,
# because "%d" raises on both.
Merged_Sp500_columns['Average_Weekly_Volume'] = Merged_Sp500_columns['Average_Weekly_Volume'].apply(
    lambda volume: volume
    if isinstance(volume, str) or pd.isna(volume)
    else locale.format_string("%d", volume, grouping=True)
)

In [39]:
# Print the column names, non-null counts and dtypes of the merged frame.
Merged_Sp500_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Tickers                 503 non-null    object
 1   Stocks                  503 non-null    object
 2   Sectors                 503 non-null    object
 3   Market_Cap              503 non-null    object
 4   Average_Monthly_Volume  503 non-null    int64 
 5   Average_Weekly_Volume   503 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 23.7+ KB


In [37]:
print("Resulting S&P 500 DataFrame:")

# Alphabetical by company name; as the cell's last expression the sorted frame is displayed.
Merged_Sp500_columns.sort_values(ascending=True, by='Stocks')

Resulting S&P 500 DataFrame:


Unnamed: 0,Tickers,Stocks,Sectors,Market_Cap,Average_Monthly_Volume,Average_Weekly_Volume
0,MMM,3M,Industrials,$55.20 B,3521469,3526526
1,AOS,A. O. Smith,Industrials,$10.37 B,1045112,1046008
7,AES,AES Corporation,Utilities,$12.06 B,5908709,5925824
39,APA,APA Corporation,Energy,$13.59 B,5000462,5095130
48,T,AT&T,Communication Services,$100.44 B,38979177,38228193
...,...,...,...,...,...,...
499,ZBRA,Zebra Technologies,Information Technology,$13.96 B,394655,403305
500,ZBH,Zimmer Biomet,Health Care,$24.90 B,1495374,1511213
501,ZION,Zions Bancorporation,Financials,$5.15 B,3659809,3750224
502,ZTS,Zoetis,Health Care,$83.58 B,1925033,1972249


Exporting DataFrame to SQLite 

In [36]:
# Write the merged frame to table 'SP500_Columns_VL' in SP500_Database.db,
# replacing the table if it already exists.
conn = sqlite3.connect('SP500_Database.db')
try:
    Merged_Sp500_columns.to_sql('SP500_Columns_VL', conn, index=False, if_exists='replace')
finally:
    # Close the connection even when to_sql raises, so the database file handle is not leaked.
    conn.close()

Exporting DataFrame to Excel

In [None]:
# Write the merged frame to 'SP500_excel_file.xlsx' without the DataFrame index column.
Merged_Sp500_columns.to_excel('SP500_excel_file.xlsx', index=False) 