# sp500_scrape.ipynb
----

Written in the Python 3.7.9 Environment

By Nicole Lund 

This Jupyter Notebook scrapes an HTML table of S&P 500 companies from:

Wikimedia Foundation. (2021, June 4). List of S&P 500 companies. Wikipedia. https://en.wikipedia.org/wiki/List_of_S%26P_500_companies. 

In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Wikipedia page whose tables are scraped below
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

In [3]:
# Parse every HTML table found at the source URL into a list of DataFrames
tables_as_list = pd.read_html(io=url)
# Bare expression so the notebook displays the scraped tables for inspection
tables_as_list

[    Symbol             Security SEC filings             GICS Sector  \
 0      MMM           3M Company     reports             Industrials   
 1      ABT  Abbott Laboratories     reports             Health Care   
 2     ABBV          AbbVie Inc.     reports             Health Care   
 3     ABMD              Abiomed     reports             Health Care   
 4      ACN            Accenture     reports  Information Technology   
 ..     ...                  ...         ...                     ...   
 500    YUM      Yum! Brands Inc     reports  Consumer Discretionary   
 501   ZBRA   Zebra Technologies     reports  Information Technology   
 502    ZBH        Zimmer Biomet     reports             Health Care   
 503   ZION        Zions Bancorp     reports              Financials   
 504    ZTS               Zoetis     reports             Health Care   
 
                       GICS Sub-Industry    Headquarters Location  \
 0              Industrial Conglomerates      St. Paul, Minnesota

In [4]:
# Extract Table of Interest
# The company listing is taken as the first table in the scraped list.
# NOTE: this relies on the order of tables on the source page; if the page
# layout changes, index 0 may no longer be the company table.
full_sp500_df = tables_as_list[0]
# Bare expression so the notebook displays the extracted DataFrame
full_sp500_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands Inc,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [5]:
# Remove Unnecessary Columns - keep only the symbol, name and GICS classification
sp500_df = full_sp500_df.loc[
    :,
    ['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry'],
]
sp500_df

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry
0,MMM,3M Company,Industrials,Industrial Conglomerates
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals
3,ABMD,Abiomed,Health Care,Health Care Equipment
4,ACN,Accenture,Information Technology,IT Consulting & Other Services
...,...,...,...,...
500,YUM,Yum! Brands Inc,Consumer Discretionary,Restaurants
501,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments
502,ZBH,Zimmer Biomet,Health Care,Health Care Equipment
503,ZION,Zions Bancorp,Financials,Regional Banks


In [6]:
# Switch to snake_case column names, then index the table by ticker symbol
sp500_df = (
    sp500_df
    .rename(
        columns={
            'Symbol': 'ticker',
            'Security': 'security_name',
            'GICS Sector': 'gics_sector',
            'GICS Sub-Industry': 'gics_sub_industry',
        }
    )
    .set_index('ticker')
)
sp500_df

Unnamed: 0_level_0,security_name,gics_sector,gics_sub_industry
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,3M Company,Industrials,Industrial Conglomerates
ABT,Abbott Laboratories,Health Care,Health Care Equipment
ABBV,AbbVie Inc.,Health Care,Pharmaceuticals
ABMD,Abiomed,Health Care,Health Care Equipment
ACN,Accenture,Information Technology,IT Consulting & Other Services
...,...,...,...
YUM,Yum! Brands Inc,Consumer Discretionary,Restaurants
ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments
ZBH,Zimmer Biomet,Health Care,Health Care Equipment
ZION,Zions Bancorp,Financials,Regional Banks


In [7]:
# Review Data for NaNs - Result indicates no NaNs present in the data
# info() prints the non-null count for each column; when every count equals
# the number of index entries, no column contains missing values.
sp500_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505 entries, MMM to ZTS
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   security_name      505 non-null    object
 1   gics_sector        505 non-null    object
 2   gics_sub_industry  505 non-null    object
dtypes: object(3)
memory usage: 15.8+ KB


In [8]:
# Review Data for Duplicates  - Result indicates no duplicates present in the data
# drop_duplicates() compares column values only and ignores the index, so a
# ticker that appears twice would go unnoticed now that 'ticker' is the index.
# Check the index explicitly before looking at the de-duplicated shape.
assert sp500_df.index.is_unique, 'Duplicate tickers found in the sp500_df index'
# The row count shown here should equal the entry count reported by info()
sp500_df.drop_duplicates().shape

(505, 3)

In [9]:
# Write the cleaned table to sp500.csv (relative path, so it lands in the working directory)
sp500_df.to_csv(path_or_buf='sp500.csv')