In [5]:
from sqlalchemy.orm import Session

from src.utils import get_db_engine
from create_tables import SP500


# Obtain the engine from the project helper and open an ORM session on it.
# NOTE(review): the session is not closed in any cell visible here — confirm
# it is closed (or used as a context manager) further down the notebook.
engine = get_db_engine()
session = Session(engine)

# Count the rows currently stored for the SP500 model and report it.
count = session.query(SP500).count()
print(f'there are {count} rows in table sp500.')


there are 0 rows in table sp500.


**check the column info**

In [8]:
# List name, type and nullability of every column declared on the SP500 table,
# so the CSV data can be validated against these constraints below.
for column in SP500.__table__.columns:
    print(f'Column: {column.name}, Type: {column.type}, Nullable: {column.nullable}')

Column: id, Type: INTEGER, Nullable: False
Column: ticker, Type: VARCHAR(10), Nullable: False
Column: company_name, Type: VARCHAR(50), Nullable: False
Column: sector, Type: VARCHAR(100), Nullable: False
Column: sub_industry, Type: VARCHAR(100), Nullable: False
Column: headquarters, Type: VARCHAR(50), Nullable: False
Column: date_added, Type: DATE, Nullable: False
Column: cik, Type: INTEGER, Nullable: False
Column: year_founded, Type: INTEGER, Nullable: False


In [40]:
from pathlib import Path

import pandas as pd


# Path is relative to the notebook's working directory; fail early if the
# file is not there instead of letting read_csv raise.
file_path = Path('./index/sp500.csv')
assert file_path.exists()

# Keep the loaded frame as the untouched "raw" stage; preview the first rows.
sp500_raw = pd.read_csv(file_path)
sp500_raw.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [41]:
# Show the CSV column names that have to be mapped onto the table columns.
sp500_raw.columns

Index(['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry',
       'Headquarters Location', 'Date added', 'CIK', 'Founded'],
      dtype='object')

In [42]:
# Work on a copy so that sp500_raw keeps the data exactly as loaded.
sp500 = sp500_raw.copy()

**clean up data**

In [43]:
# ticker is VARCHAR(10): a value of exactly 10 characters still fits, so the
# bound is inclusive. A missing value yields NaN from .str.len(), which compares
# False and therefore fails the check as well.
assert (sp500['Symbol'].str.len() <= 10).all(), 'Symbol exceeds VARCHAR(10) or is missing'

In [44]:
# company_name is VARCHAR(50): a value of exactly 50 characters still fits, so
# the bound is inclusive. A missing value yields NaN from .str.len(), which
# compares False and therefore fails the check as well.
assert (sp500['Security'].str.len() <= 50).all(), 'Security exceeds VARCHAR(50) or is missing'

In [45]:
# sector is VARCHAR(100): a value of exactly 100 characters still fits, so the
# bound is inclusive. A missing value yields NaN from .str.len(), which compares
# False and therefore fails the check as well.
assert (sp500['GICS Sector'].str.len() <= 100).all(), 'GICS Sector exceeds VARCHAR(100) or is missing'

In [46]:
# sub_industry is VARCHAR(100): a value of exactly 100 characters still fits, so
# the bound is inclusive. A missing value yields NaN from .str.len(), which
# compares False and therefore fails the check as well.
assert (sp500['GICS Sub-Industry'].str.len() <= 100).all(), 'GICS Sub-Industry exceeds VARCHAR(100) or is missing'

In [47]:
# headquarters is VARCHAR(50): a value of exactly 50 characters still fits, so
# the bound is inclusive. A missing value yields NaN from .str.len(), which
# compares False and therefore fails the check as well.
assert (sp500['Headquarters Location'].str.len() <= 50).all(), 'Headquarters Location exceeds VARCHAR(50) or is missing'

In [48]:
# cik is a non-nullable column. Series.all() skips NaN by default, so a bare
# truthiness check cannot detect missing values; test for them explicitly first.
assert sp500['CIK'].notna().all(), 'CIK has missing values'
# Original check kept: every CIK must be truthy (i.e. non-zero).
assert (sp500['CIK'].all()), 'CIK contains zero values'

In [49]:
# Every CIK must be convertible to int: the vectorised cast and the per-element
# int() conversion both have to succeed and produce equal Series.
assert sp500['CIK'].astype(int).equals(sp500['CIK'].apply(int))

In [50]:
# Every 'Founded' entry must be truthy (no empty strings).
# NOTE(review): Series.all() skips NaN by default, so this presumably does not
# catch missing values — confirm whether a notna() check is wanted.
assert sp500['Founded'].all()
# Records a finding rather than a requirement: not every entry is a plain
# 4-character year, so the column needs parsing. Uses the .str accessor, so it
# only works while 'Founded' still holds the raw strings.
assert not (sp500['Founded'].str.len() == 4).all()

In [53]:
# Rows whose raw 'Founded' text is longer than a single 4-digit year,
# i.e. entries that list more than one year such as '2013 (1888)'.
df_multi_year = sp500.loc[sp500['Founded'].str.len() > 4]
df_multi_year.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
23,AMCR,Amcor,Materials,Paper & Plastic Packaging Products & Materials,"Warmley, Bristol, United Kingdom",2019-06-07,1748790,2019 (1860)
36,AON,Aon plc,Financials,Insurance Brokers,"London, United Kingdom",1996-04-23,315293,1982 (1919)
47,T,AT&T,Communication Services,Integrated Telecommunication Services,"Dallas, Texas",1983-11-30,732717,1983 (1885)
57,BAC,Bank of America,Financials,Diversified Banks,"Charlotte, North Carolina",1976-06-30,70858,1998 (1923 / 1874)


In [54]:
# Report how many companies list more than one founding year.
print(f'there are {len(df_multi_year)} companies multiple founded years.')

there are 42 companies multiple founded years.


To simplify things, keep only the most recent founding year for each company.

In [None]:
# str.contains() requires a pattern; calling it without one raises TypeError.
# Select rows whose 'Founded' text holds any non-digit character (spaces,
# parentheses, slashes, ...). na=False treats missing values as "no match".
# Must run while 'Founded' still holds the raw strings.
df_special_char_years = sp500[sp500['Founded'].str.contains(r'\D', regex=True, na=False)]

In [65]:
import re

def parse_recent_founded_year(year_string: str) -> int:
    """Return the most recent 4-digit year found in ``year_string``.

    Args:
        year_string: Text such as ``'1998 (1923 / 1874)'``. A value that is
            already a plain year (e.g. the int ``1998``) is accepted as well,
            so applying the function twice to a column is harmless.

    Returns:
        The largest 4-digit number found, as an int.

    Raises:
        ValueError: If no 4-digit number is present in the input.
    """
    # str() lets already-parsed integer years pass through unchanged.
    years = [int(year) for year in re.findall(r'\d{4}', str(year_string))]
    if not years:
        # max([]) would raise a ValueError too, but with a message that does
        # not identify the offending value.
        raise ValueError(f'no four-digit year found in {year_string!r}')
    return max(years)


assert parse_recent_founded_year('1998 (1923 / 1874)') == 1998
assert parse_recent_founded_year('1998,1920(1923/1874)') == 1998
assert parse_recent_founded_year(1998) == 1998

In [63]:
# Parse from the untouched raw column rather than from sp500['Founded'] itself:
# overwriting a column with a function of itself makes the cell non-idempotent
# (a second run would feed already-parsed ints to the regex parser).
# Assignment aligns on the index, so rows of sp500 keep their own values.
sp500['Founded'] = sp500_raw['Founded'].apply(parse_recent_founded_year)

In [67]:
# Sanity bounds for the parsed years: strictly after 1700 and strictly before 2025.
assert (1700 < sp500['Founded']).all()
assert (2025 > sp500['Founded']).all()

In [69]:
# Show the full record of the company with the earliest founding year.
sp500.loc[sp500['Founded'].idxmin()]

Symbol                                                 BK
Security                                       BNY Mellon
GICS Sector                                    Financials
GICS Sub-Industry        Asset Management & Custody Banks
Headquarters Location             New York City, New York
Date added                                     1995-03-31
CIK                                               1390777
Founded                                              1784
Name: 66, dtype: object

In [70]:
# Summary statistics of the parsed founding years (count, mean, quartiles, extremes).
sp500['Founded'].describe()

count     503.000000
mean     1956.920477
std        48.723728
min      1784.000000
25%      1923.000000
50%      1971.000000
75%      1994.000000
max      2024.000000
Name: Founded, dtype: float64

In [72]:
# Five oldest companies by parsed founding year, as a plausibility check.
sp500.sort_values(by='Founded', ascending=True).head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
66,BK,BNY Mellon,Financials,Asset Management & Custody Banks,"New York City, New York",1995-03-31,1390777,1784
423,STT,State Street Corporation,Financials,Asset Management & Custody Banks,"Boston, Massachusetts",2003-03-14,93751,1792
114,CL,Colgate-Palmolive,Consumer Staples,Household Products,"New York City, New York",1957-03-04,21665,1806
222,HIG,Hartford (The),Financials,Property & Casualty Insurance,"Hartford, Connecticut",1957-03-04,874766,1810
76,BG,Bunge Global,Consumer Staples,Agricultural Products & Services,"Chesterfield, Missouri",2023-03-15,1996862,1818


In [74]:
# Five most recently founded companies, as a plausibility check.
sp500.sort_values(by='Founded', ascending=False).head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
209,GEV,GE Vernova,Industrials,Heavy Electrical Equipment,"Cambridge, Massachusetts",2024-04-02,1996810,2024
418,SOLV,Solventum,Health Care,Health Care Technology,"Saint Paul, Minnesota",2024-04-01,1964738,2023
446,TKO,TKO Group Holdings,Communication Services,Movies & Entertainment,"New York City, New York",2025-03-24,1973266,2023
467,VLTO,Veralto,Industrials,Environmental & Facilities Services,"Waltham, Massachusetts",2023-10-02,1967680,2023
483,WBD,Warner Bros. Discovery,Communication Services,Broadcasting,"New York City, New York",2022-04-11,1437107,2022
