# Baseball ETL

### Imports

In [2]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import pg_pwd
from configAB import username, password
import psycopg2
import os

from bs4 import BeautifulSoup as bs
import requests

import pandas as pd
import numpy as np

### Initializations

In [3]:
engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/MoneyBall')
Base = automap_base()
Base.prepare(autoload_with=engine)
session = Session(engine)

In [3]:
#db initializations
#engine = create_engine(f'postgresql://postgres:Techbobcats=0618@localhost/MoneyBall')

#Base = automap_base()
#Base.prepare(autoload_with=engine)
#session = Session(engine)

### ETL Baseball Teams and Cities

In [34]:
# load from csv if stored

# extract
url = 'https://www.worldatlas.com/articles/mlb-teams-and-their-cities.html'
response = requests.get(url)
soup = bs(response.text, 'html.parser')
result = soup.find('tbody')
rows = result.find_all('tr')

# transfer
City = Base.classes.City 
Team = Base.classes.Team 

for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]

    teamcol = cols[1]
    citystatecol = cols[2].split(', ')
    citycol = citystatecol[0]
    statecol = citystatecol[1]

    #load to City table
    city = City(cityname=citycol, statename=statecol)  
    session.add(city)
    session.commit()    
    session.refresh(city)

    #load to Team table
    team = Team(teamname=teamcol, cityid=city.cityid)  
    session.add(team)
    session.commit()    

#reload from db and save to csv
citiesDF = pd.read_sql_query('select cityid, cityname, statename from "City"', con=engine)
citiesDF.to_csv('data/Cities.csv', index=False)

teamsDF = pd.read_sql_query('select teamid, teamname, cityid from "Team"', con=engine)
teamsDF.to_csv('data/Teams.csv', index=False)

#### Team matching dictionary

In [35]:
#put teams in dictionary for lookup
teams_dict = dict(zip(teamsDF.teamname, teamsDF.teamid))

In [36]:
# additions from payroll set
teams_dict.update({'Tampa Bay Devil Rays': 4})
teams_dict.update({'Anaheim Angels': 12})
teams_dict.update({'Los Angeles Angels of Anaheim': 12})
teams_dict.update({'Montreal Expos': 20})
teams_dict.update({'Florida Marlins': 17})

In [37]:
# additions from salary set
teams_dict.update({'ARI': 26})
teams_dict.update({'ATL': 16})
teams_dict.update({'BAL': 1}) 
teams_dict.update({'BOS': 2}) 
teams_dict.update({'CHC': 21}) 
teams_dict.update({'CHW': 6}) 
teams_dict.update({'CIN': 22}) 
teams_dict.update({'CLE': 7}) 
teams_dict.update({'COL': 27})
teams_dict.update({'DET': 8}) 
teams_dict.update({'HOU': 11}) 
teams_dict.update({'KCR': 9}) 
teams_dict.update({'LAA': 12}) 
teams_dict.update({'LAD': 28}) 
teams_dict.update({'MIA': 17}) 
teams_dict.update({'MIL': 23}) 
teams_dict.update({'MIN': 10}) 
teams_dict.update({'NYM': 18})
teams_dict.update({'NYY': 3}) 
teams_dict.update({'OAK': 13}) 
teams_dict.update({'PHI': 19}) 
teams_dict.update({'PIT': 24}) 
teams_dict.update({'SDP': 29}) 
teams_dict.update({'SEA': 14}) 
teams_dict.update({'SFG': 30}) 
teams_dict.update({'STL': 25}) 
teams_dict.update({'TBR': 4})
teams_dict.update({'TEX': 15}) 
teams_dict.update({'TOR': 5}) 
teams_dict.update({'WSN': 20})

In [38]:
# additions from pitching set
teams_dict.update({'ANA': 12}) 
teams_dict.update({'LAN': 28}) 
teams_dict.update({'CHN': 21}) 
teams_dict.update({'FLO': 17})  
teams_dict.update({'SDN': 29}) 
teams_dict.update({'SLN': 25}) 
teams_dict.update({'MON': 20}) 
teams_dict.update({'CHA': 6}) 
teams_dict.update({'KCA': 9}) 
teams_dict.update({'NYN': 18}) 
teams_dict.update({'NYA': 3}) 
teams_dict.update({'TBA': 4}) 
teams_dict.update({'SFN': 30}) 
teams_dict.update({'WAS': 20}) 

### ETL Baseball Payrolls

In [39]:
# load from csv if stored

    # extract
mlb_payrollDF = pd.DataFrame()

# combine sets into payroll dataframe
for yr in range(2000, 2016):
    set_filepath = 'data/payroll_files/mlb_payroll_' + str(yr) + '.csv'
    DF = pd.read_csv(set_filepath)
    DF["year"] = yr
    DF["teamid"] = -1
    DF = DF.drop(columns=['roster'])
    DF.columns=['team_name', 'league', 'division', 'payroll', 'w', 'l', 'wpct', 
                'rnk', 'lgrk', 'mlbrk', 'last_payroll','top_salary', 'year', 'teamid']
    mlb_payrollDF = pd.concat([mlb_payrollDF, DF])

# transfer, and identify missing team names
Payroll = Base.classes.Payroll 
Team = Base.classes.Team 

for index, row in mlb_payrollDF.iterrows():
    if teams_dict.get(row.team_name, -1) == -1:
        print(row.team_name)
    else:
        mlb_payrollDF.loc[index, ['teamid']] = teams_dict.get(row.team_name)

        #load to Payroll table
        payroll = Payroll(
            teamid = teams_dict.get(row.team_name),
            payroll = row.payroll.replace(',', ''),
            w = row.w, 
            l = row.l, 
            wpct = row.wpct, 
            rnk = row.rnk, 
            lgrk = row.lgrk, 
            mlbrk = row.mlbrk, 
            last_payroll = row.last_payroll.replace(',', ''),
            top_salary = row.top_salary, 
            year = row.year) 

        session.add(payroll)
        session.commit()    

        #update Team table with league, division
        session.query(Team).filter(Team.teamid == teams_dict.get(row.team_name)).update(
            {Team.league:row.league}, synchronize_session = False)

        session.query(Team).filter(Team.teamid == teams_dict.get(row.team_name)).update(
            {Team.division:row.division}, synchronize_session = False)

        session.commit()    

#reload from db and save to csv
payrollsDF = pd.read_sql_query('select * from "Payroll"', con=engine)
payrollsDF.to_csv('data/Payrolls.csv', index=False)

The printed teams have had name changes or have moved since 2000.  Add mapping entries for those team to get proper ids.

- Tampa Bay Devil Rays -> Tampa Bay Rays (4)
- Anaheim Angels -> Los Angeles Angels (12)
- Montreal Expos -> Washington Nationals (20)
- Florida Marlins ->  Maimi Marlins (17)
- Los Angeles Angels of Anaheim -> Los Angeles Angels (12)

In [40]:
mlb_payrollDF.head()

Unnamed: 0,team_name,league,division,payroll,w,l,wpct,rnk,lgrk,mlbrk,last_payroll,top_salary,year,teamid
0,New York Yankees,AL,East,92938260,87,74,0.54,1,5,9,88130709,Bernie Williams (4),2000,28
1,Los Angeles Dodgers,NL,West,90725953,86,76,0.531,2,5,10,71135786,Kevin Brown (1),2000,3
2,Baltimore Orioles,AL,East,83141198,74,88,0.457,4,11,21,70818363,Albert Belle (3),2000,20
3,Atlanta Braves,NL,East,82732500,95,67,0.586,1,2,2,75065000,Greg Maddux (10),2000,8
4,Boston Red Sox,AL,East,81210333,85,77,0.525,2,6,11,71720000,Pedro Martinez (8),2000,2


### ETL Baseball Player Salaries

In [41]:
basebSal= pd.read_csv('data/0519_baseball_reference.csv')
basebSal.head() 

Unnamed: 0,team,year,name,war,sal,exp,playerid,lastsal
0,ARI,2005,Armando Almanza,0.2,0,7,almanar01,500000.0
1,ARI,2005,Greg Aquino,-1.0,325000,2,aquingr01,300000.0
2,ARI,2005,Brian Bruney,-1.6,322500,2,brunebr01,300000.0
3,ARI,2005,Randy Choate,-0.4,550000,6,choatra01,325750.0
4,ARI,2005,Alex Cintron,-0.2,360000,5,cintral01,335000.0


In [42]:
# load from csv if stored
# transfer, and identify missing salaries
Salary = Base.classes.Salary 

for index, row in basebSal.iterrows():
    if teams_dict.get(row.team, -1) == -1:
        print(row.team)
    else:
        basebSal.loc[index, ['teamid']] = teams_dict.get(row.team)

        #load to Salary table
        salary = Salary(
            salary = row.sal, 
            teamid = teams_dict.get(row.team),
            playerid= row.playerid,
            last_salary = row.lastsal,
            war = row.war,        
            exp = row.exp,
            year = row.year) 

        session.add(salary)
        session.commit()  

#reload from db and save to csv
basebSal = pd.read_sql_query('select * from "Salary"', con=engine)
basebSal.to_csv('data/Salaries.csv', index=False)

### ETL Baseball Player Pitching

In [13]:
pitching = pd.read_csv('data/Pitching.csv')
pitching

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,,,,0,,,42,,,
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,,,,0,,,292,,,
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,,,,0,,,9,,,
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,,,,0,,,257,,,
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,,,,0,,,21,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44134,youngch03,2015,1,KCA,AL,11,6,34,18,0,...,0.0,5.0,0.0,0,500.0,3.0,44,4.0,2.0,
44135,zieglbr01,2015,1,ARI,NL,0,3,66,0,0,...,3.0,2.0,1.0,0,263.0,46.0,17,1.0,0.0,
44136,zimmejo02,2015,1,WAS,NL,13,10,33,33,0,...,3.0,2.0,8.0,1,831.0,0.0,89,8.0,2.0,
44137,zitoba01,2015,1,OAK,AL,0,0,3,2,0,...,0.0,0.0,0.0,0,37.0,1.0,8,0.0,0.0,


In [14]:
pitching.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44139 entries, 0 to 44138
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   playerID  44139 non-null  object 
 1   yearID    44139 non-null  int64  
 2   stint     44139 non-null  int64  
 3   teamID    44139 non-null  object 
 4   lgID      44008 non-null  object 
 5   W         44139 non-null  int64  
 6   L         44139 non-null  int64  
 7   G         44139 non-null  int64  
 8   GS        44139 non-null  int64  
 9   CG        44139 non-null  int64  
 10  SHO       44139 non-null  int64  
 11  SV        44139 non-null  int64  
 12  IPouts    44138 non-null  float64
 13  H         44139 non-null  int64  
 14  ER        44139 non-null  int64  
 15  HR        44139 non-null  int64  
 16  BB        44139 non-null  int64  
 17  SO        44139 non-null  int64  
 18  BAOpp     42614 non-null  float64
 19  ERA       44049 non-null  float64
 20  IBB       29564 non-null  fl

In [15]:
pitching.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp',
       'ERA', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')

In [16]:
pitching_yr=pitching[pitching['yearID']>=2000]
pitching_yr

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
32900,abbotpa01,2000,1,SEA,AL,9,7,35,27,0,...,4.0,3.0,5.0,0,766.0,2.0,89,1.0,4.0,
32901,aceveju01,2000,1,MIL,NL,3,7,62,0,0,...,9.0,3.0,1.0,2,347.0,18.0,38,1.0,1.0,
32902,adamste01,2000,1,LAN,NL,6,9,66,0,0,...,0.0,5.0,0.0,0,369.0,18.0,42,3.0,0.0,
32903,aguilri01,2000,1,CHN,NL,1,2,54,0,0,...,2.0,1.0,4.0,0,210.0,44.0,28,1.0,0.0,
32904,aldresc01,2000,1,PHI,NL,1,3,23,0,0,...,0.0,1.0,1.0,0,95.0,5.0,14,1.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44134,youngch03,2015,1,KCA,AL,11,6,34,18,0,...,0.0,5.0,0.0,0,500.0,3.0,44,4.0,2.0,
44135,zieglbr01,2015,1,ARI,NL,0,3,66,0,0,...,3.0,2.0,1.0,0,263.0,46.0,17,1.0,0.0,
44136,zimmejo02,2015,1,WAS,NL,13,10,33,33,0,...,3.0,2.0,8.0,1,831.0,0.0,89,8.0,2.0,
44137,zitoba01,2015,1,OAK,AL,0,0,3,2,0,...,0.0,0.0,0.0,0,37.0,1.0,8,0.0,0.0,


In [17]:
pitching_final=pd.DataFrame(pitching_yr.drop(columns=['W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB','IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP']))
pitching_final

Unnamed: 0,playerID,yearID,stint,teamID,lgID,SO,BAOpp,ERA
32900,abbotpa01,2000,1,SEA,AL,100,0.243,4.22
32901,aceveju01,2000,1,MIL,NL,51,0.246,3.81
32902,adamste01,2000,1,LAN,NL,56,0.245,3.52
32903,aguilri01,2000,1,CHN,NL,38,0.251,4.91
32904,aldresc01,2000,1,PHI,NL,21,0.284,5.75
...,...,...,...,...,...,...,...,...
44134,youngch03,2015,1,KCA,AL,83,0.202,3.06
44135,zieglbr01,2015,1,ARI,NL,36,0.197,1.85
44136,zimmejo02,2015,1,WAS,NL,164,0.264,3.66
44137,zitoba01,2015,1,OAK,AL,2,0.387,10.29


In [18]:
pitching_final['team_id'] = -1

for index, row in pitching_final.iterrows():
    pitching_final.loc[index, ['team_id']] = teams_dict.get(row.teamID)
    
pitching_final.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,SO,BAOpp,ERA,team_id
32900,abbotpa01,2000,1,SEA,AL,100,0.243,4.22,14
32901,aceveju01,2000,1,MIL,NL,51,0.246,3.81,23
32902,adamste01,2000,1,LAN,NL,56,0.245,3.52,28
32903,aguilri01,2000,1,CHN,NL,38,0.251,4.91,21
32904,aldresc01,2000,1,PHI,NL,21,0.284,5.75,19


In [19]:
pitching_final.to_sql(name='Pitching', con=engine, if_exists='replace', index=False)

239

In [20]:
# load from csv if stored
if (os.path.isfile('data/PitchingNew.csv')):
    pitching_final = pd.read_csv('data/PitchingNew.csv')
else:
    # transfer
    pitching_final.to_sql(name='Pitching', con=engine, if_exists='replace', index=False) 
            
    #reload from db and save to csv
    pitching_final = pd.read_sql_query('select * from "Pitching"', con=engine)
    pitching_final.to_csv('data/PitchingNew.csv', index=False)

In [21]:
pitching_final.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,SO,BAOpp,ERA,team_id
0,abbotpa01,2000,1,SEA,AL,100,0.243,4.22,14
1,aceveju01,2000,1,MIL,NL,51,0.246,3.81,23
2,adamste01,2000,1,LAN,NL,56,0.245,3.52,28
3,aguilri01,2000,1,CHN,NL,38,0.251,4.91,21
4,aldresc01,2000,1,PHI,NL,21,0.284,5.75,19


In [22]:
# ETL Baseball Player Batting
Batting_Final = pd.read_csv('BaseballHits/Batting_Stats.csv')
Batting_Final['team_id'] = -1
Batting_Final

Unnamed: 0,playerid,year,team,Hits,Doubles,Triples,Home_Runs,RBI,team_id
0,aardsda01,2004,SFN,0,0,0,0,0,-1
1,aardsda01,2006,CHN,0,0,0,0,0,-1
2,aardsda01,2007,CHA,0,0,0,0,0,-1
3,aardsda01,2008,BOS,0,0,0,0,0,-1
4,aardsda01,2009,SEA,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...
22067,zumayjo01,2010,DET,0,0,0,0,0,-1
22068,zuninmi01,2013,SEA,37,5,0,5,14,-1
22069,zuninmi01,2014,SEA,87,20,2,22,60,-1
22070,zuninmi01,2015,SEA,61,11,0,11,28,-1


In [23]:
for index, row in Batting_Final.iterrows():
    Batting_Final.loc[index, ['team_id']] = teams_dict.get(row.team)
    
Batting_Final.head()

Unnamed: 0,playerid,year,team,Hits,Doubles,Triples,Home_Runs,RBI,team_id
0,aardsda01,2004,SFN,0,0,0,0,0,30
1,aardsda01,2006,CHN,0,0,0,0,0,21
2,aardsda01,2007,CHA,0,0,0,0,0,6
3,aardsda01,2008,BOS,0,0,0,0,0,2
4,aardsda01,2009,SEA,0,0,0,0,0,14


In [24]:
Batting_Final.to_sql(name='Batting', con=engine, if_exists='replace', index=False)

72

In [25]:
#playerDF
pitching = pd.read_csv('data/pitching_final.csv')
batting = pd.read_csv('BaseballHits/Batting_Stats.csv')
salary = pd.read_csv('data/0519_baseball_reference.csv')

In [26]:
bat = batting.rename(columns={'playerid': 'playerID'})
sal = salary.rename(columns={'playerid': 'playerID'})

In [27]:
players = pd.merge(bat, pitching, on='playerID')
player = pd.merge(sal, players, how='right', on='playerID')

In [28]:
df = pd.DataFrame({
    'PlayerID': player['playerID'],
    "Team": player['team_y'],
    'Year': player['year_y'],
    'Name': player['name']
})



FinalDF = df.dropna()
FinalDF = FinalDF.drop_duplicates()
FinalDF = FinalDF.set_index('PlayerID')
FinalDF.to_csv('data/FinalPlayerDF.csv')

In [29]:
Player_Final = pd.read_csv('data/FinalPlayerDF.csv')
Player_Final['team_id'] = -1
Player_Final

Unnamed: 0,PlayerID,Team,Year,Name,team_id
0,aardsda01,SFN,2004,David Aardsma,-1
1,aardsda01,CHN,2006,David Aardsma,-1
2,aardsda01,CHA,2007,David Aardsma,-1
3,aardsda01,BOS,2008,David Aardsma,-1
4,aardsda01,SEA,2009,David Aardsma,-1
...,...,...,...,...,...
10051,zumayjo01,DET,2006,Joel Zumaya,-1
10052,zumayjo01,DET,2007,Joel Zumaya,-1
10053,zumayjo01,DET,2008,Joel Zumaya,-1
10054,zumayjo01,DET,2009,Joel Zumaya,-1


In [30]:
for index, row in Player_Final.iterrows():
    Player_Final.loc[index, ['team_id']] = teams_dict.get(row.Team)
    
Player_Final.head()

Unnamed: 0,PlayerID,Team,Year,Name,team_id
0,aardsda01,SFN,2004,David Aardsma,30
1,aardsda01,CHN,2006,David Aardsma,21
2,aardsda01,CHA,2007,David Aardsma,6
3,aardsda01,BOS,2008,David Aardsma,2
4,aardsda01,SEA,2009,David Aardsma,14


In [31]:
Player_Final.to_sql(name='Player', con=engine, if_exists='replace', index=False)

56

In [32]:
# ETL Baseball Player Cards

In [33]:
session.close()
engine.dispose()