# Baseball ETL

### Imports

In [1]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import pg_pwd
#from configAB import username, password
import psycopg2
import os

from bs4 import BeautifulSoup as bs
import requests

import pandas as pd
import numpy as np

### Initializations

In [2]:
#db initializations
engine = create_engine(f'postgresql://{pg_pwd}/MoneyBall')

Base = automap_base()
Base.prepare(autoload_with=engine)
session = Session(engine)

### ETL Baseball Teams and Cities

In [3]:
# load from csv if stored
if (os.path.isfile('data/Teams.csv')):
    citiesDF = pd.read_csv('data/Cities.csv')
    teamsDF = pd.read_csv('data/Teams.csv')
else:
    # extract
    url = 'https://www.worldatlas.com/articles/mlb-teams-and-their-cities.html'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    result = soup.find('tbody')
    rows = result.find_all('tr')
    
    # transfer
    City = Base.classes.City 
    Team = Base.classes.Team 
    
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        
        teamcol = cols[1]
        citystatecol = cols[2].split(', ')
        citycol = citystatecol[0]
        statecol = citystatecol[1]

        #load to City table
        city = City(cityname=citycol, statename=statecol)  
        session.add(city)
        session.commit()    
        session.refresh(city)
        
        #load to Team table
        team = Team(teamname=teamcol, cityid=city.cityid)  
        session.add(team)
        session.commit()    

    #reload from db and save to csv
    citiesDF = pd.read_sql_query('select cityid, cityname, statename from "City"', con=engine)
    citiesDF.to_csv('data/Cities.csv', index=False)
    
    teamsDF = pd.read_sql_query('select teamid, teamname, cityid from "Team"', con=engine)
    teamsDF.to_csv('data/Teams.csv', index=False)

#### Team matching dictionary

In [4]:
#put teams in dictionary for lookup
teams_dict = dict(zip(teamsDF.teamname, teamsDF.teamid))

In [5]:
# additions from payroll set
teams_dict.update({'Tampa Bay Devil Rays': 4})
teams_dict.update({'Anaheim Angels': 12})
teams_dict.update({'Los Angeles Angels of Anaheim': 12})
teams_dict.update({'Montreal Expos': 20})
teams_dict.update({'Florida Marlins': 17})

In [6]:
# additions from salary set
teams_dict.update({'ARI': 26})
teams_dict.update({'ATL': 16})
teams_dict.update({'BAL': 1}) 
teams_dict.update({'BOS': 2}) 
teams_dict.update({'CHC': 21}) 
teams_dict.update({'CHW': 6}) 
teams_dict.update({'CIN': 22}) 
teams_dict.update({'CLE': 7}) 
teams_dict.update({'COL': 27})
teams_dict.update({'DET': 8}) 
teams_dict.update({'HOU': 11}) 
teams_dict.update({'KCR': 9}) 
teams_dict.update({'LAA': 12}) 
teams_dict.update({'LAD': 28}) 
teams_dict.update({'MIA': 17}) 
teams_dict.update({'MIL': 23}) 
teams_dict.update({'MIN': 10}) 
teams_dict.update({'NYM': 18})
teams_dict.update({'NYY': 3}) 
teams_dict.update({'OAK': 13}) 
teams_dict.update({'PHI': 19}) 
teams_dict.update({'PIT': 24}) 
teams_dict.update({'SDP': 29}) 
teams_dict.update({'SEA': 14}) 
teams_dict.update({'SFG': 30}) 
teams_dict.update({'STL': 25}) 
teams_dict.update({'TBR': 4})
teams_dict.update({'TEX': 15}) 
teams_dict.update({'TOR': 5}) 
teams_dict.update({'WSN': 20})

### ETL Baseball Payrolls

In [7]:
# load from csv if stored
if (os.path.isfile('data/Payrolls.csv')):
    mlb_payrollDF = pd.read_csv('data/Payrolls.csv')
else:
    # extract
    mlb_payrollDF = pd.DataFrame()

    # combine sets into payroll dataframe
    for yr in range(2000, 2016):
        set_filepath = 'data/payroll_files/mlb_payroll_' + str(yr) + '.csv'
        DF = pd.read_csv(set_filepath)
        DF["year"] = yr
        DF["teamid"] = -1
        DF = DF.drop(columns=['roster'])
        DF.columns=['team_name', 'league', 'division', 'payroll', 'w', 'l', 'wpct', 
                    'rnk', 'lgrk', 'mlbrk', 'last_payroll','top_salary', 'year', 'teamid']
        mlb_payrollDF = pd.concat([mlb_payrollDF, DF])
        
    # transfer, and identify missing team names
    Payroll = Base.classes.Payroll 
    Team = Base.classes.Team 

    for index, row in mlb_payrollDF.iterrows():
        if teams_dict.get(row.team_name, -1) == -1:
            print(row.team_name)
        else:
            mlb_payrollDF.loc[index, ['teamid']] = teams_dict.get(row.team_name)

            #load to Payroll table
            payroll = Payroll(
                teamid = teams_dict.get(row.team_name),
                payroll = row.payroll.replace(',', ''),
                w = row.w, 
                l = row.l, 
                wpct = row.wpct, 
                rnk = row.rnk, 
                lgrk = row.lgrk, 
                mlbrk = row.mlbrk, 
                last_payroll = row.last_payroll.replace(',', ''),
                top_salary = row.top_salary, 
                year = row.year) 

            session.add(payroll)
            session.commit()    
            
            #update Team table with league, division
            session.query(Team).filter(Team.teamid == teams_dict.get(row.team_name)).update(
                {Team.league:row.league}, synchronize_session = False)

            session.query(Team).filter(Team.teamid == teams_dict.get(row.team_name)).update(
                {Team.division:row.division}, synchronize_session = False)

            session.commit()    

    #reload from db and save to csv
    payrollsDF = pd.read_sql_query('select * from "Payroll"', con=engine)
    payrollsDF.to_csv('data/Payrolls.csv', index=False)

The printed teams have had name changes or have moved since 2000.  Add mapping entries for those team to get proper ids.

- Tampa Bay Devil Rays -> Tampa Bay Rays (4)
- Anaheim Angels -> Los Angeles Angels (12)
- Montreal Expos -> Washington Nationals (20)
- Florida Marlins ->  Maimi Marlins (17)
- Los Angeles Angels of Anaheim -> Los Angeles Angels (12)

In [8]:
break
mlb_payrollDF.head()

SyntaxError: 'break' outside loop (2521747283.py, line 1)

In [None]:
# ETL Baseball Players

In [None]:
# ETL Baseball Player Salaries

In [None]:
basebSal= pd.read_csv('data/0519_baseball_reference.csv')
basebSal.head() 

In [None]:
    # transfer, and identify missing salaries
    Salary = Base.classes.Salary 

    for index, row in basebSal.iterrows():
        if teams_dict.get(row.team, -1) == -1:
            print(row.team)
        else:
            basebSal.loc[index, ['teamid']] = teams_dict.get(row.team)

            #load to Salary table
            salary = Salary(
                teamid = teams_dict.get(row.team),
                salary = row.sal, 
                last_salary = row.lastsal,
                year = row.year,
                war = row.war,        
                exp = row.exp,
                playerid= row.playerid) 

            session.add(salary)
            session.commit()  

In [None]:
# ETL Baseball Player Batting

In [None]:
# ETL Baseball Player Pitching
#engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/MoneyBall')
#Base = automap_base()
#Base.prepare(autoload_with=engine)
#session = Session(engine)

In [None]:
pitch_teams = {1: ['BAL'], 2: ['BOS'], 3: ['NYA'], 4: ['TBA'], 5: ['TOR'], 6: ['CHA'], 7: ['CLE'], 8: ['DET'], 9: ['KCA'], 
             10: ['MIN'], 11: ['HOU'], 12: ['LAA', 'ANA'], 13: ['OAK'], 14: ['SEA'], 15: ['TEX'], 16: ['ATL'], 
             17: ['MIA', 'FLO'], 18: ['NYN'], 19: ['PHI'], 20: ['WAS', 'MON'], 21: ['CHN'], 22: ['CIN'], 23: ['MIL'], 
             24: ['PIT'], 25: ['SLN'], 26: ['ARI'], 27: ['COL'], 28: ['LAN'], 29: ['SDN'], 30: ['SFN']}

In [None]:
pitching = pd.read_csv('data/Pitching.csv')
pitching

In [None]:
pitching.info()

In [None]:
pitching.columns

In [None]:
pitching_yr=pitching[pitching['yearID']>=2000]
pitching_yr

In [None]:
pitching_final=pd.DataFrame(pitching_yr.drop(columns=['W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB','IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP']))
pitching_final

In [None]:
pitching_final = pitching_final.set_index('playerID')
pitching_final.head(7)

In [None]:
pitching_final.to_csv('data/pitching_final.csv')

In [None]:
pitching_og = pd.read_sql(sql = 'SELECT * FROM pitching_og;', con = engine)
pitching_og.head()


In [None]:
pitching_final = pd.read_sql(sql = 'SELECT * FROM pitching_final;', con = engine)
pitching_final.head()

In [None]:
# ETL Baseball Player Cards

In [None]:
# ETL Baseball Team Patroll

In [None]:
session.close()
engine.dispose()