In [4]:
import math
import numpy as np
import pandas as pd

In [5]:
# Read the Teams table from the Lahman data directory into the working frame.
teams_csv = '../data/lahman/mlb_data/Teams.csv'
df = pd.read_csv(teams_csv)

In [6]:
# Show every column label in the table so the available ID fields are visible.
df.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro'],
      dtype='object')

In [7]:
# Preview the first rows (head() defaults to five) to see sample values.
df.head()

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1919,AL,BOS,BOS,,6,138,66,66,71,...,118,0.975,Boston Red Sox,Fenway Park I,417291,94,94,BOS,BOS,BOS
1,1919,NL,BRO,LAD,,5,141,70,69,71,...,84,0.963,Brooklyn Robins,Ebbets Field,360721,103,103,BRO,BRO,BRO
2,1919,NL,BSN,ATL,,6,140,68,57,82,...,111,0.966,Boston Braves,Braves Field,167401,95,98,BSN,BSN,BSN
3,1919,AL,CHA,CHW,,1,140,70,88,52,...,116,0.969,Chicago White Sox,Comiskey Park,627186,100,99,CHW,CHA,CHA
4,1919,NL,CHN,CHC,,3,140,71,75,65,...,87,0.969,Chicago Cubs,Wrigley Field,424430,100,99,CHC,CHN,CHN


In [8]:
# Discard the two alternate team-ID columns; teamID and teamIDretro stay so
# they can be compared in the cells that follow.
# errors='ignore' keeps this cell safe to re-run: it reassigns `df`, so on a
# second execution the columns are already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['teamIDlahman45', 'teamIDBR'], errors='ignore')

<p>The first step is to ensure we're only using one ID per team. It would be best to just use Retrosheet's values, so we start by checking where teamID differs from teamIDretro. Once we come up with a way to fix these differences, we'll want to write it as a script that we can use elsewhere - for example, in the batting table, which uses the regular teamID values.</p>

In [9]:
# Rows whose Lahman teamID disagrees with the Retrosheet ID, trimmed to the
# columns needed to identify the team and season.
df.loc[
    df['teamID'] != df['teamIDretro'],
    ['yearID', 'teamID', 'teamIDretro', 'name'],
]

Unnamed: 0,yearID,teamID,teamIDretro,name
551,1953,ML1,MLN,Milwaukee Braves
568,1954,ML1,MLN,Milwaukee Braves
585,1955,ML1,MLN,Milwaukee Braves
601,1956,ML1,MLN,Milwaukee Braves
617,1957,ML1,MLN,Milwaukee Braves
633,1958,ML1,MLN,Milwaukee Braves
649,1959,ML1,MLN,Milwaukee Braves
665,1960,ML1,MLN,Milwaukee Braves
683,1961,ML1,MLN,Milwaukee Braves
702,1962,ML1,MLN,Milwaukee Braves


<p>So we have three team IDs (ML1, ML4 and LAA) where teamID and teamIDretro differ. We need to ask a few questions though:</p>
<ul>
    <li>Do the two IDs differ for those teams in every season? We can't just take that for granted.</li>
</ul>

In [10]:
# How many seasons the ANA franchise spent under each teamID.
df.loc[df['franchID'] == 'ANA', 'teamID'].value_counts()

CAL    32
LAA    18
ANA     8
Name: teamID, dtype: int64

In [11]:
# Number of rows where teamID and teamIDretro disagree (True counts as 1).
int(df['teamID'].ne(df['teamIDretro']).sum())

55

In [12]:
# Total rows carrying any of the three team IDs under investigation.
int(df['teamID'].isin(['ML1', 'ML4', 'LAA']).sum())

59

<p>Unfortunately the two counts differ by 4 (59 rows for those three team IDs versus 55 mismatched rows), so we need to find out where those 4 rows are.</p>

In [13]:
# ML1 rows where the Retrosheet ID is also ML1, i.e. the two IDs agree.
df.loc[df['teamID'].eq('ML1') & df['teamIDretro'].eq('ML1')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDretro


In [14]:
# ML4 rows where the Retrosheet ID is also ML4, i.e. the two IDs agree.
df.loc[df['teamID'].eq('ML4') & df['teamIDretro'].eq('ML4')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDretro


In [15]:
# LAA rows where the Retrosheet ID is also LAA, i.e. the two IDs agree.
df.loc[df['teamID'].eq('LAA') & df['teamIDretro'].eq('LAA')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDretro
680,1961,AL,LAA,ANA,,8,162,82,70,91,...,973,192,154,0.969,Los Angeles Angels,Wrigley Field (LA),603510,111,112,LAA
699,1962,AL,LAA,ANA,,3,162,81,86,76,...,858,175,153,0.972,Los Angeles Angels,Dodger Stadium,1144063,97,97,LAA
719,1963,AL,LAA,ANA,,9,161,81,70,91,...,889,163,155,0.974,Los Angeles Angels,Dodger Stadium,821015,94,94,LAA
739,1964,AL,LAA,ANA,,5,162,81,82,80,...,965,138,168,0.978,Los Angeles Angels,Dodger Stadium,760439,90,90,LAA


In [16]:
# LAA rows where the Retrosheet ID is something other than LAA.
df.loc[df['teamID'].eq('LAA') & df['teamIDretro'].ne('LAA')]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDretro
1801,2005,AL,LAA,ANA,W,1,162,81,95,67,...,1126,87,139,0.986,Los Angeles Angels of Anaheim,Angel Stadium,3404686,98,97,ANA
1831,2006,AL,LAA,ANA,W,2,162,81,89,73,...,1164,124,154,0.979,Los Angeles Angels of Anaheim,Angel Stadium,3406790,100,100,ANA
1861,2007,AL,LAA,ANA,W,1,162,81,94,68,...,1156,101,154,0.983,Los Angeles Angels of Anaheim,Angel Stadium,3365632,101,100,ANA
1891,2008,AL,LAA,ANA,W,1,162,81,100,62,...,1106,91,159,0.985,Los Angeles Angels of Anaheim,Angel Stadium,3336747,103,102,ANA
1921,2009,AL,LAA,ANA,W,1,162,81,97,65,...,1062,85,174,0.986,Los Angeles Angels of Anaheim,Angel Stadium,3240386,99,98,ANA
1951,2010,AL,LAA,ANA,W,3,162,81,80,82,...,1130,113,116,0.981,Los Angeles Angels of Anaheim,Angel Stadium,3250816,98,98,ANA
1981,2011,AL,LAA,ANA,W,2,162,81,86,76,...,1058,93,157,0.985,Los Angeles Angels of Anaheim,Angel Stadium,3166321,93,93,ANA
2010,2012,AL,LAA,ANA,W,3,162,81,89,73,...,1157,98,141,0.984,Los Angeles Angels of Anaheim,Angel Stadium of Anaheim,3061770,92,92,ANA
2040,2013,AL,LAA,ANA,W,3,162,81,78,84,...,1200,112,135,0.981,Los Angeles Angels of Anaheim,Angel Stadium of Anaheim,3019505,94,94,ANA
2070,2014,AL,LAA,ANA,W,1,162,81,98,64,...,1342,83,127,0.986,Los Angeles Angels of Anaheim,Angel Stadium of Anaheim,3095935,96,95,ANA


In [17]:
# Distinct franchise IDs, in order of first appearance.
pd.unique(df['franchID'])

array(['BOS', 'LAD', 'ATL', 'CHW', 'CHC', 'CIN', 'CLE', 'DET', 'SFG',
       'NYY', 'OAK', 'PHI', 'PIT', 'BAL', 'STL', 'MIN', 'ANA', 'TEX',
       'HOU', 'NYM', 'KCR', 'WSN', 'SDP', 'MIL', 'SEA', 'TOR', 'COL',
       'FLA', 'ARI', 'TBD'], dtype=object)

In [18]:
# Any rows lacking a franchise ID? An empty frame means every row has one.
df.loc[df['franchID'].isna()]

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,SOA,E,DP,FP,name,park,attendance,BPF,PPF,teamIDretro


In [19]:
# Count of distinct non-null franchise IDs.
len(df['franchID'].dropna().unique())

30

<p>It looks like it will be easiest to just use the franchise ID - the values stay consistent throughout, none are missing, and there are only 30 distinct values. We'll need a way to map to these values from an external script so we can use it in other files.</p>