In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install bs4 
from bs4 import BeautifulSoup
!pip install lxml
import requests
from datetime import datetime
import seaborn as sb
from time import sleep
import ast

In [None]:
def correctDateTime(date):
  """Convert a schedule heading such as 'Thursday, March 28, 2019' to a date.

  The first space-separated token (the weekday) is discarded. The remaining
  text supplies the month name, the day number (its comma removed) and the
  year (the last token), which are reassembled as 'day month, year' and parsed.
  """
  remainder = date.split(' ', 1)[1]
  tokens = remainder.split()
  monthName = tokens[0]
  dayNumber = tokens[1].replace(',', '')
  yearText = tokens[-1]
  reordered = "{} {}, {}".format(dayNumber, monthName, yearText)
  return datetime.strptime(reordered, "%d %B, %Y").date()

# find most recent prior non-null cell (most recent non-off day)
def findMostRecentGameIndex(nullList, index, teamName):
  """Return the closest row position before `index` whose `teamName` cell is not null.

  `nullList` is the boolean null-mask of the results frame. The search walks
  backwards one row at a time starting at `index - 1`.
  """
  candidate = index - 1
  # keep stepping back while the team had no game recorded on that row
  while nullList.iloc[candidate][teamName] == True:
    candidate -= 1
  return candidate

def doubleHeaderWinOrLoss(gameResult, dataframe, index, teamName):
  """Record the result of an extra double-header game in a team's record dict.

  Parameters:
    gameResult: result text whose first character is 'W' or 'L', followed by
      the score written as '<runs for>-<runs against>', e.g. 'W,5-3'.
    dataframe: frame whose cells hold per-game dicts; the dict at row position
      `index`, column `teamName` is updated in place.
    index: integer row position of the game day.
    teamName: column label of the team.

  Returns:
    `dataframe` on success, or the string 'error' when the result is neither a
    win nor a loss (a tie) or the score cannot be parsed. Nothing is written
    when 'error' is returned.

  The keys written are 'win/loss(1)' (1 or 0), 'runs_for(1)' and
  'runs_against(1)'. The run totals are stored as strings, exactly as they
  appear in `gameResult`.
  """
  outcomes = {'W': 1, 'L': 0}
  if not gameResult or gameResult[0] not in outcomes:
    return 'error'

  # The score follows the first comma. Fall back to the fixed two-character
  # prefix when no comma is present.
  _, comma, score = gameResult.partition(',')
  if not comma:
    score = gameResult[2:]

  runsFor, dash, runsAgainst = score.partition('-')
  if not dash or not runsFor or not runsAgainst:
    # malformed score text: report it through the existing sentinel instead of
    # raising IndexError part-way through the update
    return 'error'

  record = dataframe.iloc[index][teamName]
  record['win/loss(1)'] = outcomes[gameResult[0]]
  record['runs_for(1)'] = runsFor
  record['runs_against(1)'] = runsAgainst
  return dataframe

Extract Results of Every MLB Game for the Past 20 Years

In [None]:
# Web Scraping
# Download and parse the league schedule page for each season from 2000 to 2019.
years = np.linspace(2000, 2019, 20)
years_strings = {str(int(year)): '' for year in years}
# html[i] holds the raw page text and soup[i] the parsed page for years[i]
html = ['']*20
soup = ['']*20

for i in range(20):
  baseball_reference = 'https://www.baseball-reference.com/leagues/MLB/' + str(int(years[i])) + '-schedule.shtml'
  response = requests.get(baseball_reference, timeout=30)
  # Stop here on an HTTP error (for example a rate-limit response) rather than
  # parsing an error page and failing later when the schedule div is missing.
  response.raise_for_status()
  html[i] = response.text
  soup[i] = BeautifulSoup(html[i], 'lxml')

In [None]:
# list to hold the names of all 30 MLB teams
columns = []
# the last scraped season's schedule is scanned until 30 distinct names are found
schedule = soup[-1].find('div', class_='section_content')
gamedays = schedule.find_all('div')
for gameday in gamedays:
  games = gameday.find_all('p', class_='game')
  for game in games:
    awayTeam = game.find('a').text
    # keep only the first line of the <strong> text and drop one leading space
    homeTeam = game.find('strong').text.partition('\n')[0]
    if homeTeam[0] == ' ':
      homeTeam = homeTeam[1:]
    for team in (awayTeam, homeTeam):
      if team not in columns:
        columns.append(team)
  if len(columns) == 30:
    break

# every distinct game date across all 20 seasons, in the order encountered
gamedates = []
for season in range(20):
  schedule = soup[season].find('div', class_='section_content')
  gamedays = schedule.find_all('div')
  for gameday in gamedays:
    date = correctDateTime(gameday.find('h3').text)
    if date not in gamedates:
      gamedates.append(date)

In [None]:
# account for team name changes
def correctTeamName(teamName):
  """Map a former franchise name to the name used for the dataframe columns.

  Names that are not listed are returned unchanged.
  """
  renamed = {
    'Montreal Expos': 'Washington Nationals',
    'Tampa Bay Devil Rays': 'Tampa Bay Rays',
    'Anaheim Angels': 'Los Angeles Angels',
    'LA Angels of Anaheim': 'Los Angeles Angels',
    'Florida Marlins': 'Miami Marlins',
  }
  return renamed.get(teamName, teamName)

In [None]:
# create dataframe with all 30 MLB teams in the columns and game dates as row indices
gameResults = pd.DataFrame(columns=columns, index=gamedates)


def _parseRuns(scoreText):
  """Convert a schedule score fragment such as ' (5)' to the integer 5."""
  stripped = scoreText.replace('(', '').replace(' ', '')
  # the one character left after the digits is the closing parenthesis
  return int(stripped[:-1])


# Fill one cell per team per game day with a dict describing that team's game.
# Only the first game a team plays on a date is stored here; any later game on
# the same date (a double header) is skipped by the `visited` check.
for season in range(20):
  schedule = soup[season].find('div', class_='section_content')
  gamedays = schedule.find_all('div')
  for gameday in gamedays:
    date = correctDateTime(gameday.find('h3').text)
    games = gameday.find_all('p', class_='game')
    visited = []
    for game in games:
      # results of each game
      teams = game.find_all('a')
      awayTeam = teams[0].text
      homeTeam = teams[1].text
      scoreLines = game.text.splitlines()
      awayRuns = scoreLines[2]
      homeRuns = scoreLines[5]

      if homeTeam in visited or awayTeam in visited:
        continue
      visited.append(homeTeam)
      visited.append(awayTeam)

      # account for team name changes
      homeTeam = correctTeamName(homeTeam)
      awayTeam = correctTeamName(awayTeam)

      awayRuns = _parseRuns(awayRuns)
      homeRuns = _parseRuns(homeRuns)

      # tied scores are not recorded
      if homeRuns == awayRuns:
        continue

      # Each record is written with a single .at call. The previous chained
      # form gameResults.loc[date][team] = ... only worked when the
      # intermediate row happened to be a view of the frame.
      # 'win/loss' is 1 for a win and 0 for a loss; 'home/away' is 1 for home.
      gameResults.at[date, homeTeam] = {
        'runs_for': homeRuns,
        'runs_against': awayRuns,
        'win/loss': int(homeRuns > awayRuns),
        'home/away': 1,
        'opponent': awayTeam,
      }
      # NOTE(review): only the home record carries an 'opponent' key; that is
      # kept as in the original -- confirm whether the away record needs one.
      gameResults.at[date, awayTeam] = {
        'runs_for': awayRuns,
        'runs_against': homeRuns,
        'win/loss': int(awayRuns > homeRuns),
        'home/away': 0,
      }

gameResults.tail()

Unnamed: 0,Seattle Mariners,Pittsburgh Pirates,Cincinnati Reds,Chicago White Sox,Kansas City Royals,Arizona D'Backs,Los Angeles Dodgers,Colorado Rockies,St. Louis Cardinals,Milwaukee Brewers,Cleveland Indians,Minnesota Twins,Baltimore Orioles,New York Yankees,Los Angeles Angels,Oakland Athletics,Atlanta Braves,Philadelphia Phillies,San Francisco Giants,San Diego Padres,Boston Red Sox,Houston Astros,Chicago Cubs,Detroit Tigers,New York Mets,Tampa Bay Rays,Toronto Blue Jays,Miami Marlins,Texas Rangers,Washington Nationals
2019-09-25,"{'runs_for': 0, 'runs_against': 3, 'win/loss':...","{'runs_for': 4, 'runs_against': 2, 'win/loss':...","{'runs_for': 2, 'runs_against': 9, 'win/loss':...","{'runs_for': 8, 'runs_against': 3, 'win/loss':...","{'runs_for': 2, 'runs_against': 10, 'win/loss'...","{'runs_for': 9, 'runs_against': 7, 'win/loss':...","{'runs_for': 6, 'runs_against': 4, 'win/loss':...","{'runs_for': 1, 'runs_against': 2, 'win/loss':...","{'runs_for': 7, 'runs_against': 9, 'win/loss':...","{'runs_for': 9, 'runs_against': 2, 'win/loss':...","{'runs_for': 3, 'runs_against': 8, 'win/loss':...","{'runs_for': 5, 'runs_against': 1, 'win/loss':...","{'runs_for': 2, 'runs_against': 3, 'win/loss':...","{'runs_for': 0, 'runs_against': 4, 'win/loss':...","{'runs_for': 2, 'runs_against': 3, 'win/loss':...","{'runs_for': 3, 'runs_against': 2, 'win/loss':...","{'runs_for': 10, 'runs_against': 2, 'win/loss'...","{'runs_for': 2, 'runs_against': 5, 'win/loss':...","{'runs_for': 2, 'runs_against': 1, 'win/loss':...","{'runs_for': 4, 'runs_against': 6, 'win/loss':...","{'runs_for': 10, 'runs_against': 3, 'win/loss'...","{'runs_for': 3, 'runs_against': 0, 'win/loss':...","{'runs_for': 2, 'runs_against': 4, 'win/loss':...","{'runs_for': 1, 'runs_against': 5, 'win/loss':...","{'runs_for': 10, 'runs_against': 3, 'win/loss'...","{'runs_for': 4, 'runs_against': 0, 'win/loss':...","{'runs_for': 3, 'runs_against': 2, 'win/loss':...","{'runs_for': 3, 'runs_against': 10, 'win/loss'...","{'runs_for': 3, 'runs_against': 10, 'win/loss'...","{'runs_for': 5, 'runs_against': 2, 'win/loss':..."
2019-09-26,"{'runs_for': 1, 'runs_against': 3, 'win/loss':...","{'runs_for': 9, 'runs_against': 5, 'win/loss':...","{'runs_for': 3, 'runs_against': 5, 'win/loss':...","{'runs_for': 8, 'runs_against': 0, 'win/loss':...",,,"{'runs_for': 1, 'runs_against': 0, 'win/loss':...","{'runs_for': 3, 'runs_against': 8, 'win/loss':...",,"{'runs_for': 5, 'runs_against': 3, 'win/loss':...","{'runs_for': 0, 'runs_against': 8, 'win/loss':...","{'runs_for': 10, 'runs_against': 4, 'win/loss'...",,,"{'runs_for': 4, 'runs_against': 3, 'win/loss':...","{'runs_for': 3, 'runs_against': 1, 'win/loss':...",,"{'runs_for': 3, 'runs_against': 6, 'win/loss':...","{'runs_for': 8, 'runs_against': 3, 'win/loss':...","{'runs_for': 0, 'runs_against': 1, 'win/loss':...","{'runs_for': 5, 'runs_against': 7, 'win/loss':...","{'runs_for': 3, 'runs_against': 4, 'win/loss':...","{'runs_for': 5, 'runs_against': 9, 'win/loss':...","{'runs_for': 4, 'runs_against': 10, 'win/loss'...","{'runs_for': 2, 'runs_against': 4, 'win/loss':...",,,"{'runs_for': 4, 'runs_against': 2, 'win/loss':...","{'runs_for': 7, 'runs_against': 5, 'win/loss':...","{'runs_for': 6, 'runs_against': 3, 'win/loss':..."
2019-09-27,"{'runs_for': 4, 'runs_against': 3, 'win/loss':...","{'runs_for': 6, 'runs_against': 5, 'win/loss':...","{'runs_for': 5, 'runs_against': 6, 'win/loss':...",,"{'runs_for': 2, 'runs_against': 6, 'win/loss':...","{'runs_for': 6, 'runs_against': 3, 'win/loss':...","{'runs_for': 9, 'runs_against': 2, 'win/loss':...","{'runs_for': 11, 'runs_against': 7, 'win/loss'...","{'runs_for': 2, 'runs_against': 8, 'win/loss':...","{'runs_for': 7, 'runs_against': 11, 'win/loss'...","{'runs_for': 2, 'runs_against': 8, 'win/loss':...","{'runs_for': 6, 'runs_against': 2, 'win/loss':...","{'runs_for': 4, 'runs_against': 1, 'win/loss':...","{'runs_for': 14, 'runs_against': 7, 'win/loss'...","{'runs_for': 0, 'runs_against': 4, 'win/loss':...","{'runs_for': 3, 'runs_against': 4, 'win/loss':...","{'runs_for': 2, 'runs_against': 4, 'win/loss':...","{'runs_for': 5, 'runs_against': 4, 'win/loss':...","{'runs_for': 2, 'runs_against': 9, 'win/loss':...","{'runs_for': 3, 'runs_against': 6, 'win/loss':...","{'runs_for': 1, 'runs_against': 4, 'win/loss':...","{'runs_for': 4, 'runs_against': 0, 'win/loss':...","{'runs_for': 8, 'runs_against': 2, 'win/loss':...",,"{'runs_for': 4, 'runs_against': 2, 'win/loss':...","{'runs_for': 6, 'runs_against': 2, 'win/loss':...","{'runs_for': 2, 'runs_against': 6, 'win/loss':...","{'runs_for': 4, 'runs_against': 5, 'win/loss':...","{'runs_for': 7, 'runs_against': 14, 'win/loss'...","{'runs_for': 8, 'runs_against': 2, 'win/loss':..."
2019-09-28,"{'runs_for': 0, 'runs_against': 1, 'win/loss':...","{'runs_for': 2, 'runs_against': 4, 'win/loss':...","{'runs_for': 4, 'runs_against': 2, 'win/loss':...","{'runs_for': 7, 'runs_against': 1, 'win/loss':...","{'runs_for': 3, 'runs_against': 4, 'win/loss':...","{'runs_for': 6, 'runs_against': 5, 'win/loss':...","{'runs_for': 2, 'runs_against': 0, 'win/loss':...","{'runs_for': 3, 'runs_against': 2, 'win/loss':...","{'runs_for': 6, 'runs_against': 8, 'win/loss':...","{'runs_for': 2, 'runs_against': 3, 'win/loss':...","{'runs_for': 7, 'runs_against': 10, 'win/loss'...","{'runs_for': 4, 'runs_against': 3, 'win/loss':...","{'runs_for': 9, 'runs_against': 4, 'win/loss':...","{'runs_for': 4, 'runs_against': 9, 'win/loss':...","{'runs_for': 3, 'runs_against': 6, 'win/loss':...","{'runs_for': 1, 'runs_against': 0, 'win/loss':...","{'runs_for': 0, 'runs_against': 3, 'win/loss':...","{'runs_for': 9, 'runs_against': 3, 'win/loss':...","{'runs_for': 0, 'runs_against': 2, 'win/loss':...","{'runs_for': 5, 'runs_against': 6, 'win/loss':...","{'runs_for': 4, 'runs_against': 9, 'win/loss':...","{'runs_for': 6, 'runs_against': 3, 'win/loss':...","{'runs_for': 8, 'runs_against': 6, 'win/loss':...","{'runs_for': 1, 'runs_against': 7, 'win/loss':...","{'runs_for': 3, 'runs_against': 0, 'win/loss':...","{'runs_for': 1, 'runs_against': 4, 'win/loss':...","{'runs_for': 4, 'runs_against': 1, 'win/loss':...","{'runs_for': 3, 'runs_against': 9, 'win/loss':...","{'runs_for': 9, 'runs_against': 4, 'win/loss':...","{'runs_for': 10, 'runs_against': 7, 'win/loss'..."
2019-09-29,"{'runs_for': 3, 'runs_against': 1, 'win/loss':...","{'runs_for': 1, 'runs_against': 3, 'win/loss':...","{'runs_for': 3, 'runs_against': 1, 'win/loss':...","{'runs_for': 5, 'runs_against': 3, 'win/loss':...","{'runs_for': 5, 'runs_against': 4, 'win/loss':...","{'runs_for': 1, 'runs_against': 0, 'win/loss':...","{'runs_for': 9, 'runs_against': 0, 'win/loss':...","{'runs_for': 4, 'runs_against': 3, 'win/loss':...","{'runs_for': 9, 'runs_against': 0, 'win/loss':...","{'runs_for': 3, 'runs_against': 4, 'win/loss':...","{'runs_for': 2, 'runs_against': 8, 'win/loss':...","{'runs_for': 4, 'runs_against': 5, 'win/loss':...","{'runs_for': 4, 'runs_against': 5, 'win/loss':...","{'runs_for': 1, 'runs_against': 6, 'win/loss':...","{'runs_for': 5, 'runs_against': 8, 'win/loss':...","{'runs_for': 1, 'runs_against': 3, 'win/loss':...","{'runs_for': 6, 'runs_against': 7, 'win/loss':...","{'runs_for': 3, 'runs_against': 4, 'win/loss':...","{'runs_for': 0, 'runs_against': 9, 'win/loss':...","{'runs_for': 0, 'runs_against': 1, 'win/loss':...","{'runs_for': 5, 'runs_against': 4, 'win/loss':...","{'runs_for': 8, 'runs_against': 5, 'win/loss':...","{'runs_for': 0, 'runs_against': 9, 'win/loss':...","{'runs_for': 3, 'runs_against': 5, 'win/loss':...","{'runs_for': 7, 'runs_against': 6, 'win/loss':...","{'runs_for': 3, 'runs_against': 8, 'win/loss':...","{'runs_for': 8, 'runs_against': 3, 'win/loss':...","{'runs_for': 4, 'runs_against': 3, 'win/loss':...","{'runs_for': 6, 'runs_against': 1, 'win/loss':...","{'runs_for': 8, 'runs_against': 2, 'win/loss':..."


In [None]:
# row positions in gameResults at which each season starts: position 0, plus
# every position whose date falls in a later year than the row before it
dates = gameResults.index.tolist()
new_season = [0] + [
  pos for pos in range(1, len(dates))
  if dates[pos].year > dates[pos-1].year
]
print(new_season)

[0, 181, 362, 542, 722, 904, 1084, 1264, 1445, 1629, 1810, 1990, 2169, 2350, 2530, 2711, 2890, 3069, 3248, 3432]


In [None]:
# Replace gameResults with a copy of `data`.
# NOTE(review): no earlier cell shown here assigns `data`, so on a fresh kernel
# this line raises NameError unless a later cell (e.g. the CSV reload) was run
# first -- confirm the intended execution order.
gameResults = data.copy()

Extract At-bats, Hits, Walks, and Other Relevant Stats From Each Game

In [None]:
# Boolean mask of the cells with no game recorded (team did not play that day).
nullList = gameResults.isnull()
# Every recorded game starts out flagged as not being part of a double header.
# .iat addresses cells by (row position, column position); the previous
# nullList.iloc[i][j] looked up an integer in a row labelled by team names,
# which depends on a deprecated positional fallback.
for i in range(len(gameResults)):
  for j in range(len(gameResults.columns)):
    if not nullList.iat[i, j]:
      gameResults.iat[i, j]['double_header'] = False

# For every season, walk the schedule until each team in `columns` has been seen
# once as the second-listed ("home") team of a game. For that team, follow the
# box score -> team page -> batting game log links and copy the per-game batting
# counts (AB, H, BB, HBP, SF, 2B, 3B, HR) into the team's record dicts in
# gameResults. The extra game of a double header is stored in the same dict
# under keys with a '(1)' suffix.
for s in range(20):
  # teams whose game log has not been processed yet for this season
  unvisited = columns.copy()
  schedule = soup[s].find('div', class_='section_content')
  gamedays = schedule.find_all('div')
  for gameday in gamedays:
    games = gameday.find_all('p', class_='game')
    for game in games: 
      teams = game.find_all('a')
      awayTeam = teams[0].text
      homeTeam = teams[1].text
      # NOTE(review): awayRuns / homeRuns are read here but not used again in this cell
      awayRuns = game.text.splitlines()[2]
      homeRuns = game.text.splitlines()[5]
    
      # account for team name changes
      homeTeam = correctTeamName(homeTeam)
      awayTeam = correctTeamName(awayTeam)

      if homeTeam in unvisited:
        # the box score link is the anchor inside the game's <em> element
        em = game.find('em')
        boxScore = em.find('a')
        boxScore = boxScore.get('href')
        boxScore = "http://baseball-reference.com" + boxScore
        # retry forever, pausing 10 seconds after any exception
        # NOTE(review): the bare except also swallows KeyboardInterrupt
        while True:
          try:
            boxLink = requests.get(boxScore).text
            break
          except:
            sleep(10)
            continue
        boxLinkSoup = BeautifulSoup(boxLink, 'lxml')
        link = boxLinkSoup.find('div', {'id': 'bottom_nav_container'})
        links = link.find_all('li')
        # assumes the 2nd and 3rd <li> of the bottom nav are the home and away
        # team pages -- TODO confirm against the page layout
        homeTeamLink = links[1]
        # NOTE(review): awayTeamLink is not used again in this cell
        awayTeamLink = links[2]

        # go to home team's schedule
        homeTeamLink = homeTeamLink.find('a')
        homeTeamLink = homeTeamLink.get('href')
        homeTeamLink = "http://baseball-reference.com" + homeTeamLink
        while True:
          try:
            homeLink = requests.get(homeTeamLink).text
            break
          except:
            sleep(10)
            continue
        homeLinkSoup = BeautifulSoup(homeLink, 'lxml')
        # find its (batting) game logs link
        homeGameBattingLogs = homeLinkSoup.find('div', {'id': 'bottom_nav_container'})
        homeGameBattingLogs = homeGameBattingLogs.find_all('li')
        homeGameBattingLogs = homeGameBattingLogs[1].find('a')
        homeGameBattingLogs = homeGameBattingLogs.get('href')
        homeGameBattingLogs = "http://baseball-reference.com" + homeGameBattingLogs
        # go to its game logs link
        while True:
          try:
            homeBatting = requests.get(homeGameBattingLogs).text
            break
          except:
            sleep(10)
            continue
        homeBattingSoup = BeautifulSoup(homeBatting, 'lxml')
        # the first <table> on the page is taken as the game log
        homeBattingLog = homeBattingSoup.find('table')
        rows = homeBattingLog.find_all('tr')

        # create variables to store the amount of rows in the game logs that are
        # games and not labels (k) and the homeTeam's game # for traversing the dataframe (j)
        k = 0
        # start at the first row of this season in gameResults
        j = new_season[s]
        # lever is True while the previous game row was the first game of a double header
        lever = False
        # loop through all of the rows in the game log
        for i in range(len(rows)):
          string = str(rows[i])
          row = rows[i]
          # if the row is a game, its fifth character will start with an 'i' to indicate
          # that it has some id. This is a pattern within the website's html code
          if string[4:5].strip() == 'i':
            # if prev row was double header
            if lever == True:
              # doubleHeaderWinOrLoss returns a string for a result that is not a
              # win or loss; it is called once as a check and once more to apply
              # the update (the first call already writes into the dict)
              if isinstance(doubleHeaderWinOrLoss(row.find('td', {'data-stat': 'game_result'}).text, gameResults, j, homeTeam), str) == False:
                gameResults.iloc[j][homeTeam]['double_header'] = True
                gameResults = doubleHeaderWinOrLoss(row.find('td', {'data-stat': 'game_result'}).text, gameResults, j, homeTeam)
                gameResults.iloc[j][homeTeam]['AB(1)'] = int(row.find('td', {'data-stat': 'AB'}).text)
                gameResults.iloc[j][homeTeam]['H(1)'] = int(row.find('td', {'data-stat': 'H'}).text)
                gameResults.iloc[j][homeTeam]['BB(1)'] = int(row.find('td', {'data-stat': 'BB'}).text)
                gameResults.iloc[j][homeTeam]['HBP(1)'] = int(row.find('td', {'data-stat': 'HBP'}).text)
                gameResults.iloc[j][homeTeam]['SF(1)'] = int(row.find('td', {'data-stat': 'SF'}).text)
                gameResults.iloc[j][homeTeam]['2B(1)'] = int(row.find('td', {'data-stat': '2B'}).text)
                gameResults.iloc[j][homeTeam]['3B(1)'] = int(row.find('td', {'data-stat': '3B'}).text)
                gameResults.iloc[j][homeTeam]['HR(1)'] = int(row.find('td', {'data-stat': 'HR'}).text)
              # the double-header date is finished: move on to the next dataframe row
              lever = False
              j += 1            
            # only rows whose result starts with 'W' or 'L' are processed
            if row.find('td', {'data-stat': 'game_result'}).text[0] == 'W' or row.find('td', {'data-stat': 'game_result'}).text[0] == 'L':
              k += 1 
              # skip dataframe rows on which this team has no recorded game
              while nullList.iloc[j][homeTeam] == True:
                j += 1
                if j >= len(nullList):
                  break
              if j >= len(nullList):
                break
              # NOTE(review): `previous` / `prevString` are computed below but not
              # read by anything else in this cell
              if k == 1:
                # dummy value (no previous game)
                  previous = rows[i]
              elif k > 1:
                previous = rows[i-1]
                prevString = str(previous)
                if k > 2:
                  if prevString[4:5].strip() != 'i':
                    previous = rows[i-2]

              # date text ending in '1)' marks the first game of a double header:
              # store its stats but keep j on this row for the following game
              if row.find('td', {'data-stat': 'date_game'}).text[-2:] == '1)':         
                lever = True
                gameResults.iloc[j][homeTeam]['AB'] = int(row.find('td', {'data-stat': 'AB'}).text)
                gameResults.iloc[j][homeTeam]['H'] = int(row.find('td', {'data-stat': 'H'}).text)
                gameResults.iloc[j][homeTeam]['BB'] = int(row.find('td', {'data-stat': 'BB'}).text)
                gameResults.iloc[j][homeTeam]['HBP'] = int(row.find('td', {'data-stat': 'HBP'}).text)
                gameResults.iloc[j][homeTeam]['SF'] = int(row.find('td', {'data-stat': 'SF'}).text)
                gameResults.iloc[j][homeTeam]['2B'] = int(row.find('td', {'data-stat': '2B'}).text)
                gameResults.iloc[j][homeTeam]['3B'] = int(row.find('td', {'data-stat': '3B'}).text)
                gameResults.iloc[j][homeTeam]['HR'] = int(row.find('td', {'data-stat': 'HR'}).text)
                
              # date text not ending in ')' is an ordinary single game
              elif row.find('td', {'data-stat': 'date_game'}).text[-1] != ')':
                gameResults.iloc[j][homeTeam]['AB'] = int(row.find('td', {'data-stat': 'AB'}).text)
                gameResults.iloc[j][homeTeam]['H'] = int(row.find('td', {'data-stat': 'H'}).text)
                gameResults.iloc[j][homeTeam]['BB'] = int(row.find('td', {'data-stat': 'BB'}).text)
                gameResults.iloc[j][homeTeam]['HBP'] = int(row.find('td', {'data-stat': 'HBP'}).text)
                gameResults.iloc[j][homeTeam]['SF'] = int(row.find('td', {'data-stat': 'SF'}).text)
                gameResults.iloc[j][homeTeam]['2B'] = int(row.find('td', {'data-stat': '2B'}).text)
                gameResults.iloc[j][homeTeam]['3B'] = int(row.find('td', {'data-stat': '3B'}).text)
                gameResults.iloc[j][homeTeam]['HR'] = int(row.find('td', {'data-stat': 'HR'}).text)

                j += 1

          # stop once the dataframe has been exhausted
          if j >= len(nullList):
              break
    
        unvisited.remove(homeTeam)

    # every team's log has been processed for this season
    if len(unvisited) == 0:
      break

In [None]:
# Write the `data` frame to disk.
# NOTE(review): the preceding cell updates the dicts through gameResults;
# presumably those dicts are shared with `data` because DataFrame.copy() does not
# copy the Python objects held in cells -- confirm that `data` is the frame
# meant to be saved here.
data.to_csv('data(1).csv')

In [None]:
# Spot check: show the record dict stored for one team on the last row.
print(gameResults.iloc[-1]['Los Angeles Dodgers'])

{'runs_for': 9, 'runs_against': 0, 'win/loss': 1, 'home/away': 0, 'double_header': False, 'AB': 37, 'H': 12, 'BB': 9, 'HBP': 2, 'SF': 0, '2B': 1, '3B': 0, 'HR': 1}


Web Scrape for Starting Pitchers' Earned Run Averages Leading up to Each Game

In [None]:
# For every game on every season's schedule, open the box score, follow the first
# link found in each of two table bodies to a pitcher's page (presumably the two
# starting pitchers -- confirm against the box score layout), open that
# pitcher's game log for the season and copy the ERA from the log row preceding
# the game into the home team's record dict in `data`:
#   'opposing_pitcher_ERA' comes from the away pitcher's log,
#   'pitcher_ERA'          comes from the home pitcher's log.
# `boolean` is set when a log row cannot be parsed and makes all loops stop.
# NOTE(review): `data` and `nullList` are not created in this cell; they must
# already exist in the kernel.
boolean = False
for s in range(20):
  schedule = soup[s].find('div', class_='section_content')
  gamedays = schedule.find_all('div')
  for gameday in gamedays:
    date = correctDateTime(gameday.find('h3').text)
    games = gameday.find_all('p', class_='game')
    for game in games: 
      teams = game.find_all('a')
      awayTeam = teams[0].text
      homeTeam = teams[1].text
      # NOTE(review): awayRuns / homeRuns are not used again in this cell
      awayRuns = game.text.splitlines()[2]
      homeRuns = game.text.splitlines()[5]
    
      # account for team name changes
      homeTeam = correctTeamName(homeTeam)
      awayTeam = correctTeamName(awayTeam)
      sleep(0.01)
      # the box score link is the anchor inside the game's <em> element
      em = game.find('em')
      boxScore = em.find('a')
      boxScore = boxScore.get('href')
      boxScore = "http://baseball-reference.com" + boxScore
      boxLink = requests.get(boxScore).text
      boxLinkSoup = BeautifulSoup(boxLink, 'lxml')
      
      # work on the raw HTML text of the second 'section_wrapper' div, line by line
      rows = str(boxLinkSoup.find_all('div',{'class':"section_wrapper"})[1])
      rows = rows.splitlines()
      k = 0
      # first line that starts with '<tbody': take its fifth '<'-separated piece and
      # read characters from offset 8 up to the next double quote as the link
      for k in range(len(rows)):
        row = rows[k]
        if row[1:6].strip() == 'tbody':
          segments = row.split('<')
          portion = segments[4]
          awayPitcherLink = ''
          for i in range(8,100):
            if portion[i] == '"':
              break
            awayPitcherLink += portion[i]
          break
      # next '<tbody' line after that one: same extraction for the home pitcher
      # NOTE(review): the link variables are only assigned when a tbody line is
      # found; otherwise the value from a previous game (if any) is reused
      for j in range(k+1, len(rows)):
        row = rows[j]
        if row[1:6].strip() == 'tbody':
          segments = row.split('<')
          portion = segments[4]
          homePitcherLink = ''
          for i in range(8,100):
            if portion[i] == '"':
              break
            homePitcherLink += portion[i]
          break
      
      # download and parse both pitcher pages
      awayPitcherStats1 = "http://baseball-reference.com" + awayPitcherLink
      awayPitcherStats2 = requests.get(awayPitcherStats1).text
      awayPitcherStats3 = BeautifulSoup(awayPitcherStats2, 'lxml')
      homePitcherStats = "http://baseball-reference.com" + homePitcherLink
      homePitcherStats = requests.get(homePitcherStats).text
      homePitcherStats = BeautifulSoup(homePitcherStats, 'lxml')

      # the season is the last four characters of the game day heading
      year = gameday.find('h3').text
      year = year[-4:]

      awayPitcherStats4 = awayPitcherStats3.find('div', {'id': 'bottom_nav_container'})
      homePitcherStats = homePitcherStats.find('div', {'id': 'bottom_nav_container'})
      # a/b form a two-step delay: the child kept is the one two positions after
      # the <p> whose text is 'Pitching Game Logs'
      a = False
      b = False
      for i in awayPitcherStats4:
        if b == True:
          awayPitcherStats5 = i
          b = False
        if a == True:
          a = False
          b = True
        if i.name == 'p' and i.text == 'Pitching Game Logs':
          a = True
      awayPitcherStats6 = awayPitcherStats5.find_all('li')

      # same selection for the home pitcher (homePitcherStats is reused for each stage)
      a = False
      b = False
      for i in homePitcherStats:
        if b == True:
          homePitcherStats = i
          b = False
        if a == True:
          a = False
          b = True
        if i.name == 'p' and i.text == 'Pitching Game Logs':
          a = True
      homePitcherStats = homePitcherStats.find_all('li')
      
      # pick the <li> whose text equals the season and read its link: characters
      # of the element's HTML from offset 13 up to the next double quote
      for r in awayPitcherStats6:
        if r.text == year:
          awayPitcherStats6 = ''
          for i in range(13, len(str(r))):
            if str(r)[i] == '"':
              break
            awayPitcherStats6 += str(r)[i]
          break

      for r in homePitcherStats:
        if r.text == year:
          homePitcherStats = ''
          for i in range(13, len(str(r))):
            if str(r)[i] == '"':
              break
            homePitcherStats += str(r)[i]
          break

      # rebuild the link with its first two ';' characters replaced by '&'
      # (if there is only one ';', the text after it is dropped)
      dummy = awayPitcherStats6
      v = 0
      for c in range(len(dummy)):
        if dummy[c] == ';':
          awayPitcherStats6 = dummy[:c] + '&'
          v = c
          break
      p = v
      for c in range(v+1, len(dummy)):
        if dummy[c] == ';':
          awayPitcherStats6 += (dummy[p+1:c] + '&' + dummy[c+1:])
          break
      # l keeps the link for the diagnostic print below
      l = awayPitcherStats6
      awayPitcherStats = "http://baseball-reference.com" + awayPitcherStats6
      awayPitcherStats = requests.get(awayPitcherStats).text
      awayPitcherStats = BeautifulSoup(awayPitcherStats, 'lxml')
      table = awayPitcherStats.find('table')
      rows = table.find_all('tr')
    
      # positions of the table rows that carry an id attribute
      indexList = []
      for i in range(len(rows)):
        if rows[i].has_attr('id') == True: 
          indexList.append(i)

      for j in range(len(indexList)):
        try:
          game_num = int(rows[indexList[j]].find('td', {'data-stat': 'team_game_num'}).text)
        except:
          # could not read the team game number: print context and stop everything
          print(date)
          print(awayTeam)
          print(l)
          boolean = True
          break
        # log rows with a team game number of 10 or less are ignored
        if game_num > 10:
          grab = rows[indexList[j]].find('td', {'data-stat': 'date_game'}).text
          # drop a trailing three-character double-header suffix such as '(1)'
          if grab[-1] == ')':
            grab = grab[:-3]
          # otherwise, if the text does not end in a digit, cut it back
          # NOTE(review): the inner test uses strict bounds (> 48, < 57), so the
          # characters '0' and '9' are not treated as digits here -- verify
          elif ord(grab[-1]) < 48 or ord(grab[-1]) > 57:
            for i in range(1, len(grab)):
              if ord(grab[-i]) > 48 and ord(grab[-i]) < 57:
                grab = grab[:-(i-1)]
                break
          # no year is parsed; only month and day are compared below
          dt = datetime.strptime(str(grab), '%b %d').date()

          if dt.day == date.day and dt.month == date.month:
            try:
              if nullList.loc[date][homeTeam] != True:
                # if the game is a pitcher's debut, he should have no prior ERA
                # (j-1 is -1 for the first log row, which indexes the last entry)
                if indexList[j-1] > indexList[j]:
                  data.loc[date][homeTeam]['opposing_pitcher_ERA'] = 'N/A'
                else:
                  data.loc[date][homeTeam]['opposing_pitcher_ERA'] = float(rows[indexList[j-1]].find('td', {'data-stat': 'earned_run_avg'}).text)
            except:
              # NOTE(review): the float(...) inside this handler can itself raise
              print(date)
              print(awayTeam)
              print(str(float(rows[indexList[j-1]].find('td', {'data-stat': 'earned_run_avg'}).text)))
              boolean = True
            break   

      # ---- same procedure for the home pitcher, written to 'pitcher_ERA' ----
      dummy = homePitcherStats
      v = 0
      for c in range(len(dummy)):
        if dummy[c] == ';':
          homePitcherStats = dummy[:c] + '&'
          v = c
          break
      p = v
      for c in range(v+1, len(dummy)):
        if dummy[c] == ';':
          homePitcherStats += (dummy[p+1:c] + '&' + dummy[c+1:])
          break
      l = homePitcherStats
      homePitcherStats = "http://baseball-reference.com" + homePitcherStats
      homePitcherStats = requests.get(homePitcherStats).text
      homePitcherStats = BeautifulSoup(homePitcherStats, 'lxml')
      table = homePitcherStats.find('table')
      rows = table.find_all('tr')
    
      indexList = []
      for i in range(len(rows)):
        if rows[i].has_attr('id') == True: 
          indexList.append(i)
      for j in range(len(indexList)):
        try:
          game_num = int(rows[indexList[j]].find('td', {'data-stat': 'team_game_num'}).text)
        except:
          # NOTE(review): prints awayTeam although this is the home pitcher's log
          print(date)
          print(awayTeam)
          print(l)
          boolean = True
          break
        if game_num > 10:
          grab = rows[indexList[j]].find('td', {'data-stat': 'date_game'}).text
          if grab[-1] == ')':
            grab = grab[:-3]
          elif ord(grab[-1]) < 48 or ord(grab[-1]) > 57:
            for i in range(1, len(grab)):
              if ord(grab[-i]) > 48 and ord(grab[-i]) < 57:
                grab = grab[:-(i-1)]
                break
          dt = datetime.strptime(str(grab), '%b %d').date()
          
          if dt.day == date.day and dt.month == date.month:
            try:
              if nullList.loc[date][homeTeam] != True:
                if indexList[j-1] > indexList[j]:
                  data.loc[date][homeTeam]['pitcher_ERA'] = 'N/A'
                else:
                  data.loc[date][homeTeam]['pitcher_ERA'] = float(rows[indexList[j-1]].find('td', {'data-stat': 'earned_run_avg'}).text)
            except:
              print(date)
              print(homeTeam)
              print(str(float(rows[indexList[j-1]].find('td', {'data-stat': 'earned_run_avg'}).text)))
              boolean = True
            break
      # propagate the stop flag out of the game, game day and season loops
      if (boolean==True):
        break
    if (boolean==True):
      break 
  if (boolean==True):
    break

In [None]:
# Write `data`, including the ERA keys added by the preceding cell, to disk.
data.to_csv('dataa.csv')

In [None]:
data = pd.read_csv('dataa (6).csv', index_col=0)

# the index comes back as 'month/day/year' text; rebuild it as date objects
indices = [datetime.strptime(label, '%m/%d/%Y').date() for label in data.index.tolist()]
data.index = indices
nullList = data.isnull()

# create a list to hold indices marking the beginning of each new season:
# position 0 plus every row whose year is later than the previous row's year
dates = data.index.tolist()
new_season = [0]
new_season.extend(
  pos for pos in range(1, len(dates))
  if dates[pos].year > dates[pos-1].year
)

def withinFirstTwentyGamedays(index, season_starts=None, window=20):
  """Tell whether a row position falls at the start of any season.

  Parameters
  ----------
  index : int
    Row position in the game-day table.
  season_starts : sequence of int, optional
    Row positions at which seasons begin. When omitted, the notebook-level
    `new_season` list is used, which is the original behaviour.
  window : int, optional
    How many positions past a season start still count. Both ends are
    inclusive, so the default of 20 accepts positions s .. s + 20.

  Returns
  -------
  bool
    True if `index` lies in [s, s + window] for at least one start `s`.
  """
  if season_starts is None:
    # Resolved at call time so the function follows later reassignments of new_season.
    season_starts = new_season
  return any(start <= index <= start + window for start in season_starts)

# Every non-null cell is read back from the CSV as the text of a dict literal;
# turn each one back into a real dict.
for i in range(len(data)):
  for j in range(len(data.columns)):
    if nullList.iat[i, j]:
      continue
    try:
      # .iat writes straight into the frame. The chained form
      # data.iloc[i][j] = ... may assign into a temporary row copy instead.
      data.iat[i, j] = ast.literal_eval(data.iat[i, j])
    except (ValueError, SyntaxError, TypeError):
      # Cell text is not a valid Python literal: report where and keep going.
      print(i)
      print(data.columns[j])

In [None]:
# Spot check: the date at row position 2200 and the Dodgers' cell at row position 2210.
# NOTE(review): the two positions differ (2200 vs 2210) — confirm that is intended.
print(indices[2200])
print(data.iloc[2210]['Los Angeles Dodgers'])

2012-05-03
{'runs_for': 11, 'runs_against': 5, 'win/loss': 1, 'home/away': 1, 'opponent': 'Colorado Rockies', 'double_header': False, 'AB': 30, 'H': 8, 'BB': 10, 'HBP': 0, 'SF': 0, '2B': 5, '3B': 0, 'HR': 1, 'OBP_3-day_avg': 0.36275, 'OBP_5-day_avg': 0.37288, 'OBP_7-day_avg': 0.33333, 'OBP_9-day_avg': 0.33233, 'OBP_11-day_avg': 0.34383, 'OBP_13-day_avg': 0.33678, 'OBP_15-day_avg': 0.33092, 'OBP_season': 0.33636, 'H_conceded': 7, 'IP': 9, 'BB_conceded': 3, 'ER': 4, 'K': 2, 'opposing_pitcher_ERA': 2.84, 'pitcher_ERA': 1.41}


In [None]:
# Rebind `data` to an independent copy of `gameResults`.
# NOTE(review): `gameResults` is presumably built in an earlier cell; `nullList` and
# `indices` were derived from the CSV loaded above, so confirm they still line up
# with this frame row-for-row.
data = gameResults.copy()

Scrape for Starting Pitchers' ERAs for Double Headers

In [None]:
# For every home double header played after April, open the box score of the
# first schedule entry whose home team matches, follow both starting pitchers to
# their season game logs, and store the ERA shown on the game-log row *before*
# that date under 'opposing_pitcher_ERA(0)' (away starter) and 'pitcher_ERA(0)'
# (home starter). `boolean` ends up True if any lookup had to be skipped.
BREF_ROOT = "http://baseball-reference.com"


def _fetch_soup(url):
  """Download `url` and return its body parsed with the lxml parser."""
  return BeautifulSoup(requests.get(url).text, 'lxml')


def _starting_pitcher_links(boxLinkSoup):
  """Return (away_href, home_href) taken from a box score page, or None.

  The second 'section_wrapper' div is scanned as text. The first two lines
  that start with '<tbody' are used, in page order; from each, the fifth
  '<'-separated piece is expected to look like `a href="/players/...">Name`
  (presumably the first pitcher listed in that table), and the quoted part is
  returned.
  """
  markup = str(boxLinkSoup.find_all('div', {'class': "section_wrapper"})[1])
  hrefs = []
  for line in markup.splitlines():
    if line[1:6].strip() != 'tbody':
      continue
    portion = line.split('<')[4]
    # Skip the 8 characters of 'a href="' and read up to the closing quote.
    hrefs.append(portion[8:portion.index('"', 8)])
    if len(hrefs) == 2:
      return hrefs[0], hrefs[1]
  return None


def _season_gamelog_url(pitcherSoup, year):
  """Return the absolute URL of a pitcher's game log for `year`, or None.

  In the page's 'bottom_nav_container' div, the node two positions after the
  <p> whose text is 'Pitching Game Logs' holds one <li> per season; the one
  whose text equals `year` carries the link.
  """
  nav = pitcherSoup.find('div', {'id': 'bottom_nav_container'})
  if nav is None:
    return None
  children = list(nav.children)
  section = None
  for pos, child in enumerate(children):
    # `name` is checked first so plain text nodes are never asked for `.text`.
    if child.name == 'p' and child.text == 'Pitching Game Logs' and pos + 2 < len(children):
      section = children[pos + 2]
  if section is None:
    return None
  for item in section.find_all('li'):
    if item.text == year:
      # .get('href') returns the attribute already unescaped, so no manual
      # '&amp;' patching of the query string is needed.
      return BREF_ROOT + item.find('a').get('href')
  return None


def _trim_game_date(text):
  """Reduce a game-log date cell to plain 'Mon D' text.

  A trailing '(n)' double-header marker is cut off; otherwise everything
  after the last digit is dropped.
  """
  if text.endswith(')'):
    text = text[:-3]
  else:
    # Membership in '0123456789' covers all ten digits, including '0' and '9'.
    while text and text[-1] not in '0123456789':
      text = text[:-1]
  return text.strip()


def _store_prior_era(frame, nulls, gamelogUrl, date, homeTeam, key, label):
  """Write a starter's pre-game ERA into frame.loc[date][homeTeam][key].

  The game log at `gamelogUrl` is scanned for the first row (team game
  number above 10) whose month and day match `date`. The ERA of the game-log
  row before it is stored, or 'N/A' when the match is the first row (debut).
  `label` is only printed in diagnostics.

  Returns False if a row could not be read or the value could not be stored,
  True otherwise (including when no row matched).
  """
  rows = _fetch_soup(gamelogUrl).find('table').find_all('tr')
  indexList = [n for n, row in enumerate(rows) if row.has_attr('id')]
  for j, rowIndex in enumerate(indexList):
    row = rows[rowIndex]
    try:
      game_num = int(row.find('td', {'data-stat': 'team_game_num'}).text)
    except Exception:
      print(date)
      print(label)
      print(gamelogUrl)
      return False
    if game_num <= 10:
      continue
    dateText = _trim_game_date(row.find('td', {'data-stat': 'date_game'}).text)
    dt = datetime.strptime(dateText, '%b %d').date()
    if dt.day != date.day or dt.month != date.month:
      continue

    eraText = None
    try:
      if nulls.loc[date][homeTeam] != True:
        cell = frame.loc[date][homeTeam]
        if j == 0:
          # if the game is a pitcher's debut, he should have no prior ERA
          cell[key] = 'N/A'
        else:
          eraText = rows[indexList[j-1]].find('td', {'data-stat': 'earned_run_avg'}).text
          cell[key] = float(eraText)
    except Exception:
      print(date)
      print(label)
      print(eraText)
      return False
    return True
  return True


boolean = False
for v in range(len(data)):
  d = indices[v]
  for z in range(len(data.columns)):
    if nullList.iat[v, z] or d.month <= 4:
      continue
    cell = data.iat[v, z]
    if not (cell['double_header'] == True and cell['home/away'] == True):
      continue

    team = data.columns[z]
    schedule = soup[d.year-2000].find('div', class_='section_content')
    for gameday in schedule.find_all('div'):
      heading = gameday.find('h3').text
      date = correctDateTime(heading)
      if date.day != d.day or date.month != d.month:
        continue
      year = heading[-4:]

      for game in gameday.find_all('p', class_='game'):
        teams = game.find_all('a')
        # account for team name changes
        awayTeam = correctTeamName(teams[0].text)
        homeTeam = correctTeamName(teams[1].text)
        if str(homeTeam) != str(team):
          continue
        sleep(0.01)

        boxScore = BREF_ROOT + game.find('em').find('a').get('href')
        links = _starting_pitcher_links(_fetch_soup(boxScore))
        if links is None:
          # Could not locate both starters; report and move on rather than
          # reusing links left over from a previous game.
          print(date)
          print(homeTeam)
          print(boxScore)
          boolean = True
          break

        lookups = (
          (links[0], 'opposing_pitcher_ERA(0)', awayTeam),
          (links[1], 'pitcher_ERA(0)', homeTeam),
        )
        for href, key, label in lookups:
          gamelogUrl = _season_gamelog_url(_fetch_soup(BREF_ROOT + href), year)
          if gamelogUrl is None:
            print(date)
            print(label)
            print(href)
            boolean = True
            continue
          if not _store_prior_era(data, nullList, gamelogUrl, date, homeTeam, key, label):
            boolean = True
        # Only the first matching schedule entry of the day is processed here.
        break

In [None]:
# Inspect one cell (row position 1759, Miami Marlins) after the double-header ERA pass.
print(data.iloc[1759]['Miami Marlins'])

{'runs_for': 10, 'runs_against': 3, 'win/loss': 1, 'home/away': 1, 'opponent': 'Colorado Rockies', 'double_header': True, 'AB': 38, 'H': 17, 'BB': 6, 'HBP': 0, 'SF': 1, '2B': 3, '3B': 0, 'HR': 2, 'win/loss(1)': 0, 'runs_for(1)': '3', 'runs_against(1)': '7', 'AB(1)': 40, 'H(1)': 13, 'BB(1)': 3, 'HBP(1)': 0, 'SF(1)': 0, '2B(1)': 0, '3B(1)': 0, 'HR(1)': 0, 'OBP_3-day_avg': 0.36842, 'OBP_5-day_avg': 0.39706, 'OBP_7-day_avg': 0.41156, 'OBP_9-day_avg': 0.39523, 'OBP_11-day_avg': 0.39301, 'OBP_13-day_avg': 0.39668, 'OBP_15-day_avg': 0.3871, 'OBP_season': 0.33527, 'opposing_pitcher_ERA': 4.78, 'pitcher_ERA': 4.15, 'opposing_pitcher_ERA(0)': 3.93, 'pitcher_ERA(0)': 4.71}


In [None]:
# double header bug/error handling
# For every home double header after April, report:
#   * position, team and opponent when the opponent's cell on the same row is
#     missing or is not flagged as a double header;
#   * position and team when the cell has no 'opposing_pitcher_ERA(0)' entry.
for i in range(len(data)):
  if indices[i].month <= 4:
    continue
  row = data.iloc[i]
  for j in range(len(data.columns)):
    if nullList.iat[i, j]:
      continue
    cell = row.iloc[j]
    if cell['home/away'] == True and cell['double_header'] == True:
      # .get returns None for an unknown column. A null cell is not a dict
      # either, so both cases are reported instead of raising.
      opponentCell = row.get(cell['opponent'])
      if not isinstance(opponentCell, dict) or opponentCell['double_header'] == False:
        print(i)
        print(data.columns[j])
        print(cell['opponent'])
      if 'opposing_pitcher_ERA(0)' not in cell:
        print(i)
        print(data.columns[j])

Get Team Pitching Statistics From Every Game

In [None]:
# Walk each season's schedule. The first time a team shows up as the home side,
# follow box score -> team page -> pitching game log and copy the per-game team
# pitching totals into that team's cells for the season.
BREF_SITE = "http://baseball-reference.com"

# (dict key, data-stat attribute on the game-log cell, converter)
_PITCHING_FIELDS = (
  ('H_conceded', 'H', int),
  ('IP', 'IP', float),
  ('BB_conceded', 'BB', int),
  ('ER', 'ER', int),
  ('K', 'SO', int),
  ('HR_conceded', 'HR', int),
)


def _get_text_with_retry(url, wait_seconds=10, timeout=60):
  """Return the body of `url`, retrying until the request succeeds.

  Only request-level failures (connection errors, timeouts, ...) are retried,
  so interrupting the kernel still stops the cell.
  """
  while True:
    try:
      return requests.get(url, timeout=timeout).text
    except requests.exceptions.RequestException:
      sleep(wait_seconds)


def _store_pitching_line(cell, row, suffix=''):
  """Copy the pitching totals of game-log `row` into dict `cell`.

  `suffix` is appended to every key ('(1)' for the second game of a double
  header).
  """
  for key, stat, cast in _PITCHING_FIELDS:
    cell[key + suffix] = cast(row.find('td', {'data-stat': stat}).text)


boolean = False
for s in range(20):
  unvisited = columns.copy()
  schedule = soup[s].find('div', class_='section_content')
  for gameday in schedule.find_all('div'):
    for game in gameday.find_all('p', class_='game'):
      # account for team name changes
      homeTeam = correctTeamName(game.find_all('a')[1].text)
      if homeTeam not in unvisited:
        continue

      boxScore = BREF_SITE + game.find('em').find('a').get('href')
      boxLinkSoup = BeautifulSoup(_get_text_with_retry(boxScore), 'lxml')
      links = boxLinkSoup.find('div', {'id': 'bottom_nav_container'}).find_all('li')

      # go to home team's schedule (second <li> of the box score's bottom nav)
      homeTeamLink = BREF_SITE + links[1].find('a').get('href')
      homeLinkSoup = BeautifulSoup(_get_text_with_retry(homeTeamLink), 'lxml')

      # find its pitching game logs link: second <li> of the second <ul> in the bottom nav
      navLists = homeLinkSoup.find('div', {'id': 'bottom_nav_container'}).find_all('ul')
      homePitchingLogs = BREF_SITE + navLists[1].find_all('li')[1].find('a').get('href')

      # go to its game logs link; the second table on that page is used
      homePitchingSoup = BeautifulSoup(_get_text_with_retry(homePitchingLogs), 'lxml')
      rows = homePitchingSoup.find_all('table')[1].find_all('tr')

      # j is the row position in the dataframe, starting at this season's first row.
      # lever is True while the previous game row was game 1 of a double header.
      j = new_season[s]
      lever = False
      for row in rows:
        # if the row is a game, its markup starts with '<tr i...' (an id attribute).
        # This is a pattern within the website's html code
        if str(row)[4:5].strip() == 'i':
          result = row.find('td', {'data-stat': 'game_result'}).text

          # if prev row was double header, this row is its second game
          if lever == True:
            # The helper fills win/loss(1) and runs(1) in place and returns the
            # string 'error' for a tie, so one call is enough.
            outcome = doubleHeaderWinOrLoss(result, data, j, homeTeam)
            if not isinstance(outcome, str):
              cell = data.iloc[j][homeTeam]
              cell['double_header'] = True
              _store_pitching_line(cell, row, '(1)')
            lever = False
            j += 1

          if result[0] == 'W' or result[0] == 'L':
            # skip off days; the bound is checked first so a double header on
            # the last row cannot index past the end of the table
            while j < len(nullList) and nullList.iloc[j][homeTeam] == True:
              j += 1
            if j >= len(nullList):
              break

            gameDate = row.find('td', {'data-stat': 'date_game'}).text
            if gameDate[-2:] == '1)':
              # first game of a double header: stay on this row for game 2
              lever = True
              _store_pitching_line(data.iloc[j][homeTeam], row)
            elif gameDate[-1] != ')':
              _store_pitching_line(data.iloc[j][homeTeam], row)
              j += 1

        if j >= len(nullList):
          break

      unvisited.remove(homeTeam)

    if len(unvisited) == 0:
      break

In [None]:
# Spot-check four cells (three by positive row position, one counted from the end)
# after the team pitching pass.
print(data.iloc[1759]['Miami Marlins'])
print(data.iloc[107]['Baltimore Orioles'])
print(data.iloc[51]['Milwaukee Brewers'])
print(data.iloc[-50]['Los Angeles Dodgers'])

{'runs_for': 10, 'runs_against': 3, 'win/loss': 1, 'home/away': 1, 'opponent': 'Colorado Rockies', 'double_header': True, 'AB': 38, 'H': 17, 'BB': 6, 'HBP': 0, 'SF': 1, '2B': 3, '3B': 0, 'HR': 2, 'win/loss(1)': 0, 'runs_for(1)': '3', 'runs_against(1)': '7', 'AB(1)': 40, 'H(1)': 13, 'BB(1)': 3, 'HBP(1)': 0, 'SF(1)': 0, '2B(1)': 0, '3B(1)': 0, 'HR(1)': 0, 'OBP_3-day_avg': 0.36842, 'OBP_5-day_avg': 0.39706, 'OBP_7-day_avg': 0.41156, 'OBP_9-day_avg': 0.39523, 'OBP_11-day_avg': 0.39301, 'OBP_13-day_avg': 0.39668, 'OBP_15-day_avg': 0.3871, 'OBP_season': 0.33527, 'opposing_pitcher_ERA': 4.78, 'pitcher_ERA': 4.15, 'opposing_pitcher_ERA(0)': 3.93, 'pitcher_ERA(0)': 4.71, 'H_conceded(': 7, 'IP': 9, 'BB_conceded': 6, 'ER': 3, 'K': 9, 'H_conceded(1)': 7, 'IP(1)': 9, 'BB_conceded(1)': 4, 'ER(1)': 7, 'K(1)': 15}
{'runs_for': 7, 'runs_against': 11, 'win/loss': 0, 'home/away': 1, 'opponent': 'Boston Red Sox', 'double_header': True, 'AB': 38, 'H': 11, 'BB': 4, 'HBP': 0, 'SF': 0, '2B': 2, '3B': 0, 'HR':