In [1]:
# imports
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Getting Website for each game ID in 2023 season
gameID = 598903
# DataFrame that stores results
pbpdf_final = pd.DataFrame()

while gameID <= 598909: 
    website = f"https://baseball.pointstreak.com/boxscore.html?gameid={gameID}"
    result = requests.get(website)
    content = result.text
    
    # Convert html with BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')
    
    # Find all inning elements
    innings = soup.find_all(class_='pbpinning')

    # List to store dictionaries of inning and play-by-play data
    game_data = []
    
    # Iterate through each inning
    for inning_elem in innings:
        # Get the text of the first <td> element inside the inning
        inning_info = inning_elem.find('td').text.strip()
        
        # Use regular expressions to extract the inning information
        inning_match = re.match(r'(Top of|Bottom of)\s+(\d+)', inning_info)
        if inning_match:
            inning = f"{inning_match.group(1)} {inning_match.group(2)}"
        else:
            inning = None
        
        # Find all play-by-play data within the current inning element
        play_by_play = inning_elem.find_all('td')[2:]  # Exclude the first <td> as it contains the inning info
        
        # Iterate through each play-by-play data within the inning
        for play_elem in play_by_play:
            play_data = play_elem.text.strip()
            if inning:
                game_data.append({'Inning': inning, 'Play by Play': play_data})
    
    # Create DataFrame for the current game's pbp data
    pbpdata = pd.DataFrame(game_data)
    
    # Consolidate current game data to the final DataFrame
    pbpdf_final = pd.concat([pbpdf_final, pbpdata], ignore_index=True)
    
    # Move to the next game
    gameID += 1

# Set Inning as index to help for analysis
pbpdf_final.set_index('Inning', inplace=True)




In [2]:
#Pull Play Data for Analysis
# Define conditions for specific plays
#foul_condition = pbpdf_final['Play by Play'].str.count('Foul', case=False)

#At Bat Outcomes
pbpdf_final = pbpdf_final.assign(SwingStrike=pbpdf_final['Play by Play'].str.count('Swinging Strike'))
pbpdf_final = pbpdf_final.assign(CalledStrike=pbpdf_final['Play by Play'].str.count('Called Strike'))
pbpdf_final = pbpdf_final.assign(Balls=pbpdf_final['Play by Play'].str.count('Ball'))
pbpdf_final = pbpdf_final.assign(Foul=pbpdf_final['Play by Play'].str.count('Foul'))
pbpdf_final = pbpdf_final.assign(Hit=pbpdf_final['Play by Play'].str.count('Hit'))

#Hit Data
pbpdf_final = pbpdf_final.assign(Single=pbpdf_final['Play by Play'].str.count('single'))
pbpdf_final = pbpdf_final.assign(Double=pbpdf_final['Play by Play'].str.count('double'))
pbpdf_final = pbpdf_final.assign(Triple=pbpdf_final['Play by Play'].str.count('triple'))
pbpdf_final = pbpdf_final.assign(HomeRun=pbpdf_final['Play by Play'].str.count('HomeRun'))
#Location Data
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('1'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('2'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('3'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('4'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('5'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('6'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('7'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('8'))
pbpdf_final = pbpdf_final.assign(Location=pbpdf_final['Play by Play'].str.contains('9'))

In [3]:
pbpdf_final

Unnamed: 0_level_0,Play by Play,SwingStrike,CalledStrike,Balls,Foul,Hit,Single,Double,Triple,HomeRun,Location
Inning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Top of 1,London Majors,0,0,0,0,0,0,0,0,0,False
Top of 1,#28 Austin Wilkie,0,0,0,0,0,0,0,0,0,False
Top of 1,"Foul, 28 Austin Wilkie advances to 1st (single)",0,0,0,1,0,1,0,0,0,False
Top of 1,#21 Tommy Reyes-Cruz,0,0,0,0,0,0,0,0,0,False
Top of 1,"Foul, 28 Austin Wilkie advances to 2nd (error ...",0,0,1,1,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...
Bottom of 9,Offensive Substitution,0,0,0,0,0,0,0,0,0,False
Bottom of 9,14 Will Pollard subs for Daniel Battel.,0,0,0,0,0,0,0,0,0,False
Bottom of 9,#14 Will Pollard,0,0,0,0,0,0,0,0,0,False
Bottom of 9,"Ball, Foul, Ball, Foul, Foul, 14 Will Pollard ...",0,0,2,3,0,0,0,0,0,False
