In [3]:
# Used to retrieve html code from webpages
import requests

# Used to parse webpages
from bs4 import BeautifulSoup

# Import pandas for dataframes
import pandas as pd

# Used for input validation
import re

# Used to correctly format dates
from datetime import datetime

# Import sqlite3 so we can use sqlite databases
import sqlite3

# Import other python files (help with db management)
from python_files.Queries import Queries
from python_files.UseDB import UseDB

# Helps with iterating through differing data structures
import itertools

# 1. Functions To Get Data

In [4]:
def format_date(time_string):
    """
    Re-format a timestamp such as '7:05 PM, April 1, 2023' as
    'YYYY-MM-DD HH:MM:SS' (24-hour clock).

    Raises ValueError (from ``datetime.strptime``) when ``time_string``
    does not match the '%I:%M %p, %B %d, %Y' layout.
    """
    # Parse the 12-hour input layout, then render it in the 24-hour layout
    parsed = datetime.strptime(time_string, '%I:%M %p, %B %d, %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')

In [5]:
def make_game_id(teams, date, game_num):
    """
    Build a game id of the form '<date>-<initials>-<initials>-<game_num>'.

    teams:    iterable of team names; each name contributes the first letter
              of every whitespace-separated word (e.g. 'Boston Red Sox' -> 'BRS').
    date:     string whose first whitespace-separated token is used as the
              date part (e.g. '2023-04-01 19:05:00' -> '2023-04-01').
    game_num: game number within the day; a string or an integer.

    Returns the id as a string.
    """
    # Only the leading token of the date string goes into the id
    date_part = date.split()[0]

    # One block of initials per team, in the order the teams were given
    team_codes = [''.join(word[0] for word in team.split()) for team in teams]

    # str() lets callers pass the game number as an int as well as a string
    return "-".join([date_part, *team_codes, str(game_num)])

In [6]:
def format_umps(ump_string):
    """
    Extract the umpire's name from a '<position> - <name>' string.

    Only the first hyphen is treated as the separator, so hyphenated
    names (e.g. 'Smith-Jones') are returned in full.

    Raises IndexError when ``ump_string`` contains no hyphen.
    """
    # maxsplit=1 keeps any later hyphens inside the name part
    word_list = ump_string.split('-', 1)
    ump_name = word_list[1].strip()

    return ump_name

In [7]:
def get_game_id(html):
    """
    Build the game id for a parsed ESPN box score page.

    html: parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup
          document in this notebook).

    Returns the id string produced by ``make_game_id``, or an empty list
    when the page has no 'Boxscore__Team' sections (treated as a
    postponed game).
    """
    # No box score sections: the game was postponed, return an empty list
    if not html.find_all('div', class_='Boxscore__Team'):
        return []

    # Get the data and format it into something we can use
    formatted_date = format_date(html.find('div', {'class': 'n8 GameInfo__Meta'}).find('span').text)
    teams = html.find_all('h2', {'class': 'ScoreCell__TeamName ScoreCell__TeamName--displayName truncate db'})

    # A game note is only present for special games (double headers, etc.).
    # Check for it explicitly instead of swallowing every exception.
    game_note = html.find('div', {'class': 'ScoreCell__GameNote di'})

    # The game number is "2" only when the note mentions a 2 and no 1;
    # a missing note and edge cases (MLB Opening Day, etc.) all count as game 1.
    if game_note is not None and "1" not in game_note.text and "2" in game_note.text:
        game_num = "2"
    else:
        game_num = "1"

    return make_game_id([teams[0].text, teams[1].text], formatted_date, game_num)

In [8]:
def get_hitter_statistics(html, team):
    """
    Pass in html and receive the batter statistics from an ESPN web page.

    html: parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup
          document in this notebook).
    team: "away" or "home" (case-insensitive); selects which team's
          batting table is read.

    Returns a list with one dictionary per batter row, keyed by ID, NAME,
    AB, R, H, RBI, HR, BB, K and STRT (1 if the batter started, else 0).
    Returns an empty list when the page has no 'Boxscore__Team' sections
    (treated as a postponed game).

    Raises ValueError when the page has box score sections but ``team``
    is neither "away" nor "home".
    """

    # Valuable variables and lists
    batting_categories = ['ID', 'NAME', 'AB', 'R', 'H', 'RBI', 'HR', 'BB', 'K', 'AVG', 'OBP', 'SLG','STRT']
    unwanted_categories = ('AVG', 'OBP', 'SLG')

    # One 'Boxscore__Team' section per stat table on the page
    elements = html.find_all('div', class_='Boxscore__Team')

    # The game on this day was postponed, return an empty list
    if not elements:
        return []

    # Decide whether to get the away (section 0) or home (section 1) team's stats.
    # An unknown value used to surface later as an unbound-variable error.
    team_index = {'away': 0, 'home': 1}.get(team.lower())
    if team_index is None:
        raise ValueError(f"team must be 'away' or 'home', got {team!r}")
    team_section = elements[team_index]

    # Get the game_id
    game_id = get_game_id(html)

    # Get all of the batters that did not start the game
    non_starters = []
    for player_html in team_section.find_all('div', {'class': 'Boxscore__Athlete pl4'}):
        for player_name in player_html.find('a'):
            non_starters.append(player_name)

    # Get all of the player names
    player_names = [player.text for player in team_section.find_all('a', {'class': 'Boxscore__Athlete_Name'})]

    # Get all of the player numerical stats
    number_stat_element = team_section.find('div', {'class': 'Table__Scroller'})
    player_stats = [
        [td.text.strip() for td in row.find_all('td')]
        for row in number_stat_element.find_all('tr')
    ]

    # Get rid of the empty list in the first position of the list
    # Get rid of the last element which includes game total numbers that we dont want
    player_stats = player_stats[1:-1]

    # Format the data into useful dictionaries
    all_things = []
    for index, row in enumerate(player_stats):
        name = player_names[index]

        # 0 when the batter did not start the game, 1 when he did
        started = 0 if name in non_starters else 1

        # Zip the stat category with the statistic
        record = dict(zip(batting_categories, [game_id, name, *row, started]))

        # Get rid of the AVG / OBP / SLG values which are not needed.
        # `del` (not pop) so a row with too few columns still fails loudly.
        for category in unwanted_categories:
            del record[category]

        all_things.append(record)

    return all_things

In [9]:
def get_pitcher_statistics(html, team):
    """
    Pass in html and receive the pitcher statistics from an ESPN web page.

    html: parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup
          document in this notebook).
    team: "away" or "home" (case-insensitive); selects which team's
          pitching table is read.

    Returns a list with one dictionary per pitcher row, keyed by ID, NAME,
    IP, H, R, ER, BB, K, HR, STRKS, BALLS and STRT (1 for the first pitcher
    listed, 0 for everyone after him). Returns an empty list when the page
    has no 'Boxscore__Team' sections (treated as a postponed game).

    Raises ValueError when the page has box score sections but ``team``
    is neither "away" nor "home", and IndexError when the number of
    pitcher names does not match the number of stat rows.
    """

    # Valuable variables and lists
    pitching_categories = ['ID', 'NAME', 'IP', 'H', 'R', 'ER', 'BB', 'K', 'HR', 'ERA', "STRKS", "BALLS", 'STRT']

    # One 'Boxscore__Team' section per stat table on the page
    elements = html.find_all('div', class_='Boxscore__Team')

    # The game on this day was postponed, return an empty list
    if not elements:
        return []

    # Decide whether to get the away (section 2) or home (section 3) team's stats.
    # An unknown value used to surface later as an unbound-variable error.
    team_index = {'away': 2, 'home': 3}.get(team.lower())
    if team_index is None:
        raise ValueError(f"team must be 'away' or 'home', got {team!r}")
    team_section = elements[team_index]

    # Get the game_id
    game_id = get_game_id(html)

    # Get all of the player names
    player_names = [player.text for player in team_section.find_all('a', {'class': 'Boxscore__Athlete_Name'})]

    # Get all of the player numerical stats
    number_stat_element = team_section.find('div', {'class': 'Table__Scroller'})
    player_stats = [
        [td.text.strip() for td in row.find_all('td')]
        for row in number_stat_element.find_all('tr')
    ]

    # Get rid of the empty list in the first position of the list
    # Get rid of the last element which includes game total numbers that we dont want
    player_stats = player_stats[1:-1]

    # Names and stat rows are paired by position, so they must line up
    if len(player_names) != len(player_stats):
        raise IndexError(
            f"found {len(player_names)} pitcher names but {len(player_stats)} stat rows"
        )

    # Format the data into useful dictionaries
    all_things = []
    for index, (name, row) in enumerate(zip(player_names, player_stats)):

        # Column 7 holds "<total pitches>-<strikes>"; balls are the difference
        pitch_counts = row[7].split("-")
        total_pitches = int(pitch_counts[0])
        strikes = int(pitch_counts[1])
        balls = total_pitches - strikes

        # The first pitcher listed is the starter. Decide by position, not by
        # name, so a reliever sharing the starter's name is not flagged too.
        started = 1 if index == 0 else 0

        # Drop the combined pitch-count column and add strikes/balls separately
        values = [game_id, name, *row[:7], *row[8:], strikes, balls, started]
        record = dict(zip(pitching_categories, values))

        # Get rid of ERA values which are not needed
        del record['ERA']

        all_things.append(record)

    return all_things

In [10]:
def get_hitter_gamelog_data(url: str):
    """
    Download a box score page and return both teams' batting lines.

    url: address of the ESPN box score page.

    Returns a tuple ``(away_hitters, home_hitters)`` of pandas DataFrames
    built from ``get_hitter_statistics``. Both frames are empty when the
    request does not come back with status 200 (an error line is printed).
    """

    # Get the html code from the webpage
    response = requests.get(url=url)

    # The request could not be made: report it and hand back empty frames
    # instead of falling through to a return of names that were never set
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return pd.DataFrame(), pd.DataFrame()

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the away and home team's hitting statistics as pandas DataFrames
    away_hitters = pd.DataFrame(get_hitter_statistics(soup, "away"))
    home_hitters = pd.DataFrame(get_hitter_statistics(soup, "home"))

    return away_hitters, home_hitters

In [11]:
def get_pitcher_gamelog_data(url: str):
    """
    Download a box score page and return both teams' pitching lines.

    url: address of the ESPN box score page.

    Returns a tuple ``(away_pitchers, home_pitchers)`` of pandas DataFrames
    built from ``get_pitcher_statistics``. Both frames are empty when the
    request does not come back with status 200 (an error line is printed).
    """

    # Get the html code from the webpage
    response = requests.get(url=url)

    # The request could not be made: report it and hand back empty frames
    # instead of falling through to a return of names that were never set
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return pd.DataFrame(), pd.DataFrame()

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the away and home team's pitching statistics as pandas DataFrames
    away_pitchers = pd.DataFrame(get_pitcher_statistics(soup, "away"))
    home_pitchers = pd.DataFrame(get_pitcher_statistics(soup, "home"))

    return away_pitchers, home_pitchers

In [12]:
def get_inning_data(url):
    """
    Download a box score page and return the line score of both teams.

    url: address of the ESPN box score page.

    Returns a tuple ``(away_dict, home_dict)``. Each dictionary holds the
    game 'ID', 'TEAM' ("away"/"home"), the team 'NAME', one entry per line
    score column whose header is not a number of two or more digits, and
    'EI' (1 when the row had more than 12 columns, i.e. extra innings were
    trimmed, else 0). Half innings shown as '-' are stored as -1.

    Returns ``({}, {})`` when the page has no 'Boxscore__Team' sections
    (treated as a postponed game) and also when the request does not come
    back with status 200, so callers can always unpack two values.
    """
    # Get the html code from the webpage
    response = requests.get(url)

    # The request could not be made
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return ({}, {})

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # The game on this day was postponed if there is no stats table
    if not soup.find_all('div', class_='Boxscore__Team'):
        return ({}, {})

    # Get the game id
    game_id = get_game_id(soup)

    # Get the teams
    teams = soup.find_all('h2', {'class': 'ScoreCell__TeamName ScoreCell__TeamName--displayName truncate db'})

    # Regular expression pattern to match numbers larger than 9
    pattern = r'\d{2,}'

    # Get the html content of the by inning scores (second tbody / thead on the page)
    table_rows = soup.find_all('tbody')[1].find_all('tr')
    table_head = soup.find_all('thead')[1].find_all('tr')

    # Get the inning data
    headers = [th.text for th in table_head[0].find_all('th')]
    away_team = [td.text for td in table_rows[0].find_all('td')]
    home_team = [td.text for td in table_rows[1].find_all('td')]

    # Filter out headers that match the pattern (innings 10 and later)
    updated_headers = [s for s in headers if not re.search(pattern, s)]

    # For any half innings that were not played, update the list to have a
    # `-1` value for those half innings instead of a `-`
    updated_away_team = [int(s) if s != '-' else -1 for s in away_team]
    updated_home_team = [int(s) if s != '-' else -1 for s in home_team]

    # If the game went to extra innings, keep only the first nine columns and
    # the last three columns of each row
    extra_innings = len(updated_away_team) > 12
    if extra_innings:
        updated_away_team = updated_away_team[:9] + updated_away_team[-3:]
        updated_home_team = updated_home_team[:9] + updated_home_team[-3:]

    # Start each row with the game id, the away/home marker and the team name
    away_dict = {'ID': game_id, 'TEAM': "away", 'NAME': teams[0].text}
    home_dict = {'ID': game_id, 'TEAM': "home", 'NAME': teams[1].text}

    # Add the inning data to the dictionaries
    for position, header in enumerate(updated_headers):
        away_dict[header] = updated_away_team[position]
        home_dict[header] = updated_home_team[position]

    # Add a value key pair to indicate whether the game went to extra innings or not
    away_dict['EI'] = 1 if extra_innings else 0
    home_dict['EI'] = 1 if extra_innings else 0

    return away_dict, home_dict

In [13]:
def get_other_data(url):
    """
    Download a box score page and return general information about the game.

    url: address of the ESPN box score page.

    Returns a dictionary with the keys GameID, Location, Date, Stadium,
    'Fav. Team', 'Bet Line', 'Game Length', 'Away Team', 'Home Team',
    'Home Plate', '1st base', '2nd base' and '3rd base' ("NONE" when no
    fourth umpire entry can be read).

    Returns an empty list when the page has no 'Boxscore__Team' sections
    (treated as a postponed game), and None when the request does not come
    back with status 200 (an error line is printed).
    """

    # Get the html code from the webpage
    response = requests.get(url)

    # The request could not be made
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return None

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # The game on this day was postponed, return an empty list
    if not soup.find_all('div', class_='Boxscore__Team'):
        return []

    # Get the data
    location = soup.find('span', {'class': 'Location__Text'}).text.strip()
    formatted_date = format_date(soup.find('div', {'class': 'n8 GameInfo__Meta'}).find('span').text)
    # Attribute filter written as a dict; it used to be a set literal ({'class', ...})
    stadium = soup.find('div', {'class': 'n6 clr-gray-03 GameInfo__Location__Name'}).text.strip()
    betting_line = soup.find('div', {'class': 'n8 GameInfo__BettingItem flex-expand line'}).text
    game_time_length = soup.find('div', {'class': 'GameInfo__List list inline-flex flex-wrap'}).text.replace("Game Time:", "")
    teams = soup.find_all('h2', {'class': 'ScoreCell__TeamName ScoreCell__TeamName--displayName truncate db'})
    umpire_list = soup.find('ul', class_='GameInfo__List list inline-flex flex-wrap')
    umpires = umpire_list.find_all('li', class_='GameInfo__List__Item')

    # Format betting line
    # NOTE(review): assumes the line text holds at least two tokens after
    # "Line:" (team, then value) - confirm against pages without a favorite
    seperated_bl = betting_line.replace("Line:", "").strip().split()

    # Hold the data
    data_dict = {}

    # Add the variables to a dictionary. get_game_id reads the same date, team
    # and game-note elements, so the id is built in one place only.
    data_dict['GameID'] = get_game_id(soup)
    data_dict['Location'] = location
    data_dict['Date'] = formatted_date
    data_dict['Stadium'] = stadium
    data_dict['Fav. Team'] = seperated_bl[0]
    data_dict['Bet Line'] = seperated_bl[1]
    data_dict['Game Length'] = game_time_length
    data_dict['Away Team'] = teams[0].text
    data_dict['Home Team'] = teams[1].text
    data_dict['Home Plate'] = format_umps(umpires[0].text)
    data_dict['1st base'] = format_umps(umpires[1].text)
    data_dict['2nd base'] = format_umps(umpires[2].text)

    # Same fallback as get_predictive_stats when the fourth umpire is missing
    try:
        data_dict['3rd base'] = format_umps(umpires[3].text)
    except IndexError:
        data_dict['3rd base'] = "NONE"

    return data_dict

In [14]:
def get_predictive_stats(url: str):
    """
    Download a box score page and return the fields used for prediction.

    url: address of the ESPN box score page.

    Returns a dictionary with the keys GameID, Date, Location, Stadium,
    'Away Team', 'Home Team', AWAY_P and HOME_P (name of the first pitcher
    listed for each team), 'Home Plate', '1st base', '2nd base', '3rd base'
    ("NONE" when no fourth umpire entry can be read) and NRFI (1 when the
    first line score column of both teams adds up to 0, else 0).

    Returns an empty dictionary when the page has no 'Boxscore__Team'
    sections (treated as a postponed game), and None when the request does
    not come back with status 200 (an error line is printed).
    """

    # Get the html code from the webpage
    response = requests.get(url)

    # The request could not be made
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return None

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # The game on this day was postponed, return an empty dictionary
    if not soup.find_all('div', class_='Boxscore__Team'):
        return {}

    # Get the data. The betting line is not part of the result, so it is no
    # longer looked up here (a page without one used to fail for no reason).
    location = soup.find('span', {'class': 'Location__Text'}).text.strip()
    formatted_date = format_date(soup.find('div', {'class': 'n8 GameInfo__Meta'}).find('span').text)
    # Attribute filter written as a dict; it used to be a set literal ({'class', ...})
    stadium = soup.find('div', {'class': 'n6 clr-gray-03 GameInfo__Location__Name'}).text.strip()
    teams = soup.find_all('h2', {'class': 'ScoreCell__TeamName ScoreCell__TeamName--displayName truncate db'})
    umpire_list = soup.find('ul', class_='GameInfo__List list inline-flex flex-wrap')
    umpires = umpire_list.find_all('li', class_='GameInfo__List__Item')

    # Get the away and home team's pitching statistics
    away_pitchers = get_pitcher_statistics(soup, "away")
    home_pitchers = get_pitcher_statistics(soup, "home")

    # Hold the data
    data_dict = {}

    # Add the variables to a dictionary. get_game_id reads the same date, team
    # and game-note elements, so the id is built in one place only.
    data_dict['GameID'] = get_game_id(soup)
    data_dict['Date'] = formatted_date
    data_dict['Location'] = location
    data_dict['Stadium'] = stadium
    data_dict['Away Team'] = teams[0].text
    data_dict['Home Team'] = teams[1].text
    data_dict['AWAY_P'] = away_pitchers[0]['NAME']
    data_dict['HOME_P'] = home_pitchers[0]['NAME']
    data_dict['Home Plate'] = format_umps(umpires[0].text)
    data_dict['1st base'] = format_umps(umpires[1].text)
    data_dict['2nd base'] = format_umps(umpires[2].text)

    # Fall back to "NONE" when the fourth umpire entry is missing
    try:
        data_dict['3rd base'] = format_umps(umpires[3].text)
    except IndexError:
        data_dict['3rd base'] = "NONE"

    # Get the html content of the by inning scores and get the inning data
    table_rows = soup.find_all('tbody')[1].find_all('tr')
    away_team = [td.text for td in table_rows[0].find_all('td')]
    home_team = [td.text for td in table_rows[1].find_all('td')]

    # Get the total runs scored in the first inning
    first_inning_rs = int(away_team[0]) + int(home_team[0])

    # If there were no runs scored, set the nrfi value to 1
    data_dict['NRFI'] = 1 if first_inning_rs == 0 else 0

    return data_dict