Universal Functions

In [2]:
def print_table(matrix):
    for row in matrix:
        for element in row:
            print(element, end='\t')  # Separate elements by a tab (or any delimiter you prefer)
        print()  # Move to the next line for the next row

# retry mechanism
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def make_request(url):
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url, headers = headers)
        response.raise_for_status()
        return response.text
    except:
        return ''

def modern_style_scrape(link):
    response = make_request(link)
    soup = BeautifulSoup(response, 'html.parser')

    if response == '':
        # print(link)
        driver.get(link)
        # driver.implicitly_wait(2) # wait a bit
        page_source = driver.page_source
        soup = BeautifulSoup(page_source)
        driver.quit()

    individual_stats = soup.find('section', id='individual-stats')
    tables = individual_stats.find_all('table')

    score_table = soup.find('table')
    score_table = score_table.find_all('td')
    for i in range(1, len(score_table)):
        try:
            x = int(score_table[i+1].text)
        except:
            ascore = float(score_table[i].text)
            home_team = score_table[i+1].find_all('span', class_='hide-on-small-down')[0].get_text().strip().lower()
            break

    away_team = score_table[0].find_all('span', class_='hide-on-small-down')[0].get_text().strip().lower()
    hscore = float(score_table[-1].text)

    home_team = home_team.replace("Winner", "")
    away_team = away_team.replace("Winner", "")

    if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
        tex_win = "Win"
    elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
        tex_win = "Loss"
    else:
        tex_win = "Tie"

    # get date
    big_html = soup.text
    date_index = big_html.find('Date:')
    date_endex = big_html.find('Site:')
    date = big_html[date_index + 6: date_endex].strip()
    date = datetime.strptime(date, "%m/%d/%Y")

    # make gameid
    gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    if home_team == 'texas':
        tex_pass = tables[1]
        tex_rush = tables[3]
        tex_rec = tables[5]
    else:
        tex_pass = tables[0]
        tex_rush = tables[2]
        tex_rec = tables[4]

    tex_pass_stats = tex_pass.find_all('td')
    for i in range(len(tex_pass_stats)):  # convert passers to text
        tex_pass_stats[i] = tex_pass_stats[i].text.strip()
    passer_temp = []
    tex_pass_stats_final = []
    for i in range(len(tex_pass_stats)):
        passer_temp.append(tex_pass_stats[i])
        if len(passer_temp)/8 == 1:
            tex_pass_stats_final.append(passer_temp)
            passer_temp = []
    for i in range(len(tex_pass_stats_final)):
        for j in range(1, len(tex_pass_stats_final[i])):
            tex_pass_stats_final[i][j] = float(tex_pass_stats_final[i][j])
    tex_pass_stats_final = pd.DataFrame(tex_pass_stats_final)
    tex_pass_stats_final.columns = ['Player', 'Completions', 'Pass Attempts', 'Pass Yards', 'Passing TDs', 'Interceptions', 'Longest Pass', 'Sacks Taken']

    tex_rush_stats = tex_rush.find_all('td')
    for i in range(len(tex_rush_stats)):  # convert passers to text
        tex_rush_stats[i] = tex_rush_stats[i].text.strip()
    rusher_temp = []
    tex_rush_stats_final = []
    for i in range(len(tex_rush_stats)):
        rusher_temp.append(tex_rush_stats[i])
        if len(rusher_temp)/8 == 1:
            tex_rush_stats_final.append(rusher_temp)
            rusher_temp = []
    for i in range(len(tex_rush_stats_final)):
        for j in range(1, len(tex_rush_stats_final[i])):
            tex_rush_stats_final[i][j] = float(tex_rush_stats_final[i][j])
    tex_rush_stats_final = pd.DataFrame(tex_rush_stats_final)
    tex_rush_stats_final.columns = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

    tex_rec_stats = tex_rec.find_all('td')
    for i in range(len(tex_rec_stats)):  # convert passers to text
        tex_rec_stats[i] = tex_rec_stats[i].text.strip()
    recer_temp = []
    tex_rec_stats_final = []
    for i in range(len(tex_rec_stats)):
        recer_temp.append(tex_rec_stats[i])
        if len(recer_temp)/5 == 1:
            tex_rec_stats_final.append(recer_temp)
            recer_temp = []
    for i in range(len(tex_rec_stats_final)):
        for j in range(1, len(tex_rec_stats_final[i])):
            tex_rec_stats_final[i][j] = float(tex_rec_stats_final[i][j])
    tex_rec_stats_final = pd.DataFrame(tex_rec_stats_final)
    tex_rec_stats_final.columns = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']

    full_game_stats = pd.merge(
        tex_pass_stats_final, tex_rush_stats_final, how = "outer", on = "Player")

    full_game_stats = pd.merge(
        full_game_stats, tex_rec_stats_final, how = 'outer', on = "Player")

    full_game_stats = full_game_stats.fillna(0)

    full_game_stats['GameID'] = gameid
    full_game_stats['Date'] = date
    full_game_stats['Home Team'] = home_team
    full_game_stats['Away Team'] = away_team
    full_game_stats['Home Score'] = hscore
    full_game_stats['Away Score'] = ascore
    full_game_stats['Texas Result'] = tex_win
    full_game_stats['Link'] = link

    return full_game_stats

Years 1947 - 2007 (excluding the three games in 98) -> stored in master_stats_1

In [2]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# use selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
service = Service()
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()

# retry mechanism
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def make_request(url):
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url, headers = headers)
        response.raise_for_status()
        return response.text
    except:
        return ''

# set up master dataframe
stats = {
    'Player': [],
    'Completions': [],
    'Pass Attempts': [],
    'Interceptions': [],
    'Pass Yards': [],
    'Passing TDs': [],
    'Longest Pass': [],
    'Sacks Taken': [],
    'Rush Attempts': [],
    'Rush Yards Gained': [],
    'Rush Yards Lost': [],
    'Net Rush Yards': [],
    'Rushing TDs': [],
    'Longest Rush': [],
    'Yards Per Rush': [],
    'Catches': [],
    'Receiving Yards': [],
    'Receiving TDs': [],
    'Longest Reception': [],
    'GameID': [],
    'Link': []
}
master_stats_1 = pd.DataFrame(stats)

games = {
    'Home Team' : [],
    'Away Team'	: [],
    'Home Score' : [],
    'Away Score' : [],
    'Texas Result' : [],
    'Box Score' : []
}
master_games_1 = pd.DataFrame(games)

missed_games_1 = []

# get links for each season
years_list = [str(47), str(48)]
for i in range(50, 100):
    years_list.append(str(i))
for i in range(0, 8):
    years_list.append(f"{i:02d}")
# years_list.append('08')

season_links = []
for i in years_list:
    season_link = 'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/' + i + '/teamstat.htm'
    season_links.append(season_link)

for x in tqdm(range(len(season_links)), desc = "Database building..."):
    season = season_links[x] # paste specific season link here when troubleshooting
    year = years_list[x] # change year manually to when troubleshooting
    box_score_links = []
    if year == '98':
        box_score_links = ['https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-ucla.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-msu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-ksu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-ou.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-bu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-nu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-osu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-ttu.htm',
                            'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-tam.htm']
    # elif year == '08':
    #     # 2008 links
    #     box_score_links = ['http://stats.texassports.com/sports/m-footbl/2008-2009/ut2.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut3.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut4.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut5.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut6.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut7.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut8.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut9.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut10.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut11.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut12.html', 
    #                         'http://stats.texassports.com/sports/m-footbl/2008-2009/ut13.html']
    else:
        # open season page
        driver.get(season)
        texas_sports_soup = BeautifulSoup(driver.page_source)
        table = texas_sports_soup.find("table")
        rows = table.tbody.find_all('tr')[1:]
        for row in rows:
            box_score = row.find_all('td')[-1]
            try:
                link_tail_temp = box_score.font.a['href']
                if link_tail_temp == '../../index1919.html':
                    missed_games_1.append(row.find_all('td')[-2].get_text())
                else:
                    built_link = 'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/' + year + '/' + link_tail_temp
                    box_score_links.append(built_link)
            except:
                pass

            # must catch the random 90s TAMU games that are mislinked
            if year in [str(91), str(92), str(93), str(95), str(96)]:
                box_score_links.append('https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/' + year + '/' + 'UT-A&M.HTM')

    for link in box_score_links:
        # print(link_tail, year)        
        # Get full page soup
        # link = 'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/92/ut_nt.htm' # troubleshooting
        response = make_request(link)
        temp_box_soup = BeautifulSoup(response, 'html.parser')

        if response == '':
            driver.get(link)
            # driver.get('https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/06/ut11.htm') # for troubleshooting
            temp_box_soup = BeautifulSoup(driver.page_source)
        temp_text = temp_box_soup.text
        temp_text = temp_text.replace("Texas Longhorns", "Texas")
        temp_text = temp_text.replace('TEXAS', 'Texas')

        if year == '06':
            # Get away team
            away_index = 0
            away_endex = temp_text.find(' vs ')
            away_team = temp_text[away_index: away_endex].strip().lower()

            # Get home team
            home_index = temp_text.find(' vs ')
            home_endex = temp_text.find(' (')
            home_team = temp_text[home_index + 4: home_endex].strip().lower()

            # Get game date
            date_index = temp_text.find('(')
            date_endex = temp_text.find(")")
            date = temp_text[date_index + 1: date_endex].strip()
            date = date.replace(",", "")
            date = date[:-4] + ',' + date[-4:]
            date = date.replace("Sept", "Sep")
            date = date.replace(" ", "")
            try:
                date = datetime.strptime(date, "%b%d,%Y")
            except ValueError:
                date = datetime.strptime(date, "%b.%d,%Y")
            
            # get away score
            temp_index = temp_text.find('Score by Quarters     1  2  3  4   Score')
            temp_text_new = temp_text[temp_index:]
            temp_index = temp_text_new.find('\n')
            temp_text_new = temp_text_new[temp_index + 1:]
            temp_index = temp_text_new.find('\n')
            temp_text_new = temp_text_new[temp_index + 1:]
            ascore_index = temp_text_new.find(' - ') + 3
            ascore_endex = temp_text_new.find('Record: ')
            ascore = temp_text_new[ascore_index:ascore_endex].strip()
            ascore = float(ascore)

            # get home score
            temp_text_new = temp_text_new[ascore_endex + 1:]
            hscore_index = temp_text_new.find(' - ') + 3
            hscore_endex = temp_text_new.find('Record: ')
            hscore = temp_text_new[hscore_index:hscore_endex].strip()
            hscore = float(hscore)
            
        else:
            # Get game date
            date_index = temp_text.find('Date: ')
            date_endex = temp_text.find("Site: ")
            date = temp_text[date_index + 6: date_endex].strip()
            if date == "0ct 10, 1959":
                date = "Oct 10, 1959"
            date = date.replace(",", "")
            date = date[:-4] + ',' + date[-4:]
            date = date.replace("Sept", "Sep")
            date = date.replace(" ", "")
            try:
                date = datetime.strptime(date, "%b%d,%Y")
            except ValueError:
                date = datetime.strptime(date, "%b.%d,%Y")      

            # Get away team
            away_index = 0
            away_endex = temp_text.find(' vs ')
            away_team = temp_text[away_index: away_endex].strip().lower()

            # Get home team
            home_index = temp_text.find(' vs ')
            home_endex = temp_text.find(' (')
            home_team = temp_text[home_index + 4: home_endex].strip().lower()

            # get away score
            temp_index = temp_text.find('Score by Quarters     1  2  3  4   Score')
            temp_text_new = temp_text[temp_index:]
            temp_index = temp_text_new.find('\n')
            temp_text_new = temp_text_new[temp_index + 1:]
            temp_index = temp_text_new.find('\n')
            temp_text_new = temp_text_new[temp_index + 1:]
            ascore_index = temp_text_new.find(' - ') + 3
            ascore_endex = temp_text_new.find('\n')
            ascore = temp_text_new[ascore_index:ascore_endex].strip()
            ascore = float(ascore)

            # get home score
            temp_text_new = temp_text_new[ascore_endex + 1:]
            hscore_index = temp_text_new.find(' - ') + 3
            hscore_endex = temp_text_new.find('\n')
            hscore = temp_text_new[hscore_index:hscore_endex].strip()
            hscore = float(hscore)

        # did texas win?
        if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
            tex_win = "Win"
        elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
            tex_win = "Loss"
        else:
            tex_win = "Tie"

        # get UT box score text
        temp = temp_box_soup.find('font', string = "Individual Statistics")
        temp = temp.find_next('font', string = "Individual Statistics")
        temp = temp.find_next('pre').text
        temp = temp.replace('Texas Longhorns', 'Texas')
        temp = temp.replace('TEXAS', 'Texas')
        start = temp.find('\nTexas\n')
        temp = temp[start:]
        end = temp.find('Punting               No.  Yds   Avg Long In20')
        temp = temp[:end]

        # Truncate box score for rushing stats
        rush_start = temp.find('Rushing              No Gain Loss  Net TD Lg  Avg')
        rush_end = temp.find('Passing              ') - 2
        rush_temp = temp[rush_start:rush_end]

        # Get rushing stats
        rush_stats = []    
        line_break = rush_temp.find('\n')
        header = rush_temp[0:line_break].split()
        rush_stats.append(header)
        rush_temp = rush_temp[line_break:]
        line_break = rush_temp.find('\n')
        rush_temp = rush_temp[line_break + 1:]

        # Now rush temp has no header
        line_break = rush_temp.find('\n')
        rush_temp = rush_temp[line_break + 1:]
        while True:
            line_break = rush_temp.find('\n')
            line = rush_temp[0:line_break + 1]
            game_stat = line.split()
            if rush_temp.find('\n') == -1:
                line = rush_temp
                game_stat = line.split()          
                rush_stats.append(game_stat)
                rush_temp = rush_temp[line_break + 1:]
                break
            else:
                # combines first n name columns
                while len(game_stat) > 8: 
                    name = game_stat[0] + ' ' + game_stat[1]
                    stats = game_stat[2:]
                    game_stat = [name] + stats
                rush_stats.append(game_stat)
                rush_temp = rush_temp[line_break + 1:]
        
        # Truncate box score for passing stats
        pass_start = temp.find('Passing              ')
        pass_end = temp.find('Receiving             No.  Yds   TD Long') - 2
        pass_temp = temp[pass_start:pass_end]

        # Get passing stats
        pass_stats = []    
        line_break = pass_temp.find('\n')
        header = pass_temp[0:line_break].split()
        pass_stats.append(header)
        pass_temp = pass_temp[line_break:]
        line_break = pass_temp.find('\n')
        pass_temp = pass_temp[line_break + 1:]

        # Now pass temp has no header
        line_break = pass_temp.find('\n')
        pass_temp = pass_temp[line_break + 1:]
        while True:
            line_break = pass_temp.find('\n')
            line = pass_temp[0:line_break + 1]
            game_stat = line.split()
            if pass_temp.find('\n') == -1:
                line = pass_temp
                game_stat = line.split()
                pass_stats.append(game_stat)
                pass_temp = pass_temp[line_break + 1:]
                break
            else:
                # combines first n name columns
                while len(game_stat) > 6: 
                    name = game_stat[0] + ' ' + game_stat[1]
                    stats = game_stat[2:]
                    game_stat = [name] + stats
                pass_stats.append(game_stat)
                pass_temp = pass_temp[line_break + 1:]
        
        # Truncate box score for rec stats
        rec_start = temp.find('Receiving             No.  Yds   TD Long')
        rec_temp = temp[rec_start:-2]

        # Get rec stats
        rec_stats = []    
        line_break = rec_temp.find('\n')
        header = rec_temp[0:line_break].split()
        rec_stats.append(header)
        rec_temp = rec_temp[line_break:]
        line_break = rec_temp.find('\n')
        rec_temp = rec_temp[line_break + 1:]

        # Now rec temp has no header
        line_break = rec_temp.find('\n')
        rec_temp = rec_temp[line_break + 1:]
        while True:
            line_break = rec_temp.find('\n')
            line = rec_temp[0:line_break + 1]
            game_stat = line.split()
            if rec_temp.find('\n') == -1:
                line = rec_temp
                game_stat = line.split()
                rec_stats.append(game_stat)
                rec_temp = rec_temp[line_break + 1:]
                break
            else:
                # combines first n name columns
                while len(game_stat) > 5: 
                    name = game_stat[0] + ' ' + game_stat[1]
                    stats = game_stat[2:]
                    game_stat = [name] + stats
                rec_stats.append(game_stat)
                rec_temp = rec_temp[line_break + 1:]

        ##############################################################
        # Now that we have the stats in 2d lists, we need to make sure they aren't just strings

        # First, we must address the formatting of the passing cmp-att-int format
        for row in pass_stats:
            new_element = row.pop(1).split('-')
            row[1:1] = new_element
        
        # Next, we must make sure the elements are floats and not strings
        frames = [pass_stats, rush_stats, rec_stats]
        for frame in frames:
            for i in range(1,len(frame)):
                for j in range(1,len(frame[i])):
                    frame[i][j] = float(frame[i][j])
                    
        # Now, we make the arrays into dataframes using panda
        # admittedly i shouldve done this earlier but oh well
        rush_data = pd.DataFrame(rush_stats[1:])
        rush_data.columns = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

        pass_data = pd.DataFrame(pass_stats[1:])
        if 40 < float(year) < 89:
            pass_data.columns = ['Player', 'Completions', 'Pass Attempts', 'Interceptions', 'Pass Yards', 'Passing TDs', 'Longest Pass', 'Sacks Taken']
        else:
            pass_data.columns = ['Player', 'Pass Attempts', 'Completions', 'Interceptions', 'Pass Yards', 'Passing TDs', 'Longest Pass', 'Sacks Taken']


        rec_data = pd.DataFrame(rec_stats[1:])
        rec_data.columns = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']
        
        # Finally, time to merge the data into one full dataframe for the full game
        full_game_data = pd.merge(
            pass_data, rush_data, how = "outer", on = "Player"
        )
        full_game_data = pd.merge(
            full_game_data, rec_data, how = 'outer', on = "Player"
        )
        full_game_data = full_game_data.fillna(0)

        # small thing but i want to take the ellipsis out of the totals category
        full_game_data = full_game_data.replace('Totals...', 'Total')

        # now make the gameid
        gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)
        gameid_list = [gameid]

        '''
        game_df = {'Home Team' : [home_team],
                'Away Team' : [away_team],
                'Home Score' : [hscore],
                'Away Score' : [ascore],
                'Texas Result' : [tex_win],
                'Box Score' : [full_game_data]
                }
        
        game_df = pd.DataFrame(game_df, index = gameid_list)

        # finally append it to the master games
        master_games = pd.concat([master_games, game_df], ignore_index = True)

        # empty out game_df
        game_df = pd.DataFrame()
        '''

        ##############################################################
        # the last thing I want to do is to create one large dataframe with every single game performance ever
        # this will contain duplicate players for their different performances in different games
        # much less concise, much more usefull (probably)
        # first add gameid column
        full_game_data['GameID'] = gameid
        full_game_data['Date'] = date
        full_game_data['Home Team'] = home_team
        full_game_data['Away Team'] = away_team
        full_game_data['Home Score'] = hscore
        full_game_data['Away Score'] = ascore
        full_game_data['Texas Result'] = tex_win
        full_game_data['Link'] = link
        
        # now add it to the master stats
        master_stats_1 = pd.concat([master_stats_1, full_game_data], ignore_index = True)

        # finally empty out the full game dataframe
        full_game_data = pd.DataFrame() 
        
# print(master_stats_1)
master_stats_1.to_csv('master_stats_1.csv', index = False)
# master_games.to_csv('master_games.csv', index = False)

driver.quit()

Database building...: 100%|██████████| 60/60 [03:13<00:00,  3.22s/it]


get stats from blue table style (3 games in 98 and 2009 -2014) 
note: i have skipped 2008 for the moment, also missing vs arkansas 2014 and vs kansas 2009 vs nebraska 2009
-> stored in master_stats_2

In [3]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# use selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
service = Service()
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()

# retry mechanism
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def make_request(url):
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url, headers = headers)
        response.raise_for_status()
        return response.text
    except:
        return ''

# set up master dataframe
stats = {
    'Player': [],
    'Completions': [],
    'Pass Attempts': [],
    'Interceptions': [],
    'Pass Yards': [],
    'Passing TDs': [],
    'Longest Pass': [],
    'Sacks Taken': [],
    'Rush Attempts': [],
    'Rush Yards Gained': [],
    'Rush Yards Lost': [],
    'Net Rush Yards': [],
    'Rushing TDs': [],
    'Longest Rush': [],
    'Yards Per Rush': [],
    'Catches': [],
    'Receiving Yards': [],
    'Receiving TDs': [],
    'Longest Reception': [],
    'GameID': [],
    'Link': []
}
master_stats_2 = pd.DataFrame(stats)

games = {
    'Home Team' : [],
    'Away Team'	: [],
    'Home Score' : [],
    'Away Score' : [],
    'Texas Result' : [],
    'Box Score' : []
}
master_games_2 = pd.DataFrame(games)

missed_games_2 = []

driver.get('https://texassports.com/sports/2013/7/21/FB_0721134841.aspx?id=131')
texas_sports_soup = BeautifulSoup(driver.page_source)
table_list = texas_sports_soup.find_all("table")

game_links = []
# 1998 links
rows = table_list[24].tbody.find_all('tr')[2:]
for row in rows:
    box_score = row.find_all('td')[-1]
    try:
        link_temp = box_score.a['href']
        game_links.append(link_temp)
    except:
        pass

game_links = ['https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-isu.htm',
              'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-ru.htm',
              'https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/98/ut-nmsu.htm']

driver.get('https://texassports.com/sports/2013/7/21/FB_0721134841.aspx?id=131')
texas_sports_soup = BeautifulSoup(driver.page_source)
table_list = texas_sports_soup.find_all("table")

# links for 2009-2014
for table in table_list[8:14]:
    # get year
    # temp = table.find('td').get_text()
    # year_index = temp.find('\n')
    # year = temp[year_index - 4:year_index]

    # get box score links
    rows = table.tbody.find_all('tr')[3:]
    for row in rows:
        box_score = row.find_all('td')[-1]
        try:
            link_temp = box_score.find_all('a')[1]['href']
            game_links.append(link_temp)
        except:
            pass

game_links.remove('http://texassports.com/boxscore.aspx?path=football&id=8599')
game_links.remove('http://stats.texassports.com/sports/m-footbl/2009-2010/ut11.html')
game_links.remove('http://stats.texassports.com/sports/m-footbl/2009-2010/big12fb.html')

for i in tqdm(range(len(game_links)), desc = "Database building..."):
# for i in range(len(game_links)):
    # Get full page soup
    link = game_links[i]
    response = make_request(link)
    temp_box_soup = BeautifulSoup(response, 'html.parser')

    if response == '':
        driver.get(link)
        # driver.get('https://stats.texassports.com/custompages/sports/m-footbl/2009-2010/ut11.html') # for troubleshooting
        temp_box_soup = BeautifulSoup(driver.page_source)
    
    temp_text = temp_box_soup.text
    temp_text = temp_text.replace("Texas Longhorns", "Texas")
    temp_text = temp_text.replace('TEXAS', 'Texas')

    # Get game date
    date_index = temp_text.find('Date: ')
    date_endex = temp_text.find("Site: ")
    date = temp_text[date_index + 6: date_endex - 3].strip()
    date = date.replace(",", "")
    date = date[:-4] + ',' + date[-4:]
    date = date.replace("Sept", "Sep")
    date = date.replace(" ", "")
    try:
        date = datetime.strptime(date, "%b%d,%Y")
    except ValueError:
        date = datetime.strptime(date, "%b.%d,%Y")      

    tables = temp_box_soup.find_all("table")
    score_table = tables[2]
    away_t_row = 1
    home_t_row = 2
    
    away_t_col = 0
    home_t_col = 0

    away_s_col = 5
    home_s_col = 5

    home_team = score_table.find_all('tr')[home_t_row].find_all('td')[home_t_col].text.strip().lower()
    away_team = score_table.find_all('tr')[away_t_row].find_all('td')[away_t_col].text.strip().lower()
    ascore = float(score_table.find_all('tr')[away_t_row].find_all('td')[away_s_col].text.strip())
    hscore = float(score_table.find_all('tr')[home_t_row].find_all('td')[home_s_col].text.strip())

    # did texas win?
    if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
        tex_win = "Win"
    elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
        tex_win = "Loss"
    else:
        tex_win = "Tie"     

    if home_team == 'texas':
        rush_table = tables[10]
        pass_table = tables[12]
        rec_table = tables[14]
    else:
        rush_table = tables[9]
        pass_table = tables[11]
        rec_table = tables[13]
    
    # rush stats
    rush_2d = []
    temp = []
    for row in rush_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            rush_2d.append(temp)
        temp = []

    rush_col = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

    rush_data = pd.DataFrame(rush_2d)
    rush_data.columns = rush_col

    # pass stats
    pass_2d = []
    temp = []
    for row in pass_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            pass_2d.append(temp)
        temp = []

    for row in pass_2d:
        new_element = row.pop(1).split('-')
        row[1:1] = new_element

    pass_col = ['Player', 'Completions', 'Pass Attempts', 'Interceptions', 'Pass Yards', 'Passing TDs', 'Longest Pass', 'Sacks Taken']

    pass_data = pd.DataFrame(pass_2d)
    pass_data.columns = pass_col

    # pass stats
    rec_2d = []
    temp = []
    for row in rec_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            rec_2d.append(temp)
        temp = []

    rec_col = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']
            
    frames = [pass_2d, rush_2d, rec_2d]
    for frame in frames:
        for i in range(0,len(frame)):
            for j in range(1,len(frame[i])):
                frame[i][j] = float(frame[i][j])

    rec_data = pd.DataFrame(rec_2d)
    rec_data.columns = rec_col

    full_game_data = pd.merge(pass_data, rush_data, how = "outer", on = "Player")
    full_game_data = pd.merge(full_game_data, rec_data, how = 'outer', on = "Player")
    full_game_data = full_game_data.fillna(0)

    full_game_data = full_game_data.replace('Totals...', 'Total')

    gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)
    gameid_list = [gameid]

    full_game_data['GameID'] = gameid
    full_game_data['Date'] = date
    full_game_data['Home Team'] = home_team
    full_game_data['Away Team'] = away_team
    full_game_data['Home Score'] = hscore
    full_game_data['Away Score'] = ascore
    full_game_data['Texas Result'] = tex_win
    full_game_data['Link'] = link

    master_stats_2 = pd.concat([master_stats_2, full_game_data], ignore_index = True)

    # finally empty out the full game dataframe
    full_game_data = pd.DataFrame() 

master_stats_2.to_csv('master_stats_2.csv', index = False)
# master_games.to_csv('master_games.csv', index = False)

driver.quit()

Database building...: 100%|██████████| 73/73 [01:05<00:00,  1.12it/s]


Get Links from UT master results (list for post 2015)

In [4]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time

# use selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
service = Service()
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()


driver.get('https://texassports.com/sports/2013/7/21/FB_0721134841.aspx?id=131')
texas_sports_soup = BeautifulSoup(driver.page_source)
driver.quit()
table_list = texas_sports_soup.find_all("table")

# link dictionary
link_dict = {}
for table in table_list[:8]:
    # get year
    temp = table.find('td').get_text()
    year_index = temp.find('\n')
    year = temp[year_index - 4:year_index]

    # get box score links
    rows = table.tbody.find_all('tr')[2:]
    box_score_links = []
    for row in rows:
        box_score = row.find_all('td')[-1]
        try:
            link_temp = box_score.a['href']
            box_score_links.append(link_temp)
        except:
            pass
    
    link_dict[year] = box_score_links

ut website scrape (2015-2022) missing la tech 2019 (plus http://texassports.com/boxscore.aspx?path=football&id=8599)

In [5]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# use selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
service = Service()
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()

# retry mechanism
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def make_request(url):
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url, headers = headers)
        response.raise_for_status()
        return response.text
    except:
        return ''

driver.get('https://texassports.com/sports/2013/7/21/FB_0721134841.aspx?id=131')
texas_sports_soup = BeautifulSoup(driver.page_source)
table_list = texas_sports_soup.find_all("table")

# set up master dataframe
stats = {
    'Player': [],
    'Completions': [],
    'Pass Attempts': [],
    'Interceptions': [],
    'Pass Yards': [],
    'Passing TDs': [],
    'Longest Pass': [],
    'Sacks Taken': [],
    'Rush Attempts': [],
    'Rush Yards Gained': [],
    'Rush Yards Lost': [],
    'Net Rush Yards': [],
    'Rushing TDs': [],
    'Longest Rush': [],
    'Yards Per Rush': [],
    'Catches': [],
    'Receiving Yards': [],
    'Receiving TDs': [],
    'Longest Reception': [],
    'GameID': [],
    'Link': []
}
master_stats_3 = pd.DataFrame(stats)

# link dictionary
links_2015_2022 = []
for table in table_list[:8]:
    # get year
    temp = table.find('td').get_text()
    year_index = temp.find('\n')
    year = temp[year_index - 4:year_index]

    # get box score links
    rows = table.tbody.find_all('tr')[2:]
    for row in rows:
        box_score = row.find_all('td')[-1]
        try:
            link_temp = box_score.a['href']
            links_2015_2022.append(link_temp)
        except:
            pass

# deal with mislinked la tech 2019
links_2015_2022[links_2015_2022.index('hthttps://texassports.com/boxscore.aspx?path=football&id=12601')] = 'https://texassports.com/boxscore.aspx?path=football&id=12601'

# texas vs arkansas 2014
links_2015_2022.append('http://texassports.com/boxscore.aspx?path=football&id=8599')

for i in tqdm(range(len(links_2015_2022)), desc = "Database building..."):
# for i in range(len(links_2015_2022)):
    url = links_2015_2022[i]
    # driver.get(url)
    # driver.implicitly_wait(2) # wait a bit
    # page_source = driver.page_source
    # soup = BeautifulSoup(page_source)

    response = make_request(url)
    soup = BeautifulSoup(response, 'html.parser')

    if response == '':
        # print(url)
        driver.get(url)
        # driver.implicitly_wait(2) # wait a bit
        page_source = driver.page_source
        soup = BeautifulSoup(page_source)

    individual_stats = soup.find('section', id='individual-stats')
    tables = individual_stats.find_all('table')

    score_table = soup.find('table')
    score_table = score_table.find_all('td')
    for i in range(1, len(score_table)):
        try:
            x = int(score_table[i+1].text)
        except:
            ascore = float(score_table[i].text)
            home_team = score_table[i+1].find_all('span', class_='hide-on-small-down')[0].get_text().strip().lower()
            break

    away_team = score_table[0].find_all('span', class_='hide-on-small-down')[0].get_text().strip().lower()
    hscore = float(score_table[-1].text)

    home_team = home_team.replace("Winner", "")
    away_team = away_team.replace("Winner", "")

    if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
        tex_win = "Win"
    elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
        tex_win = "Loss"
    else:
        tex_win = "Tie"

    # get date
    big_html = soup.text
    date_index = big_html.find('Date:')
    date_endex = big_html.find('Site:')
    date = big_html[date_index + 6: date_endex].strip()
    date = datetime.strptime(date, "%m/%d/%Y")

    # make gameid
    gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    if home_team == 'texas':
        tex_pass = tables[1]
        tex_rush = tables[3]
        tex_rec = tables[5]
    else:
        tex_pass = tables[0]
        tex_rush = tables[2]
        tex_rec = tables[4]

    tex_pass_stats = tex_pass.find_all('td')
    for i in range(len(tex_pass_stats)):  # convert passers to text
        tex_pass_stats[i] = tex_pass_stats[i].text.strip()
    passer_temp = []
    tex_pass_stats_final = []
    for i in range(len(tex_pass_stats)):
        passer_temp.append(tex_pass_stats[i])
        if len(passer_temp)/8 == 1:
            tex_pass_stats_final.append(passer_temp)
            passer_temp = []
    for i in range(len(tex_pass_stats_final)):
        for j in range(1, len(tex_pass_stats_final[i])):
            tex_pass_stats_final[i][j] = float(tex_pass_stats_final[i][j])
    tex_pass_stats_final = pd.DataFrame(tex_pass_stats_final)
    tex_pass_stats_final.columns = ['Player', 'Completions', 'Pass Attempts', 'Pass Yards', 'Passing TDs', 'Interceptions', 'Longest Pass', 'Sacks Taken']

    tex_rush_stats = tex_rush.find_all('td')
    for i in range(len(tex_rush_stats)):  # convert passers to text
        tex_rush_stats[i] = tex_rush_stats[i].text.strip()
    rusher_temp = []
    tex_rush_stats_final = []
    for i in range(len(tex_rush_stats)):
        rusher_temp.append(tex_rush_stats[i])
        if len(rusher_temp)/8 == 1:
            tex_rush_stats_final.append(rusher_temp)
            rusher_temp = []
    for i in range(len(tex_rush_stats_final)):
        for j in range(1, len(tex_rush_stats_final[i])):
            tex_rush_stats_final[i][j] = float(tex_rush_stats_final[i][j])
    tex_rush_stats_final = pd.DataFrame(tex_rush_stats_final)
    tex_rush_stats_final.columns = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

    tex_rec_stats = tex_rec.find_all('td')
    for i in range(len(tex_rec_stats)):  # convert passers to text
        tex_rec_stats[i] = tex_rec_stats[i].text.strip()
    recer_temp = []
    tex_rec_stats_final = []
    for i in range(len(tex_rec_stats)):
        recer_temp.append(tex_rec_stats[i])
        if len(recer_temp)/5 == 1:
            tex_rec_stats_final.append(recer_temp)
            recer_temp = []
    for i in range(len(tex_rec_stats_final)):
        for j in range(1, len(tex_rec_stats_final[i])):
            tex_rec_stats_final[i][j] = float(tex_rec_stats_final[i][j])
    tex_rec_stats_final = pd.DataFrame(tex_rec_stats_final)
    tex_rec_stats_final.columns = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']

    full_game_stats = pd.merge(
        tex_pass_stats_final, tex_rush_stats_final, how = "outer", on = "Player")

    full_game_stats = pd.merge(
        full_game_stats, tex_rec_stats_final, how = 'outer', on = "Player")

    full_game_stats = full_game_stats.fillna(0)

    full_game_stats['GameID'] = gameid
    full_game_stats['Date'] = date
    full_game_stats['Home Team'] = home_team
    full_game_stats['Away Team'] = away_team
    full_game_stats['Home Score'] = hscore
    full_game_stats['Away Score'] = ascore
    full_game_stats['Texas Result'] = tex_win
    full_game_stats['Link'] = url

    # now add it to the master stats
    master_stats_3 = pd.concat([master_stats_3, full_game_stats], ignore_index = True)

    # finally empty out the full game dataframe
    full_game_data = pd.DataFrame() 

master_stats_3.to_csv("master_stats_3.csv", index = False)

driver.quit()

Database building...: 100%|██████████| 100/100 [03:13<00:00,  1.93s/it]


get glitched blue box scores kansas and nebraska 2009 (probably wouldn't have bothered with these if I knew it was just two games but its nice not to do manually)

In [27]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# set up master dataframe
stats = {
    'Player': [],
    'Completions': [],
    'Pass Attempts': [],
    'Interceptions': [],
    'Pass Yards': [],
    'Passing TDs': [],
    'Longest Pass': [],
    'Sacks Taken': [],
    'Rush Attempts': [],
    'Rush Yards Gained': [],
    'Rush Yards Lost': [],
    'Net Rush Yards': [],
    'Rushing TDs': [],
    'Longest Rush': [],
    'Yards Per Rush': [],
    'Catches': [],
    'Receiving Yards': [],
    'Receiving TDs': [],
    'Longest Reception': [],
    'GameID': [],
    'Link': []
}
master_stats_4 = pd.DataFrame(stats)

link_list = ['https://stats.texassports.com/custompages/sports/m-footbl/2009-2010/big12fb.html', # nebraska 2009
             'https://stats.texassports.com/custompages/sports/m-footbl/2009-2010/ut11.html'] # kansas 2009

for i in tqdm(range(len(link_list)), desc = "Database building..."):
    link = link_list[i]
    response = make_request(link)
    temp_box_soup = BeautifulSoup(response, 'html.parser')

    if response == '':
        driver.get(link)
        # driver.get('https://stats.texassports.com/custompages/sports/m-footbl/2009-2010/ut11.html') # for troubleshooting
        temp_box_soup = BeautifulSoup(driver.page_source)

    temp_text = temp_box_soup.text
    temp_text = temp_text.replace("Texas Longhorns", "Texas")
    temp_text = temp_text.replace('TEXAS', 'Texas')

    # Get game date
    date_index = temp_text.find('Date: ')
    date_endex = temp_text.find("Site: ")
    date = temp_text[date_index + 6: date_endex - 3].strip()
    date = date.replace(",", "")
    date = date[:-4] + ',' + date[-4:]
    date = date.replace("Sept", "Sep")
    date = date.replace(" ", "")
    try:
        date = datetime.strptime(date, "%b%d,%Y")
    except ValueError:
        date = datetime.strptime(date, "%b.%d,%Y")      

    tables = temp_box_soup.find_all("table")
    score_table = tables[3]
    away_t_row = 1
    home_t_row = 2

    away_t_col = 0
    home_t_col = 0

    away_s_col = 5
    home_s_col = 5

    home_team = score_table.find_all('tr')[home_t_row].find_all('td')[home_t_col].text.strip().lower()
    away_team = score_table.find_all('tr')[away_t_row].find_all('td')[away_t_col].text.strip().lower()
    ascore = float(score_table.find_all('tr')[away_t_row].find_all('td')[away_s_col].text.strip())
    hscore = float(score_table.find_all('tr')[home_t_row].find_all('td')[home_s_col].text.strip())

    # did texas win?
    if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
        tex_win = "Win"
    elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
        tex_win = "Loss"
    else:
        tex_win = "Tie"     

    if home_team == 'texas':
        rush_table = tables[11]
        pass_table = tables[13]
        rec_table = tables[15]
    else:
        rush_table = tables[10]
        pass_table = tables[12]
        rec_table = tables[14]

    # rush stats
    rush_2d = []
    temp = []
    for row in rush_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            rush_2d.append(temp)
        temp = []

    rush_col = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

    rush_data = pd.DataFrame(rush_2d)
    rush_data.columns = rush_col

    # pass stats
    pass_2d = []
    temp = []
    for row in pass_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            pass_2d.append(temp)
        temp = []

    for row in pass_2d:
        new_element = row.pop(1).split('-')
        row[1:1] = new_element

    pass_col = ['Player', 'Completions', 'Pass Attempts', 'Interceptions', 'Pass Yards', 'Passing TDs', 'Longest Pass', 'Sacks Taken']

    pass_data = pd.DataFrame(pass_2d)
    pass_data.columns = pass_col

    # pass stats
    rec_2d = []
    temp = []
    for row in rec_table.find_all('tr')[1:]:
        for val in row.find_all('td'):
            temp.append(val.text.strip())
        if temp[0] == '':
            pass
        else:
            rec_2d.append(temp)
        temp = []

    rec_col = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']
            
    frames = [pass_2d, rush_2d, rec_2d]
    for frame in frames:
        for i in range(0,len(frame)):
            for j in range(1,len(frame[i])):
                frame[i][j] = float(frame[i][j])

    rec_data = pd.DataFrame(rec_2d)
    rec_data.columns = rec_col

    full_game_data = pd.merge(pass_data, rush_data, how = "outer", on = "Player")
    full_game_data = pd.merge(full_game_data, rec_data, how = 'outer', on = "Player")
    full_game_data = full_game_data.fillna(0)

    full_game_data = full_game_data.replace('Totals...', 'Total')

    gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)
    gameid_list = [gameid]

    full_game_data['GameID'] = gameid
    full_game_data['Date'] = date
    full_game_data['Home Team'] = home_team
    full_game_data['Away Team'] = away_team
    full_game_data['Home Score'] = hscore
    full_game_data['Away Score'] = ascore
    full_game_data['Texas Result'] = tex_win
    full_game_data['Link'] = link

    numeric_cols = full_game_data.select_dtypes(include='number').columns
    full_game_data[numeric_cols] = full_game_data[numeric_cols].astype(float)

    # now add it to the master stats
    master_stats_4 = pd.concat([master_stats_4, full_game_data], ignore_index = True)

    # finally empty out the full game dataframe
    full_game_data = pd.DataFrame() 

master_stats_4.to_csv("master_stats_4.csv", index = False)

Database building...: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


2008 (i do not like 2008
        i do not like it in a gate
            i do not like it when it skate
                i do not like 2008)

In [86]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# set up master dataframe
stats = {
    'Player': [],
    'Completions': [],
    'Pass Attempts': [],
    'Interceptions': [],
    'Pass Yards': [],
    'Passing TDs': [],
    'Longest Pass': [],
    'Sacks Taken': [],
    'Rush Attempts': [],
    'Rush Yards Gained': [],
    'Rush Yards Lost': [],
    'Net Rush Yards': [],
    'Rushing TDs': [],
    'Longest Rush': [],
    'Yards Per Rush': [],
    'Catches': [],
    'Receiving Yards': [],
    'Receiving TDs': [],
    'Longest Reception': [],
    'GameID': [],
    'Link': []
}
master_stats_5 = pd.DataFrame(stats)

link_list = ['http://stats.texassports.com/sports/m-footbl/2008-2009/ut1.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut2.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut3.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut4.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut5.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut6.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut7.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut8.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut9.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut10.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut11.html',
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut12.html',            
             'http://stats.texassports.com/sports/m-footbl/2008-2009/ut13.html']

# for i in tqdm(range(len(link_list)), desc = "Database building..."):
for i in range(len(link_list)):
    # Get full page soup
    link = link_list[i]
    print(link)
    response = make_request(link)
    temp_box_soup = BeautifulSoup(response, 'html.parser')

    if response == '':
        driver.get(link)
        # driver.get('https://stats.texassports.com/custompages/sports/m-footbl/archive/stats/06/ut11.htm') # for troubleshooting
        temp_box_soup = BeautifulSoup(driver.page_source)
    temp_text = temp_box_soup.text
    temp_text = temp_text.replace("Texas Longhorns", "Texas")
    temp_text = temp_text.replace('TEXAS', 'Texas')

    # Get game date
    date_index = temp_text.find('Date: ')
    date_endex = temp_text.find("Site: ")
    date = temp_text[date_index + 6: date_endex].strip()
    if date == "0ct 10, 1959":
        date = "Oct 10, 1959"
    date = date.replace(",", "")
    date = date[:-4] + ',' + date[-4:]
    date = date.replace("Sept", "Sep")
    date = date.replace(" ", "")
    try:
        date = datetime.strptime(date, "%b%d,%Y")
    except ValueError:
        date = datetime.strptime(date, "%b.%d,%Y")      

    # Get away team
    away_index = temp_text.find('Photo Gallery')
    away_endex = temp_text.find(' vs ')
    away_team = temp_text[away_index + 13: away_endex].strip().lower()

    # Get home team
    home_index = temp_text.find(' vs ')
    home_endex = temp_text.find(' (')
    home_team = temp_text[home_index + 4: home_endex].strip().lower()

    # get away score
    temp_index = temp_text.find('Score by Quarters     1  2  3  4   Score')
    temp_text_new = temp_text[temp_index:]
    temp_index = temp_text_new.find('\n')
    temp_text_new = temp_text_new[temp_index + 1:]
    temp_index = temp_text_new.find('\n')
    temp_text_new = temp_text_new[temp_index + 1:]
    ascore_index = temp_text_new.find(' - ') + 3
    ascore_endex = temp_text_new.find('\n')
    ascore = temp_text_new[ascore_index:ascore_endex].strip()
    ascore = float(ascore)

    # get home score
    temp_text_new = temp_text_new[ascore_endex + 1:]
    hscore_index = temp_text_new.find(' - ') + 3
    hscore_endex = temp_text_new.find('\n')
    hscore = temp_text_new[hscore_index:hscore_endex].strip()
    hscore = float(hscore)

    # did texas win?
    if (home_team == 'texas' and hscore > ascore) or (away_team == 'texas' and hscore < ascore):
        tex_win = "Win"
    elif (home_team == 'texas' and hscore < ascore) or (away_team == 'texas' and hscore > ascore):
        tex_win = "Loss"
    else:
        tex_win = "Tie"

    # get UT box score text
    temp = temp_box_soup.text
    index = temp.find("Individual Statistics")
    temp = temp[index + 21:]
    index = temp.find("Individual Statistics")
    temp = temp[index + 21:]
    index = temp.find("Individual Statistics")
    temp = temp[index + 21:]
    temp = temp.replace('Texas Longhorns', 'Texas')
    temp = temp.replace('TEXAS', 'Texas')

    start = temp.find('Texas')
    temp = temp[start + 5:]
    start = temp.find('Texas')
    temp = temp[start + 5:]
    start = temp.find('Texas')
    temp = temp[start + 5:]
    end = temp.find('Punting               No.  Yds   Avg Long In20')
    temp = temp[start:end]

    # Truncate box score for rushing stats
    rush_start = temp.find('Rushing              No Gain Loss  Net TD Lg  Avg')
    rush_end = temp.find('Passing              ') - 2
    rush_temp = temp[rush_start:rush_end]

    # Get rushing stats
    rush_stats = []    
    line_break = rush_temp.find('\n')
    header = rush_temp[0:line_break].split()
    rush_stats.append(header)
    rush_temp = rush_temp[line_break:]
    line_break = rush_temp.find('\n')
    rush_temp = rush_temp[line_break + 1:]

    # Now rush temp has no header
    line_break = rush_temp.find('\n')
    rush_temp = rush_temp[line_break + 1:]
    while True:
        line_break = rush_temp.find('\n')
        line = rush_temp[0:line_break + 1]
        game_stat = line.split()
        if rush_temp.find('\n') == -1:
            line = rush_temp
            game_stat = line.split()          
            rush_stats.append(game_stat)
            rush_temp = rush_temp[line_break + 1:]
            break
        else:
            # combines first n name columns
            while len(game_stat) > 8: 
                name = game_stat[0] + ' ' + game_stat[1]
                stats = game_stat[2:]
                game_stat = [name] + stats
            rush_stats.append(game_stat)
            rush_temp = rush_temp[line_break + 1:]
    
    # Truncate box score for passing stats
    pass_start = temp.find('Passing              ')
    pass_end = temp.find('Receiving             No.  Yds   TD Long') - 3
    pass_temp = temp[pass_start:pass_end]

    # Get passing stats
    pass_stats = []    
    line_break = pass_temp.find('\n')
    header = pass_temp[0:line_break].split()
    pass_stats.append(header)
    pass_temp = pass_temp[line_break:]
    line_break = pass_temp.find('\n')
    pass_temp = pass_temp[line_break + 1:]

    # Now pass temp has no header
    line_break = pass_temp.find('\n')
    pass_temp = pass_temp[line_break + 1:]
    while True:
        line_break = pass_temp.find('\n')
        line = pass_temp[0:line_break + 1]
        game_stat = line.split()
        if pass_temp.find('\n') == -1:
            line = pass_temp
            game_stat = line.split()
            pass_stats.append(game_stat)
            pass_temp = pass_temp[line_break + 1:]
            break
        else:
            # combines first n name columns
            while len(game_stat) > 6: 
                name = game_stat[0] + ' ' + game_stat[1]
                stats = game_stat[2:]
                game_stat = [name] + stats
            pass_stats.append(game_stat)
            pass_temp = pass_temp[line_break + 1:]
    
    # Truncate box score for rec stats
    rec_start = temp.find('Receiving             No.  Yds   TD Long')
    rec_temp = temp[rec_start:-2]

    # Get rec stats
    rec_stats = []    
    line_break = rec_temp.find('\n')
    header = rec_temp[0:line_break].split()
    rec_stats.append(header)
    rec_temp = rec_temp[line_break:]
    line_break = rec_temp.find('\n')
    rec_temp = rec_temp[line_break + 1:]

    # Now rec temp has no header
    line_break = rec_temp.find('\n')
    rec_temp = rec_temp[line_break + 1:]
    while True:
        line_break = rec_temp.find('\n')
        line = rec_temp[0:line_break + 1]
        game_stat = line.split()
        if rec_temp.find('\n') == -1:
            line = rec_temp
            game_stat = line.split()
            rec_stats.append(game_stat)
            rec_temp = rec_temp[line_break + 1:]
            break
        else:
            # combines first n name columns
            while len(game_stat) > 5: 
                name = game_stat[0] + ' ' + game_stat[1]
                stats = game_stat[2:]
                game_stat = [name] + stats
            rec_stats.append(game_stat)
            rec_temp = rec_temp[line_break + 1:]

    ##############################################################
    # Now that we have the stats in 2d lists, we need to make sure they aren't just strings

    print_table(pass_stats)
    print_table(rush_stats)
    print_table(rec_stats)

    # First, we must address the formatting of the passing cmp-att-int format
    for row in pass_stats:
        new_element = row.pop(1).split('-')
        row[1:1] = new_element
    
    # Next, we must make sure the elements are floats and not strings
    frames = [pass_stats, rush_stats, rec_stats]
    for frame in frames:
        for i in range(1,len(frame)):
            for j in range(1,len(frame[i])):
                frame[i][j] = float(frame[i][j])
                
    # Now, we make the arrays into dataframes using panda
    # admittedly i shouldve done this earlier but oh well
    rush_data = pd.DataFrame(rush_stats[1:])
    rush_data.columns = ['Player', 'Rush Attempts', 'Rush Yards Gained', 'Rush Yards Lost', 'Net Rush Yards', 'Rushing TDs', 'Longest Rush', 'Yards Per Rush']

    pass_data = pd.DataFrame(pass_stats[1:])
    pass_data.columns = ['Player', 'Pass Attempts', 'Completions', 'Interceptions', 'Pass Yards', 'Passing TDs', 'Longest Pass', 'Sacks Taken']

    rec_data = pd.DataFrame(rec_stats[1:])
    rec_data.columns = ['Player', 'Catches', 'Receiving Yards', 'Receiving TDs', 'Longest Reception']
    
    # Finally, time to merge the data into one full dataframe for the full game
    full_game_data = pd.merge(
        pass_data, rush_data, how = "outer", on = "Player"
    )
    full_game_data = pd.merge(
        full_game_data, rec_data, how = 'outer', on = "Player"
    )
    full_game_data = full_game_data.fillna(0)

    # small thing but i want to take the ellipsis out of the totals category
    full_game_data = full_game_data.replace('Totals...', 'Total')

    # now make the gameid
    gameid = away_team.replace(" ", "").lower() + '_' + home_team.replace(" ", "").lower() + '_' + str(date.month) + '_' + str(date.day) + '_' + str(date.year)
    gameid_list = [gameid]

    '''
    game_df = {'Home Team' : [home_team],
            'Away Team' : [away_team],
            'Home Score' : [hscore],
            'Away Score' : [ascore],
            'Texas Result' : [tex_win],
            'Box Score' : [full_game_data]
            }
    
    game_df = pd.DataFrame(game_df, index = gameid_list)

    # finally append it to the master games
    master_games = pd.concat([master_games, game_df], ignore_index = True)

    # empty out game_df
    game_df = pd.DataFrame()
    '''

    ##############################################################
    # the last thing I want to do is to create one large dataframe with every single game performance ever
    # this will contain duplicate players for their different performances in different games
    # much less concise, much more usefull (probably)
    # first add gameid column
    full_game_data['GameID'] = gameid
    full_game_data['Date'] = date
    full_game_data['Home Team'] = home_team
    full_game_data['Away Team'] = away_team
    full_game_data['Home Score'] = hscore
    full_game_data['Away Score'] = ascore
    full_game_data['Texas Result'] = tex_win
    full_game_data['Link'] = link

    # now add it to the master stats
    master_stats_5 = pd.concat([master_stats_5, full_game_data], ignore_index = True)

    # finally empty out the full game dataframe
    full_game_data = pd.DataFrame() 
        
# print(master_stats_1)
master_stats_5.to_csv('master_stats_5.csv', index = False)
# master_games.to_csv('master_games.csv', index = False)

driver.quit()

http://stats.texassports.com/sports/m-footbl/2008-2009/ut1.html
Passing	Att-Cmp-Int	Yds	TD	Long	Sack	
Smith, Rusty	31-15-1	253	1	62	0	
VanCamp, Jeff	8-2-0	2	0	5	0	
Totals...	39-17-1	255	1	62	0	
Rushing	No	Gain	Loss	Net	TD	Lg	Avg	
Edgecomb, D.	4	19	0	19	0	11	4.8	
Floyd, Willie	4	16	1	15	0	15	3.8	
Rose, Willie	3	16	1	15	0	13	5.0	
VanCamp, Jeff	1	6	0	6	0	6	6.0	
Pierre, Charles	6	10	7	3	0	6	0.5	
Blanchard, Jeff	2	3	1	2	0	3	1.0	
Morris, Alfred	1	0	0	0	0	0	0.0	
Smith, Rusty	1	0	0	0	0	0	0.0	
TEAM	1	0	23	-23	0	0	-23.0	
Totals...	23	70	33	37	0	15	1.6	

Receiving	No.	Yds	TD	Long	
Grant, Jamari	4	93	0	62	
Gent, Cortez	3	59	0	33	
Housler, Rob	2	53	1	33	
Bonner, Chris	2	27	0	15	
Rose, Willie	2	24	0	22	
Fick, Carl	1	5	0	5	
Jean, Lester	1	0	0	0	
Johnson, C.	1	-3	0	0	
Edgecomb, D.	1	-3	0	0	
Totals...	17	255	1	62	

http://stats.texassports.com/sports/m-footbl/2008-2009/ut2.html
Passing	Att-Cmp-Int	Yds	TD	Long	Sack	
McCoy, Colt	29-20-1	282	4	39	1	
Totals...	29-20-1	282	4	39	1	


Receiving	No.	Yds	TD	Lon

ValueError: Length mismatch: Expected axis has 0 elements, new values have 8 elements

things i need to fix:
- north texas glitch [done]
- la tech 2019 (maybe this will run now) [done]
- 3 games in 1998 [done]
- arkansas 2014 [done]
- add link to blue tables and new format [done]
- 2008, kansas 2009, nebraska 2009 [2008: X, K2009: Y, N2009: Y]
- performance_id instead of game_id

In [24]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import warnings
from IPython.display import display, HTML
from tqdm import tqdm
import time
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# fill n/as and float the numbers
manual_input_stats = pd.read_csv("texas manual input stats - Sheet1.csv")
manual_input_stats = manual_input_stats.fillna(float(0))
numeric_cols = manual_input_stats.select_dtypes(include='number').columns
manual_input_stats[numeric_cols] = manual_input_stats[numeric_cols].astype(float)

print(manual_input_stats)

   First Name   Last Name  Completions  Pass Attempts  Interceptions  \
0        Colt       McCoy         32.0           41.0            0.0   
1     Garrett     Gilbert          0.0            1.0            0.0   
2         0.0       Total         32.0           42.0            0.0   
3        Tre'      Newton          0.0            0.0            0.0   
4    Vondrell       McGee          0.0            0.0            0.0   
5        Cody     Johnson          0.0            0.0            0.0   
6    Foswhitt   Whittaker          0.0            0.0            0.0   
7    Marquise     Goodwin          0.0            0.0            0.0   
8      Jordan     Shipley          0.0            0.0            0.0   
9       James  Kirkendoll          0.0            0.0            0.0   
10    Malcolm    Williams          0.0            0.0            0.0   
11       Tre'      Newton          0.0            0.0            0.0   
12   Marquise     Goodwin          0.0            0.0           

merge the master stats

In [7]:
import pandas as pd

master_stats_1 = pd.read_csv('master_stats_1.csv')
master_stats_2 = pd.read_csv('master_stats_2.csv')
master_stats_3 = pd.read_csv('master_stats_3.csv')

master_stats = pd.concat([master_stats_1, master_stats_2, master_stats_3], ignore_index=True)
master_stats.to_csv('master_stats.csv', index = False)

clean up
- give first and last name columns
- strip the names
- put names in front

In [13]:
import pandas as pd

def split_names(row):
    full_name = row['Last Name']
    if pd.notna(full_name) and pd.isna(row['First Name']):
        names = full_name.split()
        if len(names) == 2:
            row['First Name'] = names[0]
            row['Last Name'] = names[1]
    return row

master_stats_test = pd.read_csv('master_stats.csv')

# who the fuck did this to johnny walker 4 times i do not understand
master_stats_test['Player'].replace('Walker. Johnny', 'Walker, Johnny', inplace = True)

# split on commas
master_stats_test[['Last Name', 'First Name']] = master_stats_test['Player'].str.split(pat=',', n=1, expand=True)
master_stats_test = master_stats_test.drop('Player', axis=1)

# strip the names
master_stats_test['Last Name'] = master_stats_test['Last Name'].str.strip()
master_stats_test['First Name'] = master_stats_test['First Name'].str.strip()

# fix players with "First Last" Format
master_stats_test = master_stats_test.apply(split_names, axis=1) 

# order the cols
front_columns = ['First Name', 'Last Name']
master_stats_test = master_stats_test[front_columns + [col for col in master_stats_test.columns if col not in front_columns]]

# change totals to total
master_stats_test['Last Name'].replace("Total", "Game", inplace = True)
master_stats_test['Last Name'].replace("Totals", "Game", inplace = True)

master_stats_test.to_csv('master_stats_test.csv', index = False)