In [2]:
import re
import sys
import time
import requests
import numpy as np
import pandas as pd
from sys import exit
from math import floor

import sqlalchemy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from sqlalchemy.exc import ProgrammingError 
from webdriver_manager.chrome import ChromeDriverManager

### 1. Opens the website of the IPL Season

In [4]:
def extract_matches(year):
    """Collect the scorecard and commentary links of one IPL season.

    Opens the season index page in the global ``driver_1`` browser, clicks the
    element with class ``toggle``, waits three seconds and parses the
    resulting HTML.

    Parameters
    ----------
    year : value formatted into the ``season=`` part of the index URL.

    Returns
    -------
    (scorecard, commentary) : two lists of equal length. ``scorecard`` holds
        the ``href`` of every ``<a>`` whose text is exactly ``Scorecard``;
        ``commentary`` holds the same links with ``scorecard`` replaced by
        ``commentary``.
    """
    season_url = 'https://www.espncricinfo.com/ci/engine/series/index.html?search=indian+premier+league;season={};view=season'.format(year)
    driver_1.get(season_url)                 # Opens the webpage
    driver_1.maximize_window()               # Opens the window to the maximum size

    toggle = driver_1.find_element(by=By.CLASS_NAME, value="toggle")
    toggle.click()
    time.sleep(3)                            # Fixed wait before reading the page source

    page = BeautifulSoup(driver_1.page_source, 'html.parser')
    scorecard = [anchor['href'] for anchor in page.find_all('a', string='Scorecard')]
    commentary = [href.replace('scorecard', 'commentary') for href in scorecard]
    return scorecard, commentary

### 2. Extract Players list from given table

In [5]:
def extract_players(table, team):
    """Build the player list of one batting side from its scorecard table.

    Parameters
    ----------
    table : scorecard table element. Only ``table.tbody.find_all('tr')`` is
        used; each row must offer ``find_all('td')`` and each cell ``.text``.
    team : element whose ``.text`` is the team name.

    Returns
    -------
    (players, last_name, first_name)
        ``players`` is ``[team name, batters..., did-not-bat players...,
        [(runs, wickets)]]`` - the innings total is appended as the last item.
        ``last_name`` / ``first_name`` are parallel to ``players`` without that
        trailing total (index 0 belongs to the team name). Names are split on
        spaces and hyphens: the first piece is the first name, the rest joined
        by spaces is the last name.

    Raises
    ------
    ValueError
        If the table contains no ``TOTAL`` row (previously this surfaced as an
        ``UnboundLocalError`` on the local holding the total).
    """
    players = [team.text]
    total = None
    for row in table.tbody.find_all('tr'):
        columns = row.find_all('td')
        if not columns:                                   # Row without cells: nothing to read
            continue
        cell = columns[0].text
        if not cell:
            continue

        if 'Did not bat:' in cell:                        # Takes the players from the list who didn't bat
            remaining = cell.split(': ')[1].split('\xa0')
            remaining.pop()                               # Drops the piece after the last '\xa0'
            for entry in remaining:
                players.append(" ".join(entry.split(',')[0].split()))

        elif 'TOTAL' in cell:
            total = [tuple(map(int, columns[2].text.split('/')))]
            if len(total[0]) < 2:
                # Only the runs are shown: treat it as 0 wickets when fewer than
                # three batters were listed so far, otherwise as all out (10).
                wickets = 0 if len(players) < 4 else 10
                total = [(total[0][0], wickets)]

        elif 'Extras' not in cell and 'Fall of wickets' not in cell:
            name = cell.split('\xa0')[0]                  # Keeps only the part before the first '\xa0'
            players.append(" ".join(name.split()))        # Takes the players from the list who have batted

    if total is None:
        raise ValueError('No TOTAL row found in the scorecard table')

    last_name = [' '.join(re.split(' |-', sub)[1:]) for sub in players]
    first_name = [re.split(' |-', sub)[0] for sub in players]
    players.append(total)
    if 'Muralidaran' in last_name:                        # Hard-coded alias used for this surname
        last_name[last_name.index('Muralidaran')] = 'Murali'
    return players, last_name, first_name

### 3. Extract content from scorecard

In [6]:
def extract_contents(table, match_id):                         # Extract the content table on scorecard
    """Read the match-details table of a scorecard into a dict.

    Parameters
    ----------
    table : details table element. Only ``table.tbody.find_all('tr')`` is used;
        each row must offer ``find_all('td')`` and each cell ``.text``.
    match_id : stored unchanged under ``'Match_Id'``.

    Returns
    -------
    dict with ``'Match_Id'`` and, when the matching rows are present:
        ``'Stadium'``    - text before the first comma of the first
                           single-cell row,
        ``'POTM'``       - value of the 'Player Of The Match' row,
        ``'Toss'``       - ``'<winner>, Bat'`` or ``'<winner>, Bowl'``,
        ``'Match_days'`` - pieces of the value split on '-', with the second
                           piece reduced to its first word (title-cased) and
                           the last piece dropped.

    Rows are classified by their number of cells. A Toss or Match days value
    that does not have the expected shape is skipped; it can no longer be
    mistaken for the single-cell stadium row (the earlier ``except IndexError``
    fallback did that).
    """
    contents = {'Match_Id': match_id}
    not_req = ['Series', 'Season', 'Hours of play (local time)', 'Reserve Umpire', 'Match Referee',
               'Umpires', 'TV Umpire', 'Points', 'T20 debut', 'Series result', 'Player Of The Series']  # Information not reqd

    for row in table.tbody.find_all('tr'):         # Finds the rows in a table
        columns = row.find_all('td')               # Finds the columns in a row
        if not columns:
            continue

        if len(columns) == 1:                      # Because Stadium just has 1 column
            if 'Stadium' not in contents:
                contents['Stadium'] = columns[0].text.split(',')[0]
            continue

        value_text = columns[1].text
        key = columns[0].text
        if key in not_req:
            continue

        if key == 'Player Of The Match':
            contents['POTM'] = value_text

        elif key == 'Toss':
            value = value_text.split(',')
            if len(value) < 2:                     # No decision part after the winner
                continue
            value[1] = 'Bat' if 'bat' in value[1] else 'Bowl'
            contents[key] = ', '.join(value)

        elif key == 'Match days':
            value = value_text.split('-')
            if len(value) < 2 or not value[1].split():
                continue
            value[0] = value[0].strip()
            value[1] = value[1].split()[0].title()
            value.pop()
            contents['Match_days'] = ', '.join(value)

    return contents

### 4. Extract players of bowling team when one innings is washed out

In [7]:
def one_innings(team, members):
    """Build the player list of a side from a '\\xa0'-separated team string.

    ``members`` is split on '\\xa0'. Empty pieces and pieces containing
    '(c)' or '†' are dropped; every remaining piece is cut at its first comma.

    Parameters
    ----------
    team : team name, stored as the first entry of every returned list's source.
    members : string holding the members of the side.

    Returns
    -------
    (players, last_name, first_name) : ``players`` starts with ``team``;
        ``first_name`` holds the first whitespace-separated word of each entry
        and ``last_name`` the remaining words joined by single spaces.
    """
    players = [team]
    for piece in members.split('\xa0'):
        if piece and '(c)' not in piece and '†' not in piece:
            players.append(piece.split(',')[0])

    words = [entry.split() for entry in players]
    last_name = [' '.join(parts[1:]) for parts in words]
    first_name = [parts[0] for parts in words]
    return players, last_name, first_name

### 5. Extracts data from scorecard

In [43]:
def extract_scorecard(scorecard):
    """Scrape one scorecard page and store its squads and match details.

    The match id is the path segment that follows ``scorecard/`` in the URL.
    If the ``match_details`` database table already holds a row for that id
    the function returns without loading the page.

    Side effects
    ------------
    * Loads ``scorecard`` in the global ``driver_1`` browser.
    * When at least one scorecard table exists, rebinds the globals
      ``players_1/2``, ``last_name_1/2`` and ``first_name_1/2``. When none
      exists these globals are left as they were.
    * Rebinds the global ``contents`` and appends it as a new row to the
      global ``match_details`` DataFrame.

    Parameters
    ----------
    scorecard : URL of the scorecard page.

    Raises
    ------
    ValueError
        If the id taken from the URL is not an integer. The id is validated
        before it is interpolated into the SQL text.
    """
    global players_1, players_2, contents, last_name_1, last_name_2, first_name_1, first_name_2, match_details

    # Converting to int up front guarantees that only a number reaches the query below.
    match_id = int(scorecard.split('scorecard/')[1].split('/')[0])
    try:
        sql = "select * from match_details where Match_Id = {};".format(match_id)
        df = pd.read_sql(sql, conn)
        if len(df) > 0:
            return
    except ProgrammingError:
        # NOTE(review): presumably raised while the table does not exist yet - confirm.
        pass

    driver_1.get(scorecard)
    soup = BeautifulSoup(driver_1.page_source, 'html.parser')

    table = soup.find_all('table', attrs={'class':'ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table'})
    if table:                         # If atleast a ball was bowled
        team = soup.find_all('span', attrs={'class': 'ds-text-title-xs ds-font-bold ds-capitalize'})
        players_1, last_name_1, first_name_1 = extract_players(table[0], team[0])
        if len(table) > 1:            # If both the innings started
            players_2, last_name_2, first_name_2 = extract_players(table[1], team[1])
        else:                         # If one innings was played
            blocks = soup.find_all('div', attrs={'class':'ds-text-tight-s ds-font-regular ds-leading-4'})
            pieces = blocks[-1].text.split(' Team: ')
            players_2, last_name_2, first_name_2 = one_innings(pieces[0], pieces[1])

    details = soup.find('table', attrs={'class': 'ds-w-full ds-table ds-table-sm ds-table-auto'})
    contents = extract_contents(details, match_id)
    match_details = pd.concat([match_details, pd.DataFrame([contents])], ignore_index=True)

### 6. Resolves the full name when several players share the same first name or surname

In [9]:
def double_player(player, list_player, role):
    """Resolve a short name shared by several players using the commentary page.

    Parses the page currently loaded in the global ``driver_2``, walks the
    ball-number spans until the one equal to the global ``ball`` and then
    inspects the neighbouring blocks:

    * ``role == 'bowl'``: the first span of the preceding bowler block.
    * otherwise: the spans of the preceding batter block, and, if that does not
      settle it, the dismissal boxes that follow.

    Every resolved name is appended to the global ``double`` list and returned.

    Parameters
    ----------
    player : the ambiguous short name.
    list_player : accepted for interface compatibility; not used.
    role : ``'bowl'`` for a bowler, anything else is treated as a batter.

    Raises
    ------
    ValueError
        If the ball or the name cannot be found. Running out of elements used
        to end in an ``AttributeError`` on ``None``; it now reaches this error.
    """
    ball_span = {'class': 'ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center'}
    out_box = {'class': 'ds-rounded ds-bg-fill-content-alternate ds-ml-4 lg:ds-ml-3 ds-mt-1 ds-inline-block ds-p-3'}

    def clean(tag):
        # Collapses every run of whitespace in the tag text into a single space
        return " ".join(tag.text.strip().split())

    def remember(name):
        double.append(name)
        return double[-1]

    soup = BeautifulSoup(driver_2.page_source, 'html.parser')
    text = soup.find('span', attrs=ball_span)
    while text is not None and float(text.text) != ball:
        text = text.find_next('span', attrs=ball_span)
    if text is None:
        raise ValueError("Couldn't find the ball on the commentary page")

    if role == 'bowl':
        #Finds the end of the over div
        text = text.find_previous('div', attrs={'class': 'ds-w-1/2 ds-border-l ds-border-line-default-translucent ds-pr-[44px] ds-p-2'})
        if not text:
            if all_matches.iloc[-1, 0] == 1082612:       # Hard-coded exception for this match id
                return remember('Mohit Sharma')
            raise ValueError("Couldn't find the name")
        return remember(clean(text.find_all('span')[0]))

    #Finds the end of the over div
    text = text.find_previous('div', attrs={'class': 'ds-w-1/2 ds-pl-[44px] ds-p-2'})
    if text:
        temp = text.find_all('span')
        if len(temp) > 2 and player in temp[0].text and player in temp[2].text:
            # NOTE(review): these checks read the match id from the FIRST row of
            # all_matches while the bowler branch reads the LAST row - confirm.
            if all_matches.iloc[0, 0] == 980963 and all_matches.iloc[-1, 1] == 2:  # When Mohit Sharma batted before Sandeep Sharma but Sandeep Sharma will have the strike on the next over
                return remember('Mohit Sharma')
            elif all_matches.iloc[0, 0] == 1082609 and all_matches.iloc[-1, 1] == 2:  # When Ishant Sharma batted before Sandeep Sharma but Sandeep Sharma will have the strike on the next over
                return remember('Ishant Sharma')

        if temp and player in temp[0].text:           #If only one batter is present at the end of the over
            new = clean(temp[0])
            # It happens when a new batter has the same name as the player already on the pitch;
            # the second check tells whether the new batter is still on the pitch at the end of the over
            if new in strikers and len(temp) > 2 and player in temp[2].text:
                new = clean(temp[2])
            return remember(new)

        elif len(temp) > 2 and player in temp[2].text:
            return remember(clean(temp[2]))

    #It is done if the batter is out within the over
    if not text:
        text = soup.find('div', attrs=out_box)
    else:
        text = text.find_next('div', attrs=out_box)

    # ' obstructing ' is the marker wicket() uses; ' obstruction ' is kept for compatibility.
    out_type = [' c ', ' st ', ' run ', ' lbw ', ' obstruction ', ' obstructing ', ' hit ', ' b ']
    while text is not None and text.text:
        cur_out = None
        for o in out_type:
            if o in text.text:
                cur_out = o
                break
        temp = text.text.split(cur_out)[0]       # With no marker this splits on whitespace
        if player in temp:
            return remember(temp)
        text = text.find_next('div', attrs=out_box)

    raise ValueError("Couldn't find the name")

### 7. Returns the full name of the player

In [10]:
def full_name(player, team, role):
    """Map a name as written on the page to the full name used in the squad list.

    Lookup order: substitute marker, exact match, "<initial> <surname>",
    nickname, unique surname, unique first name, nickname, and finally the
    nickname key ``"<player>_<team name>"``. Shared surnames / first names are
    resolved through the global ``double`` list or ``double_player`` unless the
    name belongs to a fielder (``role == 'f'``).

    Parameters
    ----------
    player : name to resolve.
    team : 1 selects ``players_1`` / ``last_name_1`` / ``first_name_1``, any
        other value selects the ``*_2`` lists.
    role : ``'f'`` for a fielder; any other value enables ``double_player``.

    Returns
    -------
    The resolved name (``player`` itself when it contains ``'sub '``).

    Raises
    ------
    ValueError
        ``'Human Intervention required'`` when nothing matches; the name is
        printed first.
    """
    if 'sub ' in player:
        return player

    if team == 1:
        squad, surnames, forenames = players_1, last_name_1, first_name_1
    else:
        squad, surnames, forenames = players_2, last_name_2, first_name_2

    if player in squad:
        return player

    words = player.split()
    if len(words) > 1:
        for idx in range(len(surnames)):
            if surnames[idx] == words[1] and words[0][0] == forenames[idx][0]:
                return squad[idx]
        if player in nicknames:
            return nicknames[player]

    if player in surnames:
        if not surnames.count(player) > 1:
            return squad[surnames.index(player)]

        # If two players has the same last name
        if role != 'f':
            if len(double) > 0:
                current = (all_matches.iloc[-1, 0], all_matches.iloc[-1, 1])
                if current in [(1082601, 2), (1082609, 1), (1082612, 2), (1082623, 1), (1082641, 2)]:
                    return double_player(player, squad, role)     # Beacuse they used Sharma for both Mohit and Ishant
                known = [sub for sub in double if player in sub]
                if known:
                    return known[0]
            return double_player(player, squad, role)

        keyed = player + '_' + squad[0]        # For fielder with same name as another player
        if keyed in nicknames:
            return nicknames[keyed]
        print(player)
        raise ValueError('Human Intervention required')

    if player in forenames:
        if not forenames.count(player) > 1:
            return squad[forenames.index(player)]

        # If two players has the same first name
        if role != 'f':
            if len(double) > 0:
                known = [sub for sub in double if player in sub]
                if known:
                    return known[0]
            return double_player(player, squad, role)
        print(player)
        raise ValueError('Human Intervention required')

    if player in nicknames:
        return nicknames[player]

    keyed = player + '_' + squad[0]            # For fielder with same name as another player
    if keyed in nicknames:
        return nicknames[keyed]

    print(player)
    raise ValueError('Human Intervention required')

### 8. Returns run scored on a ball

In [11]:
def run_scored(ball):
    """Translate the symbol shown for a delivery into the runs it produced.

    Parameters
    ----------
    ball : symbol text, e.g. ``'•'``, ``'4'``, ``'1w'``, ``'2nb'``, ``'W'``.

    Returns
    -------
    None
        for ``'W'``.
    int
        for ``'•'`` (0) and for a single-character symbol.
    (int, str)
        for extras: the first character as runs plus one of ``'Wide'``,
        ``'No_Ball'``, ``'Leg_Bye'``, ``'Bye'``, or, for symbols longer than
        three characters, ``'No Ball Bye'`` (contains ``n-b``) or
        ``'No Ball Leg Bye'`` (contains ``n-l``).

    Raises
    ------
    ValueError
        For a long symbol without ``n-b`` / ``n-l``, a non-numeric first
        character, or an unknown extras suffix.
    """
    if ball == 'W':
        return None
    if ball == '•':
        return 0
    if not len(ball) > 1:
        return int(ball)

    if len(ball) > 3:
        if 'n-b' in ball:              # When Bye runs are scored on a no ball
            return (int(ball[0]), 'No Ball Bye')
        if 'n-l' in ball:
            return (int(ball[0]), 'No Ball Leg Bye')
        raise ValueError

    runs = int(ball[0])
    codes = ['w', 'nb', 'lb', 'b']
    labels = ['Wide', 'No_Ball', 'Leg_Bye', 'Bye']
    suffix = ball[1:].lower()
    return (runs, labels[codes.index(suffix)])

### 9. Returns wicket type

In [12]:
def wicket(team, temp, batsman):
    """Classify a dismissal text and name the fielder involved, if any.

    The first marker found in ``temp`` (checked in the order ' c ', ' st ',
    ' run ', ' lbw ', ' obstructing ', ' hit ', ' b ') decides the type.

    Parameters
    ----------
    team : number of the batting side (1 or 2); fielders are resolved with
        ``full_name`` against the other side (``team % 2 + 1``).
    temp : dismissal text, e.g. ``'A Batter c Fielder b Bowler'``.
    batsman : name of the dismissed batter, returned as first item.

    Returns
    -------
    ``(batsman, wicket_type)`` or ``(batsman, wicket_type, fielder)`` where
    ``wicket_type`` is one of 'Caught', 'Caught Behind', 'Stumped', 'Run Out',
    'Bowled', 'LBW', 'Obstructing the field', 'Hit Wicket', 'Other Type'.

    Notes
    -----
    The marker ' hit ' is stripped to ``'hit'`` before the comparison; the
    branch used to compare against an empty string, so hit-wicket dismissals
    were reported as 'Other Type'.
    """
    out_type = [' c ', ' st ', ' run ', ' lbw ', ' obstructing ', ' hit ', ' b ']
    cur_out = next((marker.strip() for marker in out_type if marker in temp), None)
    fielding_team = team % 2 + 1

    if cur_out == 'c':
        person = temp.split(' c ')[1].split(' b ')[0]
        if '&' in person:                                  # 'c & b': the bowler took the catch
            person = temp.split(' c & b ')[1]
            person = re.match(r'[^\d]+', person).group(0)  # Keeps the text before the first digit
            person = person.strip()
        if '†' in person:
            if 'sub' not in person:
                person = person[1:]                        # Drops the leading '†'
            situation = (batsman, 'Caught Behind', full_name(person, fielding_team, 'f'))
        else:
            situation = (batsman, 'Caught', full_name(person, fielding_team, 'f'))

    elif cur_out == 'st':
        person = temp.split(' st ')[1].split(' b ')[0]
        if 'sub' not in person:
            person = person[1:]                            # Drops the leading character (keeper mark)
        situation = (batsman, 'Stumped', full_name(person, fielding_team, 'f'))

    elif cur_out == 'run':
        person = temp.split('(')[1].split(')')[0]          # Fielder(s) are given in brackets
        if person[0] == '†':
            person = person[1:]
        if '/' in person:
            person = person.split('/')[0]                  # Only the first fielder is recorded
        situation = (batsman, 'Run Out', full_name(person, fielding_team, 'f'))

    elif cur_out == 'b':
        situation = (batsman, 'Bowled')

    elif cur_out == 'lbw':
        situation = (batsman, 'LBW')

    elif cur_out == 'obstructing':
        situation = (batsman, 'Obstructing the field')

    elif cur_out == 'hit':
        situation = (batsman, 'Hit Wicket')

    else:
        situation = (batsman, 'Other Type')

    return situation
           

### 10. Required when a batter is retired hurt

In [13]:
def retired_hurt(present, team):
    """Record a 'Retired Hurt' entry found in the commentary text.

    Parses the page loaded in the global ``driver_2``, moves to the ball-number
    span equal to the global ``ball`` and scans the following commentary
    paragraphs for one of the phrases in ``statement``. Paragraphs containing
    ``': "'`` are skipped. When a phrase is found:

    1. the retired batter is identified among ``strikers`` (full name, first
       word, remaining words) and then among the keys of ``nicknames``;
    2. the ball number attached to that paragraph is located, and the number
       of directly following ball spans with the same number is counted;
    3. the matching row of ``all_matches`` gets ``Wicket_Type='Retired Hurt'``
       and ``Player_Dismissed``; later rows get ``Non_Striker = present``;
    4. the name is appended to ``retired`` and ``present`` takes the first
       slot of ``strikers``.

    Parameters
    ----------
    present : name of the batter who replaced the retired one.
    team : not used by this function.

    Raises
    ------
    ValueError
        ``'Human Intervention required'`` if a phrase matched but no batter
        name could be found in the paragraph.

    Notes
    -----
    The count in step 2 used to re-query from the same element on every pass,
    so the loop could never end once entered; it now advances span by span.
    """
    global retired
    statement = [' off he goes', ' Retired hurt', ' walks off the ground', ' left the field', ' leaves the field', ' walking off', ' helped off the field', ' walks off', ' helped off', ' hobbling off', ' going off', ' goes off', ' gone off', ' retiring hurt', ' walk him off', ' retire', ' retires']
    ball_span = {'class': 'ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center'}
    comment = {'class': "ci-html-content"}

    soup = BeautifulSoup(driver_2.page_source, 'html.parser')
    text = soup.find('span', attrs=ball_span)
    check = float(text.text)

    while check != ball:
        text = text.find_next('span', attrs=ball_span)
        check = float(text.text)

    new = text.find_next('p', attrs=comment)
    name = None
    while new:
        if ': "' in new.text:
            new = new.find_next('p', attrs=comment)
            continue

        if any(e in new.text for e in statement):
            for s in strikers:
                if s in new.text:
                    name = s
                    break
            if not name:
                temp = [s.split()[0] for s in strikers]
                for i in range(len(temp)):
                    if temp[i] in new.text:
                        name = strikers[i]
                        break
            if not name:
                temp = [' '.join(s.split()[1:]) for s in strikers]
                for i in range(len(temp)):
                    if temp[i] in new.text:
                        name = strikers[i]
                        break
            if not name:
                for nick in nicknames:
                    if nick in new.text:
                        name = nicknames[nick]
                        break
            if not name:
                raise ValueError('Human Intervention required')

            # To get the ball when the batter got out
            if len(new.attrs['class']) > 1:
                new = new.find_next('span', attrs=ball_span)
            else:
                new = new.find_parent('div', attrs={'class': "lg:hover:ds-bg-ui-fill-translucent ds-hover-parent ds-relative"})
                new = new.find('span', attrs=ball_span)
            out_ball = new.text

            #To check if the previous ball in which he got retired out was an extra
            previous_ball = 0
            check = new.find_next('span', attrs=ball_span)
            while check is not None and check.text == out_ball:
                previous_ball += 1
                check = check.find_next('span', attrs=ball_span)   # Advance, otherwise the loop never ends

            match_id = all_matches.iloc[-1, 0]
            innings = all_matches.iloc[-1, 1]
            k = list(all_matches[(all_matches['Ball']==float(out_ball)) & (all_matches['Match_Id']==match_id) & (all_matches['Innings']==innings)].index)[0]
            k += previous_ball
            all_matches.loc[k, 'Wicket_Type'] = 'Retired Hurt'
            all_matches.loc[k, 'Player_Dismissed'] = name

            retired.append(name)
            all_matches.loc[k+1:, 'Non_Striker'] = present
            if name != strikers[0]:
                strikers[1] = strikers[0]      # The other batter moves to the second slot
            strikers[0] = present
            break

        new = new.find_next('p', attrs=comment)

### 11. Returns special wickets that include runs

In [14]:
def extra_and_out(present_on_pitch, team):
    """Work out who was dismissed on a ball that has no usable dismissal box.

    Used by ``players_present`` when a wicket falls together with runs/extras,
    when the dismissal box is missing, or when the fielder name could not be
    resolved. The dismissed batter is looked up on the scorecard page held by
    ``driver_1`` (fall-of-wickets block), then the batter's scorecard row is
    passed to ``wicket``.

    Parameters
    ----------
    present_on_pitch : sequence whose item ``[1]`` is the result text of the ball.
    team : number of the batting side (1 or 2).

    Returns
    -------
    The tuple returned by ``wicket``; ``None`` implicitly if no scorecard row
    equals the batter's name.

    Side effects
    ------------
    May rebind the global ``run``; mutates ``strikers``, ``retired`` and
    ``all_matches['Non_Striker']``; may call ``retired_hurt``.
    """
    global run
    test = ball                                               # Ball number to search for in the fall of wickets
    match_id = all_matches.iloc[0, 0]                         # Match id is read from the FIRST row of all_matches
    if match_id == 829781 and test == 17.6: # Was out on extra but the ball count was wrong
        test = 17.5
    if 'wide' in present_on_pitch[1]:
        temp = present_on_pitch[1][:3]                        # First three characters of the result text
        temp = temp.replace(' ', '')
        run = run_scored(temp)
        test = round(ball-0.1, 1)                             # Searches one ball earlier than the global ball
    
    elif 'run' in present_on_pitch[1]:
        run = run_scored(present_on_pitch[1][:1])             # Only the first character is used as the runs
    
    elif 'no ball' in present_on_pitch[1]:
        test = round(ball-0.1, 1)
    
    if test == int(test):                                     # Whole numbers are turned into int so that str(test) has no '.0'
        test = int(test)
    url = driver_1.page_source
    soup = BeautifulSoup(url, 'html.parser')
    fall = soup.find_all('div', attrs={'class':'ds-text-tight-s ds-font-regular ds-leading-4'})        # Extracts the fall of wicket
    
    # The position of the wanted block depends on how many blocks of this class the page has
    if len(fall) == 4:
        index = team*2-1
    elif len(fall) == 2:
        index = team - 1
    else:
        index = team*(team - 1)                                # When in 1st innings all the batters batted
    
    # NOTE(review): 'str(test) in span.text' is a substring test, so e.g. '7.1' also matches '17.1' - confirm this cannot pick the wrong span
    for span in fall[index].find_all('span'):                 # It traverses the batters and which ball their wicket fell
        if str(test) in span.text:
            temp = span.text.split('(')[1]
            batsman = full_name(temp.split(',')[0], team, 'bat')
            break
    
    try:
        strikers[strikers.index(batsman)] = None
    # 'batsman' is unbound when the loop above found no span; the resulting UnboundLocalError is used as the signal to retry with fall[team]
    except UnboundLocalError:                                 # When in 1st innings all the batters batted and the striker is in 2nd innings, Also if all the batters batted in the 2nd innings
        for span in fall[team].find_all('span'):                
            if str(test) in span.text:
                temp = span.text.split('(')[1]
                batsman = full_name(temp.split(',')[0], team, 'bat')
                try:
                    strikers[strikers.index(batsman)] = None
                except ValueError:                            # The batter is not in strikers
                    if batsman in retired:
                        all_matches['Non_Striker']=all_matches['Non_Striker'].fillna('Check Required')  # For very few cases when a batter is retired out and comes back on the non striker end
                        retired.remove(batsman)
                    else:
                        all_matches['Non_Striker']=all_matches['Non_Striker'].fillna(batsman)
                    if all(strikers):
                        retired_hurt(batsman, team)
                    break
    
    except ValueError:                                        # The batter is not in strikers
        if batsman in retired:
            all_matches['Non_Striker']=all_matches['Non_Striker'].fillna('Check Required')  # For very few cases when a batter is retired out and comes back on the non striker end
            retired.remove(batsman)
        else:
            all_matches['Non_Striker']=all_matches['Non_Striker'].fillna(batsman)
        if all(strikers):
            retired_hurt(batsman, team)
      
    table = soup.find_all('table', attrs={'class':'ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table'})      # Extracts the scorecard of the team
    for row in table[team-1].tbody.find_all('tr'):
        columns = row.find_all('td')
        temp = columns[0].text.strip()
        temp = ' '.join([t for t in temp.split(' ')])
        temp = temp.split('\xa0')[0]          # To remove extra space between names, if any
        temp = " ".join(temp.split())
        
        if temp == batsman:
            temp = batsman + ' ' + columns[1].text           # Gives the wicket detail of the batsmen
            return wicket(team, temp, batsman)         

### 12. Returns the bowler, batter and fielder (If wicket falls)

In [15]:
def players_present(team, strike_ball):
    """Identify batter and bowler of one delivery and keep ``strikers`` current.

    Parameters
    ----------
    team : number of the batting side (1 or 2).
    strike_ball : commentary element of the delivery; its first ``span`` is
        read for the "<bowler> to <batter>" text and the result of the ball.

    Returns
    -------
    ``(batter, bowler)`` when no wicket fell, otherwise
    ``((batter, bowler), dismissal)`` where ``dismissal`` comes from
    ``wicket`` or ``extra_and_out``.

    Side effects
    ------------
    Mutates the globals ``strikers``, ``retired``, ``double`` and
    ``all_matches``; may call ``retired_hurt``.
    """
    temp = strike_ball.find('span')
    present_on_pitch = []
    for t in temp:                                                    # Iterates over the children of the first span
        present_on_pitch.append(t.text)
    present_on_pitch[0] = present_on_pitch[0].replace(', ', '')       # To remove the comma from the end
    present_on_pitch[1] = present_on_pitch[-1]                        # Sometimes there are more than 2 elements in the list
    
    brief = tuple(map(str, present_on_pitch[0].split(' to ')))        # (bowler text, batter text)
    present_on_pitch[0] = (full_name(brief[1], team, 'bat'),
                           full_name(brief[0], team%2+1, 'bowl'))
    
    # Case 1: both slots of strikers are filled
    if all(strikers):
        if present_on_pitch[0][0] in strikers:
            if present_on_pitch[0][0]!=strikers[0]:                   # The facing batter is moved to slot 0
                strikers[1] = strikers[0]
                strikers[0] = present_on_pitch[0][0]
        else:                                                         # An unknown batter faces although nobody was dismissed
            retired_hurt(present_on_pitch[0][0], team)
    
    # Case 2: both slots are empty
    elif not any(strikers):
        strikers[0] = present_on_pitch[0][0]
        match_id = all_matches.iloc[-1, 0]
        data = all_matches[(all_matches['Ball']<ball) & (all_matches['Innings']==team) & (all_matches['Match_Id']==match_id)]
        out = len(data[(data['Wicket_Type'].notna())]) + 1            # Number of recorded wickets before this ball, plus one
        if team == 1:
            p = players_1
        else:
            p = players_2
        if p[out] == present_on_pitch[0][0]:
            all_matches['Non_Striker'] = all_matches['Non_Striker'].fillna(strikers[0])
        elif present_on_pitch[0][0] in retired:
            # The earlier dismissal entries of this batter for this match are cleared
            all_matches.loc[(all_matches['Player_Dismissed']==strikers[0]) & (all_matches['Match_Id']==match_id), 'Wicket_Type'] = None
            all_matches.loc[(all_matches['Player_Dismissed']==strikers[0]) & (all_matches['Match_Id']==match_id), 'Player_Dismissed'] = None
            retired.remove(present_on_pitch[0][0])
            all_matches['Non_Striker'] = all_matches['Non_Striker'].fillna('Check Required')  # For very few cases when a batter is retired hurt and comes back after two wickets have fallen
    
    # Case 3: exactly one slot is filled
    else:
        if present_on_pitch[0][0] not in strikers:
            if strikers[0]:
                strikers[1] = strikers[0]
                strikers[0] = present_on_pitch[0][0]
            else:
                strikers[0] = present_on_pitch[0][0]
                
            if present_on_pitch[0][0] in retired:
                retired.remove(present_on_pitch[0][0])
                match_id = all_matches.iloc[-1, 0]
                all_matches.loc[(all_matches['Player_Dismissed']==strikers[0]) & (all_matches['Match_Id']==match_id), 'Wicket_Type'] = None
                all_matches.loc[(all_matches['Player_Dismissed']==strikers[0]) & (all_matches['Match_Id']==match_id), 'Player_Dismissed'] = None
                all_matches['Non_Striker']=all_matches['Non_Striker'].fillna('Check Required')  # For very few cases when a batter is retired out and comes back on the non striker end
            else:
                all_matches['Non_Striker']=all_matches['Non_Striker'].fillna(strikers[0]) # Fills the none spots when the new batter is known 
        
        elif not strikers[0]:                                         # The known batter sits in slot 1: move it to slot 0
            strikers[0] = strikers[1]
            strikers[1] = None 
            
    if present_on_pitch[1]!='OUT' and 'OUT' in present_on_pitch[1]:                         # When a batsman is out and the team also scored runs
        return (present_on_pitch[0], extra_and_out(present_on_pitch, team))
    
    if present_on_pitch[1] == 'OUT':
        result = strike_ball.find('div', attrs={'class': 'ds-rounded ds-bg-fill-content-alternate ds-ml-4 lg:ds-ml-3 ds-mt-1 ds-inline-block ds-p-3'})
        try:
            temp = result.text
        except AttributeError:                         # When the out statement is not given
            return (present_on_pitch[0], extra_and_out(present_on_pitch, team))
        # NOTE(review): wicket() looks for ' obstructing ' while this list has ' obstruction ' - confirm which text the page uses
        out_type = [' c ', ' st ', ' run ', ' lbw ', ' obstruction ', ' hit ', ' b ']
        cur_out = None
        for o in out_type:
            if o in temp:
                cur_out = o
                break
        batsman = full_name(temp.split(cur_out)[0], team, 'bat')      # Text before the marker; with cur_out None the split is on whitespace
        try:
            strikers[strikers.index(batsman)] = None
            if batsman in double:
                double.remove(batsman)
        except ValueError:                                             # It is raised when a batter is run out without playing a ball
            if batsman in retired:
                all_matches['Non_Striker']=all_matches['Non_Striker'].fillna('Check Required')  # For very few cases when a batter is retired out and comes back on the non striker end
                retired.remove(batsman)
            else:
                all_matches['Non_Striker']=all_matches['Non_Striker'].fillna(batsman)
            if all(strikers):
                retired_hurt(batsman, team)
                
        try:
            return (present_on_pitch[0], wicket(team, temp, batsman))
        except ValueError:                                             # It is raised when there is some issue on the name of the fielder
            strikers[strikers.index(None)] = batsman                   # Puts the batter back before the scorecard lookup
            return (present_on_pitch[0], extra_and_out(present_on_pitch, team))
    
    return present_on_pitch[0]

### 13. Scrolls the webpage to get all the overs

In [16]:
def scroll():
    """Scroll the commentary page in ``driver_2`` until the first over is loaded.

    After an initial wait and a jump to offset 2800 the page is scrolled in
    steps of 300 pixels, starting at 1800. After every step the page is parsed
    and the first span of the last over-summary block is read; the loop stops
    once that text is ``'end of over 1'``. Whenever the target offset reaches
    the page height it is moved back by 2100 pixels.
    """
    time.sleep(1.5)
    driver_2.execute_script("window.scrollTo(0, 2800)")
    marker = ''
    offset = 1800
    driver_2.execute_script("var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    while marker != 'end of over 1':
        driver_2.execute_script("window.scrollTo(0, {})".format(offset))
        page = BeautifulSoup(driver_2.page_source, 'html.parser')
        summaries = page.find_all('div', attrs={'class':'ds-text-tight-s ds-font-regular ds-mb-3'})
        marker = summaries[-1].find_all('span')[0].text
        offset += 300
        page_height = driver_2.execute_script("var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        if offset >= page_height:
            offset -= 2100

### 14. Inserts data in dataframe

In [17]:
def insert(match_id, i, situation):
    """Append one delivery as a new row of the global ``all_matches`` frame.

    Reads the globals ``ball``, ``run``, ``strikers``, ``players_1`` and
    ``players_2``.

    Parameters
    ----------
    match_id : stored in ``'Match_Id'``.
    i : innings number, 1 or 2; selects the batting side, the other side bowls.
    situation : ``(batter, bowler)`` for a normal ball, or
        ``((batter, bowler), (player_dismissed, wicket_type[, fielder]))``
        when a wicket fell.

    Notes
    -----
    * ``run`` may be falsy (no runs), an int (runs off the bat) or a tuple
      ``(runs, extra_type)``.
    * Leg-byes on a no-ball are stored in ``'Leg_Bye'``; they used to be
      written to ``'Bye'``.
    * The squad lists are picked with a dict lookup instead of ``eval``.
    """
    global all_matches
    squads = {1: players_1, 2: players_2}
    record = {
        'Match_Id': match_id,
        'Innings': i,
        'Ball': ball,
        'Batting_Team': squads[i][0],
        'Bowling_Team': squads[i % 2 + 1][0],
        'Striker': strikers[0],
        'Non_Striker': strikers[1],
        'Extras': 0,
        'Wide': 0,
        'No_Ball': 0,
        'Leg_Bye': 0,
        'Bye': 0,
        'Penalty': 0,
    }

    if not run:
        record['Runs_off_bat'] = 0
    elif isinstance(run, int):
        record['Runs_off_bat'] = run
    elif run[1] == 'No_Ball':
        record['Runs_off_bat'] = run[0] - 1        # One run of the total is the no-ball itself
        record['No_Ball'] = 1
        record['Extras'] = 1
    elif run[1] == 'No Ball Bye':                  # When Bye runs are scored on a no ball
        record['Bye'] = run[0] - 1
        record['No_Ball'] = 1
        record['Extras'] = run[0]
        record['Runs_off_bat'] = 0
    elif run[1] == 'No Ball Leg Bye':              # When Leg Bye runs are scored on a no ball
        record['Leg_Bye'] = run[0] - 1
        record['No_Ball'] = 1
        record['Extras'] = run[0]
        record['Runs_off_bat'] = 0
    else:                                          # Wide, Leg_Bye or Bye: everything counts as extras
        record[run[1]] = run[0]
        record['Runs_off_bat'] = 0
        record['Extras'] = run[0]

    if type(situation[0]) == tuple:                       # When a wicket falls
        record['Bowler'] = situation[0][1]
        record['Wicket_Type'] = situation[1][1]
        if len(situation[1]) == 3:
            record['Fielder'] = situation[1][2]
        record['Player_Dismissed'] = situation[1][0]

        if not record['Striker']:                        # Fills in the position of the player dismissed
            record['Striker'] = situation[1][0]
        elif not record['Non_Striker']:
            record['Non_Striker'] = situation[1][0]
    else:
        record['Bowler'] = situation[1]

    all_matches = pd.concat([all_matches, pd.DataFrame([record])], ignore_index=True)

### 15. Checks for Penalty runs

In [18]:
def penalty(team):
    """Attribute five penalty runs to the delivery on which they were awarded.

    Parses the commentary page currently loaded in ``driver_2``. Starting at
    the entry whose ball number equals the module-level ``ball``, it follows
    the subsequent commentary entries and looks for text mentioning penalty
    runs or the ball hitting the helmet. The first matching entry gets
    ``Penalty = 5``, five additional ``Extras`` and ``Fielder`` set to
    ``'Check Required'`` in ``all_matches``. If nothing matches by the time the
    walk reaches ball ``ball - 1``, the penalty is booked on ball ``ball - 1``.

    Args:
        team: Innings number used to select the rows of ``all_matches``.

    Raises:
        IndexError: If ``all_matches`` has no row for the selected ball and innings.
    """
    keywords = [' penalty runs ', ' hits the helmet ']
    ball_span = {'class': 'ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center'}
    entry_div = {'class': "lg:hover:ds-bg-ui-fill-translucent ds-hover-parent ds-relative"}

    def book_penalty(ball_no):
        # Resolve the row once, then apply the three column updates to it.
        target = all_matches[(all_matches['Ball'] == ball_no) & (all_matches['Innings'] == team)].index[0]
        all_matches.loc[target, 'Penalty'] = 5
        all_matches.loc[target, 'Extras'] += 5
        all_matches.loc[target, 'Fielder'] = 'Check Required'

    # ``ball - 1`` is not exact in binary floating point (4.1 - 1 gives
    # 3.0999999999999996), which would never equal a ball number parsed from
    # text. Rounding to one decimal restores the value float('3.1') produces.
    previous = round(ball - 1, 1)

    soup = BeautifulSoup(driver_2.page_source, 'html.parser')
    node = soup.find('span', attrs=ball_span)
    while float(node.text) != ball:                      # Advance to the entry of the current ball
        node = node.find_next('span', attrs=ball_span)

    # Inspect at least one entry, then keep going while the entry's ball number is after ``previous``
    while True:
        node = node.find_next('div', attrs=entry_div)
        commentary = node.find('p', attrs={'class': "ci-html-content"}).text.lower()
        check = float(node.find_next('span', attrs=ball_span).text)
        if any(word in commentary for word in keywords):
            book_penalty(check)
            return
        if not check > previous:
            break

    book_penalty(previous)                               # Nothing matched: fall back to ball - 1

### 16. Counts batters marked 'absent hurt' (injured and did not bat)

In [19]:
def absent_hurt(team):
    """Count the batters shown as 'absent hurt' on the scorecard of one innings.

    Reads the page currently loaded in ``driver_1`` and scans the rows of the
    ``team``-th batting table (1-based), stopping at the first row whose first
    cell contains 'Extras' or 'Total'.

    Args:
        team: Innings number; selects ``batting[team-1]``.

    Returns:
        Number of scanned rows whose second cell reads exactly 'absent hurt'.
    """
    page = BeautifulSoup(driver_1.page_source, 'html.parser')
    tables = page.find_all('table', attrs={'class': 'ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table'})

    missing = 0
    for row in tables[team-1].tbody.find_all('tr'):
        cells = row.find_all('td')
        first = cells[0].text
        if 'Extras' in first or 'Total' in first:        # End of the batters' rows
            break
        # Rows with an empty first cell are skipped without looking at the second cell
        if first and cells[1].text == 'absent hurt':
            missing += 1
    return missing

### 17. Checks the sanity of data after each over

In [20]:
def over_check(team, over, score):
    """Verify that the recorded runs and wickets match the expected score after an over.

    Sums ``Runs_off_bat`` and ``Extras`` and counts dismissals (excluding
    'Retired Hurt') over the rows of ``all_matches`` for innings ``team`` with
    ``Ball < over``, then compares them with ``score[over-1]``. When ``score``
    has fewer than ``over`` entries, its last entry is used instead.

    A mismatch of exactly five runs triggers ``penalty(team)`` followed by a
    re-count; a mismatch in wickets is retried after adding
    ``absent_hurt(team)``.

    Args:
        team: Innings number.
        over: Upper bound (exclusive) on ``Ball`` for the rows to tally.
        score: Sequence of ``(runs, wickets)`` entries.

    Raises:
        ValueError: If the tallies cannot be reconciled with the expected score.
    """
    def tally():
        data = all_matches[(all_matches['Ball'] < over) & (all_matches['Innings'] == team)]
        runs = data['Runs_off_bat'].sum() + data['Extras'].sum()
        wickets = len(data[(data['Wicket_Type'].notna()) & (data['Wicket_Type'] != 'Retired Hurt')])
        return runs, wickets

    # Keep the score index separate from ``over``: the original code reset
    # ``over`` to 0 to reach the last entry, which made the re-count after
    # penalty() filter on ``Ball < 0`` and always fail.
    position = over - 1 if len(score) >= over else -1
    expected = score[position]

    total, out = tally()
    #print(position, expected, (total, out))
    if expected == (total, out):
        return

    if expected[0] - total == 5:                   # To check for penalty runs
        penalty(team)
        total, out = tally()
        if expected != (total, out):
            raise ValueError

    elif expected[1] != out:                       # To check if someone was hurt during fielding and couldn't bat
        out += absent_hurt(team)
        if expected != (total, out):
            raise ValueError

    else:
        raise ValueError

### 18. Checks the sanity of data after each innings

In [21]:
def scorecard_check(team):
    """Cross-check the ball-by-ball rows of one innings against the scorecard page.

    Reads the scorecard currently loaded in ``driver_1`` and, for the match id
    found in the last row of ``all_matches``, asserts that extras, total,
    wickets, each batter's runs and balls faced, and each bowler's overs, runs
    conceded and wickets agree with the rows recorded for innings ``team``.
    A batting row whose player never appears in those rows is used to fill
    missing ``Non_Striker`` values in ``all_matches``.

    Args:
        team: Innings number; also selects the ``team``-th batting and bowling
            tables on the page.

    Raises:
        AssertionError: If a recorded figure disagrees with the scorecard.
    """
    page = BeautifulSoup(driver_1.page_source, 'html.parser')
    batting = page.find_all('table', attrs={'class': 'ds-w-full ds-table ds-table-md ds-table-auto ci-scorecard-table'})
    bowling = page.find_all('table', attrs={'class': 'ds-w-full ds-table ds-table-md ds-table-auto'})
    match_id = all_matches.iloc[-1, 0]
    data = all_matches[(all_matches['Match_Id']==match_id) & (all_matches['Innings']==team)]
    batters = list(data['Striker'].unique()) + list(data['Non_Striker'].unique())

    def clean_name(cell):
        # Keep the text before the first non-breaking space and collapse repeated whitespace
        return " ".join(cell.text.split('\xa0')[0].split())

    for row in batting[team-1].tbody.find_all('tr'):
        cells = row.find_all('td')
        label = cells[0].text
        if 'Did not bat' in label or 'Fall of wickets' in label:
            break
        if not label:
            continue

        if 'Extras' in label:
            assert data['Extras'].sum() == int(cells[2].text)
            continue

        if 'TOTAL' in label:
            total = data['Extras'].sum() + data['Runs_off_bat'].sum()
            wicket = data['Wicket_Type'].count() - len(data[data['Wicket_Type']=='Retired Hurt'])
            figures = cells[2].text.split('/')
            assert total == int(figures[0])
            if len(figures) > 1:
                assert wicket == int(figures[1])
            elif wicket != 10 and wicket != 0:
                # No wicket count on the page: accept 10 or 0, otherwise the
                # shortfall has to be explained by absent-hurt batters
                wicket += absent_hurt(team)
                assert wicket == 10
            continue

        name = clean_name(cells[0])
        if name in batters:
            faced = data[data['Striker']==name]
            runs = faced['Runs_off_bat'].sum()
            balls = faced['Ball'].count() - len(faced[faced['Wide']>0])
            #print(runs, balls)
            try:                                     # To check if someone was hurt during fielding and couldn't bat
                assert runs == int(cells[2].text)
                assert balls == int(cells[3].text)
            except ValueError:
                pass
            continue

        if len(data[data['Non_Striker'].isna()]) > 0:
            all_matches['Non_Striker'] = all_matches['Non_Striker'].fillna(name)

    # Dismissal types that are not credited to the bowler
    not_bowlers_wicket = ['Retired Hurt', 'Run Out', 'Obstructing the field', 'Retired Out']

    for row in bowling[team-1].tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue

        spell = data[data['Bowler']==clean_name(cells[0])]
        legal = spell[(spell['Wide']==0) & (spell['No_Ball']==0)]['Ball'].count()
        overs = round(int(legal/6) + legal%6*0.1, 1)         # Cricket notation: completed overs . remaining balls
        conceded = (spell['Runs_off_bat'].sum() + spell['Extras'].sum()
                    - spell['Leg_Bye'].sum() - spell['Bye'].sum() - spell['Penalty'].sum())
        wickets = len(spell[spell['Wicket_Type'].notna()]) - len(spell[spell['Wicket_Type'].isin(not_bowlers_wicket)])

        #print(overs, conceded, wickets)
        assert overs == float(cells[1].text)
        assert conceded == int(cells[3].text)
        assert wickets == int(cells[4].text)

### 19. Corrects human-made errors on the website

In [22]:
def human_error(team):
    """Apply hand-written corrections for matches whose source data is known to be wrong.

    The match is identified by the value in the first row and first column of
    ``all_matches``. Depending on that id, rows of ``all_matches`` are patched
    or dropped in place. Matches with a known discrepancy that cannot be
    corrected are accepted unchanged.

    Args:
        team: Innings number; none of the corrections uses it.

    Raises:
        AssertionError: If the match has no registered correction.
    """
    match_id = all_matches.iloc[0, 0]

    def rows(mask):
        # Index labels of the rows selected by a boolean mask
        return all_matches[mask].index

    def patch(mask, position=0, **values):
        # Update the ``position``-th row selected by ``mask``, one column at a time
        label = rows(mask)[position]
        for column, value in values.items():
            all_matches.loc[label, column] = value

    def assign(mask, **values):
        # Update every row selected by ``mask``, one column at a time
        for column, value in values.items():
            all_matches.loc[rows(mask), column] = value

    def delivery(ball_no, innings_no):
        return (all_matches['Ball']==ball_no) & (all_matches['Innings']==innings_no)

    def deliveries(ball_nos, innings_no):
        return (all_matches['Ball'].isin(ball_nos)) & (all_matches['Innings']==innings_no)

    def faced(batter, ball_no):
        return (all_matches['Striker']==batter) & (all_matches['Ball']==ball_no)

    def bowled(bowler, ball_no):
        return (all_matches['Bowler']==bowler) & (all_matches['Ball']==ball_no)

    if match_id in (
        336029,       # Error in counting the balls played by VVS Laxman
        419133,       # Error in counting the balls bowled by Vinay Kumar
        501202,       # The 6th ball of the 6th over wasn't bowled
        501255,       # The 6th ball of the 10th over wasn't bowled
        1136564,      # An umpiring error led to a 7th ball being bowled in the 11th over
    ):
        pass

    elif match_id == 335987:
        patch(bowled('Dinesh Salunkhe', 7.5), 1, Ball=7.6)

    elif match_id in (335994, 392198):
        all_matches.drop(rows(all_matches['Ball']==10.7), inplace=True)

    elif match_id == 336015:
        patch(bowled('Amit Mishra', 14.5), 1, Ball=14.6)

    elif match_id == 392229:             # When RP Singh came into bat and was out before the end of the over and Jaskaran Singh came to bat
        all_matches.loc[(all_matches['Striker']=='Jaskaran Singh'), 'Striker'] = 'RP Singh'

    elif match_id == 419155:
        all_matches.drop(rows(all_matches['Ball']==18.7), inplace=True)

    elif match_id == 501226:
        patch(faced('Robin Uthappa', 4.2), Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 501233:
        patch(faced('Travis Birt', 15.1), Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 501258:
        assign(all_matches['Bowler']=='Padmanabhan Prasanth', Bowler='Prasanth Parameswaran')
        assign((all_matches['Ball']>7) & (all_matches['Ball']<8), Bowler='Padmanabhan Prasanth')

    elif match_id == 548318:
        all_matches.loc[delivery(13.4, 2), 'Runs_off_bat'] = 1
        all_matches.loc[delivery(13.4, 2), 'Leg_Bye'] = 0
        all_matches.loc[delivery(13.4, 2), 'Extras'] = 0

    elif match_id == 548325:
        patch(faced('Pragyan Ojha', 16.6), Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 548376:                # Saurabh Tiwary was retired hurt but it was not reported
        assign(deliveries([6.1, 6.3, 6.5, 6.6, 7.4, 7.5, 7.6, 8.1, 8.2, 8.3, 9.2], 2),
               Striker='AB de Villiers', Non_Striker='Virat Kohli')
        assign(deliveries([9.3, 9.5], 2), Striker='Mayank Agarwal', Non_Striker='Virat Kohli')

    elif match_id == 598028:
        patch(delivery(14.5, 2), Runs_off_bat=0, Bye=2, Extras=3)

    elif match_id == 598072:
        patch(delivery(19.5, 1), 1, Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 733991:
        patch(delivery(2.6, 1), Runs_off_bat=0, Leg_Bye=1, Extras=2)

    elif match_id == 733993:
        assign((all_matches['Ball']>4) & (all_matches['Innings']==2), Bowler='Laxmi Shukla')

    elif match_id == 829811:
        patch(delivery(3.6, 2), Runs_off_bat=0, Bye=4, Extras=5)

    elif match_id == 980951:
        patch(delivery(9.6, 2), 1, Runs_off_bat=0, Bye=4, Extras=5)

    elif match_id == 1082605:
        patch(delivery(16.2, 1), Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 1082625:
        patch(delivery(18.2, 1), Runs_off_bat=0, Bye=1, Extras=2)

    elif match_id == 1082645:
        assign(deliveries([13.5, 13.6, 14.5, 14.6, 15.1], 1), Striker='Ishant Sharma', Non_Striker='Mohit Sharma')

    elif match_id == 1178417:
        assign(deliveries([19.1, 19.2], 2), Striker='Ravichandran Ashwin')
        assign(deliveries([19.3], 2), Striker='Hardus Viljoen')

    elif match_id == 1181766:
        patch(delivery(19.5, 1), Wicket_Type='Run Out', Player_Dismissed='Deepak Hooda', Fielder='Rishabh Pant')

    elif match_id == 1304066:
        all_matches.loc[delivery(18.2, 1), 'Wicket_Type'] = 'Retired Out'

    elif match_id == 1304101:
        assign(deliveries([9.2, 9.3, 9.4], 2), Striker='Ripal Patel')

    else:
        raise AssertionError

### 20. Returns Ball by Ball Data

In [23]:
def ball_by_ball(comm):
    """Scrape both innings of a match's commentary page into ``all_matches``.

    For each innings (1 and 2) the commentary page is opened in ``driver_2``,
    the innings is selected from the drop-down and the page is expanded with
    ``scroll()``. Every delivery is then parsed and stored via ``insert``;
    ``over_check`` runs at the first ball of each over from over 1 onwards and
    after the last delivery, and ``scorecard_check`` runs once the innings is
    complete, falling back to ``human_error`` when it fails.

    Args:
        comm: Commentary URL; the match id is the path segment following
            ``'commentary/'``.

    Side effects:
        Rebinds the module-level ``strikers``, ``ball``, ``run``, ``retired``
        and ``double`` that the helper functions read.
    """
    global strikers, ball, run, retired, double

    def squad(innings):
        # Fetch ``players_<innings>`` by name at call time; replaces eval() on a formatted string.
        # Presumably index 0 is the team name, 1 and 2 the openers and -1 the final score -- confirm in extract_scorecard.
        return globals()['players_{}'.format(innings)]

    match_id = int(comm.split('commentary/')[1].split('/')[0])        # Same for both innings

    for i in range(1, 3):
        driver_2.maximize_window()                 # Need to maximize window because certain elements don't load when minimized
        # NOTE(review): this scroll runs before the page is loaded, so it acts on the previous
        # page; the debugging cell calls get() first. Confirm which order is intended.
        driver_2.execute_script("window.scrollTo(0, 200)")
        driver_2.get(comm)
        driver_2.find_element(by=By.XPATH, value="//div[@class='ds-grow']//div[@class='ds-p-4']//div[2]//div[1]//div[1]//i[1]").click()
        driver_2.find_element(by=By.XPATH, value="(//li[@class='ds-w-full ds-flex'])[{}]".format(i)).click()
        scroll()

        soup = BeautifulSoup(driver_2.page_source, 'html.parser')
        score = soup.find_all('div', attrs={'class': 'lg:ds-flex lg:ds-items-center lg:ds-px-2'})            # Balls and runs scored
        striker_baller = soup.find_all('div', attrs={'class': 'xl:ds-w-[730px]'})                            # Bowler and batsman data per ball
        # The page lists the latest delivery first; reverse into playing order
        score.reverse()
        striker_baller.reverse()

        # Over summaries: the second token of each is 'runs/wickets'; the last two matches are discarded
        summaries = soup.find_all('span', attrs={'class': 'ds-text-tight-m ds-font-bold'})[:-2]
        over = [tuple(map(int, span.text.split()[1].split('/'))) for span in summaries]
        over.reverse()

        strikers, retired, double = [], [], []
        for j in range(len(score)):
            if len(strikers) == 0:
                strikers.append(squad(i)[1])
                strikers.append(squad(i)[2])

            # ``ball`` briefly holds the delivery's spans while run_scored() runs;
            # this binding order is kept exactly as in the original.
            ball = score[j].find_all('span')
            run = run_scored(ball[1].text)
            ball = float(ball[0].text)
            situation = players_present(i, striker_baller[j])
            insert(match_id, i, situation)

            if round(ball - floor(ball), 1) == 0.1 and ball >= 1:      # First ball of a new over: validate the previous one
                over_check(i, floor(ball), over)
            if j == len(score) - 1:                                    # Last delivery: validate against the final score
                over_check(i, floor(ball) + 1, squad(i)[-1])
            #print(i, ball, run, squad(i)[0], squad(i%2+1)[0], situation, strikers[1])
        try:
            scorecard_check(i)
        except AssertionError:
            human_error(i)
    

### Nicknames

In [None]:
# Maps a name as it appears in the scraped text to the player's full name.
nicknames = {
    # Nicknames, initials and shortened names
    'Bhajji': 'Harbhajan Singh',
    'Zak': 'Zaheer Khan',
    'ABD': 'AB de Villiers',
    'de Villiers': 'AB de Villiers',
    'De Villiers': 'AB de Villiers',
    'Venky': 'Venkatesh Iyer',
    'KP': 'Kevin Pietersen',
    'M Ashwin': 'Murugan Ashwin',
    'AD Mascarenhas': 'Dimitri Mascarenhas',
    'Muralidaran': 'Muthiah Muralidaran',
    'Duminy': 'Jean-Paul Duminy',
    'Ahmed': 'Abu Nechim',
    'JJ van der Wath': 'Johan van der Wath',
    'J Syed Mohammad': 'Jamaluddin Syed Mohammad',
    'Y Gnaneswara Rao': 'Gnaneswara Rao',
    'Pratap': 'Veer Pratap Singh',
    'X Thalaivan Sargunam': 'Thalaivan Sargunam',
    'Coulter-Nile': 'Nathan Coulter-Nile',
    'Karan': 'Karanveer Singh',
    'JPR Scantlebury-Searles': 'Javon Searles',
    'Scantlebury-Searles': 'Javon Searles',
    'Gurkeerat Singh': 'Gurkeerat Singh Mann',
    'Bawa': 'Raj Bawa',
    'Raj Bawa': 'Raj Bawa',
    'B Sai Sudharsan': 'Sai Sudharsan',
    'R Sanjay Yadav': 'Sanjay Yadav',

    # Keys of the form '<surname>_<team name>'; presumably used to resolve a
    # surname shared by several players -- confirm against the lookup code
    'Kumar_Royal Challengers Bangalore': 'Praveen Kumar',
    'Singh_Kings XI Punjab': 'Vikram Singh',
    'Singh_Deccan Chargers': 'RP Singh',
    'Singh_Mumbai Indians': 'RP Singh',
    'Reddy_Sunrisers Hyderabad': 'Akshath Reddy',
    'Sharma_Sunrisers Hyderabad': 'Karn Sharma',
    'de Silva_Royal Challengers Bangalore': 'Wanindu Hasaranga de Silva',
    'Sharma_Kings XI Punjab': 'Mohit Sharma',
    'Yadav_Kolkata Knight Riders': 'Suryakumar Yadav',
    'Sharma_Rising Pune Supergiants': 'Ishant Sharma',
    'Sharma_Punjab Kings': 'Jitesh Sharma',
}

### SqlAlchemy Connector

In [None]:
import os

# Database URL. Set the IPL_DB_URL environment variable to point at your own
# database instead of editing credentials into the notebook; the fallback is
# the local development default.
engine = os.environ.get('IPL_DB_URL', 'mysql+mysqldb://root:password@localhost:3306/ipl')
conn = sqlalchemy.create_engine(engine)    # SQLAlchemy engine passed to DataFrame.to_sql

### Selenium Drivers

In [25]:
# Both browser sessions share one set of options; image loading is switched off.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--blink-settings=imagesEnabled=false')

# driver_1 is used for the season and scorecard pages, driver_2 for the commentary pages
driver_1, driver_2 = [webdriver.Chrome(options=chrome_options) for _ in range(2)]

### Scraping the IPL years

In [None]:
# Fetch the index page listing every IPL season.
# A timeout keeps the cell from hanging forever, and raise_for_status() turns an
# HTTP error into an exception rather than an empty season list.
response = requests.get(
    'https://www.espncricinfo.com/ci/engine/series/index.html?search=indian+premier+league;view=season',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

year_srp = soup.find_all('span', attrs={'class':'year'})
year = [row.text.replace('/','%2F') for row in year_srp]    # Season labels, with '/' URL-encoded for use in query strings
print("\nYears = ", year)


Years =  ['2007%2F08', '2009', '2009%2F10', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020%2F21', '2021', '2022']


### Main Code

In [48]:
# Column layout of the two tables written to the database
ALL_MATCHES_COLUMNS = ['Match_Id', 'Innings', 'Ball', 'Batting_Team', 'Bowling_Team', 'Striker', 'Non_Striker', 'Bowler',
                       'Runs_off_bat', 'Extras', 'Wide', 'No_Ball', 'Leg_Bye', 'Bye', 'Penalty', 'Wicket_Type',
                       'Player_Dismissed', 'Fielder']
MATCH_DETAILS_COLUMNS = ['Match_Id', 'Stadium', 'Toss', 'POTM', 'Match_days']

for i in year:
    scorecard, commentary = extract_matches(i)
    for j, (scorecard_url, commentary_url) in enumerate(zip(scorecard, commentary)):
        # Fresh, empty frames for every match; the scraping functions fill them as module-level state
        all_matches = pd.DataFrame(columns=ALL_MATCHES_COLUMNS)
        match_details = pd.DataFrame(columns=MATCH_DETAILS_COLUMNS)

        extract_scorecard(scorecard_url)
        if len(match_details) == 0:                          # Nothing was extracted for this match
            continue

        try:
            ball_by_ball(commentary_url)
        except NoSuchElementException:                       # If the match is abandoned
            if len(all_matches) > 0:                         # Keep whatever was scraped before the failure
                all_matches.to_sql(con=conn, name='all_matches', if_exists='append', index=False)
        else:
            all_matches.to_sql(con=conn, name='all_matches', if_exists='append', index=False)

        match_details.to_sql(con=conn, name='match_details', if_exists='append', index=False)
        print(j+1, 'Done')
    print(i, 'Done................................................................................')

2007%2F08 Done................................................................................
2009 Done................................................................................
2009%2F10 Done................................................................................
2011 Done................................................................................
2012 Done................................................................................
2013 Done................................................................................
2014 Done................................................................................
18 Done
25 Done
29 Done
2015 Done................................................................................
2016 Done................................................................................
29 Done
2017 Done................................................................................
45 Done
2018 Done.........................................

### For debugging

In [29]:
# Match under investigation: scorecard URL, and the commentary URL derived from it
s = 'https://www.espncricinfo.com/series/8048/scorecard/1136605/delhi-daredevils-vs-royal-challengers-bangalore-45th-match-indian-premier-league-2018'
c = s.replace('scorecard', 'commentary')

In [35]:
# Use the same column names as the main scraping loop so a debug run produces the same schema
match_details = pd.DataFrame(columns=['Match_Id', 'Stadium', 'Toss', 'POTM', 'Match_days'])
extract_scorecard(s)

In [38]:
# Debug helper: load one innings of the commentary page for the match in `c`.
# Adjust the range to choose the innings; `i`, `match_id`, `score`, `striker_baller`
# and `over` are read by the next cell.
for i in range(2, 3):
    driver_2.get(c)
    match_id = int(c.split('commentary/')[1].split('/')[0])
    driver_2.maximize_window()
    driver_2.execute_script("window.scrollTo(0, 200)")
    driver_2.find_element(by=By.XPATH, value="//div[@class='ds-grow']//div[@class='ds-p-4']//div[2]//div[1]//div[1]//i[1]").click()
    driver_2.find_element(by=By.XPATH, value="(//li[@class='ds-w-full ds-flex'])[{}]".format(i)).click()
    scroll()

    url = driver_2.page_source
    soup = BeautifulSoup(url, 'html.parser')
    score = soup.find_all('div', attrs={'class': 'lg:ds-flex lg:ds-items-center lg:ds-px-2'})    # Balls and runs scored
    striker_baller = soup.find_all('div', attrs={'class': 'xl:ds-w-[730px]'})                    # Bowler and batsman data per ball
    # The page lists the latest delivery first; reverse into playing order
    score.reverse()
    striker_baller.reverse()

    # Over summaries: the second token of each is 'runs/wickets'; the last two matches are discarded
    over = [
        tuple(map(int, span.text.split()[1].split('/')))
        for span in soup.find_all('span', attrs={'class': 'ds-text-tight-m ds-font-bold'})[:-2]
    ]
    over.reverse()

In [39]:
# Debug helper: replay the deliveries loaded by the previous cell into a fresh frame,
# running the same per-over and end-of-innings checks as ball_by_ball().
all_matches = pd.DataFrame(columns=[
    'Match_Id', 'Innings', 'Ball', 'Batting_Team', 'Bowling_Team', 'Striker', 'Non_Striker', 'Bowler',
    'Runs_off_bat', 'Extras', 'Wide', 'No_Ball', 'Leg_Bye', 'Bye', 'Penalty', 'Wicket_Type',
    'Player_Dismissed', 'Fielder',
])
strikers, retired, double = [], [], []

for k in range(len(score)):
    if not strikers:
        strikers.append(globals()['players_{}'.format(i)][1])
        strikers.append(globals()['players_{}'.format(i)][2])

    # `ball` briefly holds the delivery's spans while run_scored() runs
    ball = score[k].find_all('span')
    run = run_scored(ball[1].text)
    ball = float(ball[0].text)

    situation = players_present(i, striker_baller[k])
    insert(match_id, i, situation)

    if round(ball - floor(ball), 1) == 0.1 and ball >= 1:      # First ball of a new over: validate the previous one
        over_check(i, floor(ball), over)
    if k == len(score) - 1:                                    # Last delivery: validate against the final score
        over_check(i, floor(ball)+1, globals()['players_{}'.format(i)][-1])

try:
    scorecard_check(i)
except AssertionError:
    human_error(i)