**Importing Packages**

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import fnmatch

**Extracting data for dim_match_summary.csv**

In [None]:
# Source page for the tournament's match results.
cricket_url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'

# The page keeps its data in HTML tables, so pandas.read_html is used to pull
# every table on the page in one call. The first table returned is kept as the
# match results and written out unchanged (without the index) as a csv file.
all_tables = pd.read_html(io=cricket_url)
match_results = all_tables[0]

match_results.to_csv(path_or_buf="dim_match_summary.csv", index=False)

**Extracting data for fact_batting_summary.csv and fact_bowling_summary.csv** 

In [None]:
"""
The batting and bowling data are stored in a link which is present within the website link stored in website_url.

After retrieving the page at website_url, we extract the 'a' tags from the HTML that have an href attribute
and keep only those links which match both pattern and pattern_1.

Each link in the full_link list contains details about one match. So, we are looping over this list, to get the batting
and bowling summary for each player in every match.

Data from matches which were abandoned or had no result are not included in the csv file.

"""

website_url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/match-schedule-fixtures-and-results'
headers = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36','Accept-Language': 'en-US, en;q=0.5'})

webpage = requests.get(website_url, headers=headers)
webpage_html = BeautifulSoup(webpage.content, "html.parser")

pattern = '*icc-men-s-t20-world-cup-2022-23-1298134*'
pattern_1 = '*full-scorecard*'

# Keep only the hrefs that match both patterns (series id AND scorecard page).
links = [
    anchor['href']
    for anchor in webpage_html.find_all('a', href=True)
    if fnmatch.fnmatch(anchor['href'], pattern) and fnmatch.fnmatch(anchor['href'], pattern_1)
]

# Strip any leading '/' from the href so the joined URL does not contain '//'
# right after the host name.
full_link = ['https://www.espncricinfo.com/' + href.lstrip('/') for href in links]


def _clean_batting(scorecard):
    """Return the batter rows of one innings table, numbered in batting order.

    Drops rows with no BATTING value, the 'TOTAL' and 'Extras' rows, and the
    'Did not bat' / 'Fall of wickets' text rows, then adds a 1-based
    battingPosition column. This has to happen per innings: once all innings
    are stacked together there is nothing left to tell them apart.

    Raises KeyError/AttributeError when the table has no usable BATTING column;
    the caller treats that as "not a scorecard" and skips the match.
    """
    innings = scorecard.dropna(subset=['BATTING'])
    innings = innings[~innings['BATTING'].isin(['TOTAL', 'Extras'])]
    innings = innings[~innings['BATTING'].str.contains('Did not bat|Fall of wickets', regex=True)]
    # assign() returns a new frame, so no column is set on a filtered slice.
    return innings.assign(battingPosition=range(1, len(innings) + 1))


# Per-match frames are collected in lists and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop is quadratic anyway.
batting_frames = []
bowling_frames = []

#Looping to extract data from all the matches
for url in full_link:

    try:
        # Tables 0/2 are used as the two batting cards, 1/3 as the two bowling
        # cards and 4 as the match-details table. Abandoned / no-result matches
        # do not have this layout, so any failure here means "skip this match".
        all_scorecards = pd.read_html(url)

        batting_1 = _clean_batting(all_scorecards[0])
        batting_2 = _clean_batting(all_scorecards[2])

        match_info = all_scorecards[4].rename(columns={0: 'Name', 1: 'Value'})
        # .item() raises unless exactly one 'Match number' row exists.
        match_number = match_info.loc[match_info['Name'] == 'Match number', 'Value'].item()
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still works;
        # the message makes skipped matches visible instead of silent.
        print(f'Skipping {url}: {exc!r}')
        continue

    # Tag every row with its match directly instead of appending a marker row
    # that has to be back-filled later. This also keeps integer columns such as
    # battingPosition from being turned into floats by the all-NaN marker rows.
    batting_frames.append(batting_1.assign(match_number=match_number))
    batting_frames.append(batting_2.assign(match_number=match_number))

    bowling_frames.append(all_scorecards[1].assign(match_number=match_number))
    bowling_frames.append(all_scorecards[3].assign(match_number=match_number))

# pd.concat raises on an empty list, so fall back to an empty frame.
batting_summary = pd.concat(batting_frames, ignore_index=True) if batting_frames else pd.DataFrame()
bowling_summary = pd.concat(bowling_frames, ignore_index=True) if bowling_frames else pd.DataFrame()


**Final Clean Up and saving the data as a csv file**

In [None]:
#Batting_summary clean up:
# 1. back-fill match_number so every row carries the number of its match
# 2. give the scraped columns descriptive names
# 3. drop rows that have no batsman name
# 4. drop the columns that are not needed in the output
# 5. shorten "no." to "#" in match_number (the dot is escaped so that only a
#    literal "no." is replaced, not "no" followed by any character)
batting_summary = (
    batting_summary
    .assign(match_number=lambda frame: frame['match_number'].bfill())
    .rename(columns={'BATTING': 'batsmanName', 'Unnamed: 1': 'wicket', 'R': 'runs',
                     'B': 'balls', '4s': 'fours', '6s': 'sixes'})
    .dropna(subset=['batsmanName'])
    .drop(columns=['M', 'Unnamed: 8', 'Unnamed: 9'])
    .assign(match_number=lambda frame: frame['match_number'].replace(r"no\.", "#", regex=True))
)


#Bowling_summary clean up:
# Same match_number handling as above; rows without a bowler name are dropped.
bowling_summary = (
    bowling_summary
    .assign(match_number=lambda frame: frame['match_number'].bfill())
    .dropna(subset=['BOWLING'])
    .assign(match_number=lambda frame: frame['match_number'].replace(r"no\.", "#", regex=True))
)

# Remove rows whose BOWLING text starts with a digit. This must run BEFORE the
# column is renamed: filtering on 'BOWLING' after renaming it to 'bowlerName'
# raises KeyError. The vectorised check also tolerates empty strings, where
# indexing the first character would raise IndexError.
starts_with_digit = bowling_summary['BOWLING'].astype(str).str[:1].str.isdigit()
bowling_summary = bowling_summary[~starts_with_digit].rename(
    columns={'BOWLING': 'bowlerName', 'O': 'overs', 'R': 'runs',
             'M': 'maiden', '4s': 'fours', '6s': 'sixes',
             'W': 'wickets', 'ECON': 'economy', '0s': 'zeroes', 'WD': 'wides', 'NB': 'noBalls'}
)

batting_summary.to_csv('fact_batting_summary.csv',index=False)
bowling_summary.to_csv('fact_bowling_summary.csv',index=False)