In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests
import re


import warnings
warnings.filterwarnings("ignore")

# Main

In [55]:
class GetScoreCard:
    """Scrape match results and per-player scorecards for a Cricbuzz series.

    Pages are loaded with Selenium (Chrome) and parsed with BeautifulSoup; the
    public methods return pandas DataFrames. Every CSS class and element id the
    scraper looks for is kept in ``self.web_elements``.
    """

    # Text of a scorecard cell that holds a plain number such as "12" or "7.50".
    _NUMERIC_CELL = re.compile(r"^\d+(\.\d+)?$")

    # Column order of the frame returned by getMatches().
    _MATCH_COLUMNS = ["match_id", "match_url", "date", "team_1_country", "team_1_score",
                      "team_2_country", "team_2_score", "match_result"]

    # CSS class of the span that carries the result text of a match.
    _OUTCOME_CLASS = "text-cbComplete dark:text-cbCompleteDark"

    # Country name as it may appear in the result text -> short code stored in
    # the "match_result" column.
    _COUNTRY_CODES = {
        "United States": "USA",
        "India": "IND",
        "West Indies": "WI",
        "Namibia": "NAM",
        "South Africa": "RSA",
        "Afghanistan": "AFG",
        "Netherlands": "NED",
        "Uganda": "UGA",
        "Australia": "AUS",
        "Scotland": "SCO",
        "Canada": "CAN",
        "Bangladesh": "BAN",
        "Pakistan": "PAK",
        "England": "ENG",
        "New Zealand": "NZ",
        "Sri Lanka": "SL"
    }

    def __init__(self, url):
        """Store the series matches-page URL and the selectors used for scraping.

        Args:
            url: Address of the page that lists the matches of the series.
        """
        self.url = url
        self.web_elements = {
            # --- matches page (CSS classes) ---
            "day_matches_container": "mt-2",
            "matches_info": "flex flex-col gap-px",
            # Prefix put in front of the relative match links.
            "begin_address": "https://m.cricbuzz.com",
            "team_1_country_tag": "text-cbTxtSec dark:text-cbTxtSec block wb:hidden whitespace-nowrap",
            "team_2_country_tag": "text-cbTxtPrim dark:text-cbWhite block wb:hidden whitespace-nowrap",
            "team_1_score_tag": "font-medium wb:font-semibold text-cbTxtSec dark:text-cbTxtSec w-1/2",
            "team_2_score_tag": "font-medium wb:font-semibold text-cbTxtPrim dark:text-cbWhite w-1/2",
            # --- scorecard page ---
            "score_match_container": "mb-2",
            # Element ids, matched as regular expressions.
            # NOTE(review): BeautifulSoup applies compiled patterns with
            # search(), so the "*_country" patterns can also match the
            # "scard-..." ids; the first matching div in document order is
            # used. Confirm against the live page before anchoring them.
            "ing_1_scorecard": re.compile(r'scard-team-\d+-innings-1'),
            "ing_1_country": re.compile(r'team-\d+-innings-1'),
            "ing_2_scorecard": re.compile(r'scard-team-\d+-innings-2'),
            "ing_2_country": re.compile(r'team-\d+-innings-2'),
            "batters_container": "scorecard-bat-grid",
            "bowlers_container": "scorecard-bowl-grid",
            "country_tag": "font-bold"
        }

    def loadHTMLContent(self, url, element, message):
        """Open ``url`` in Chrome and return the rendered page as BeautifulSoup.

        Waits up to 20 seconds for an element with CSS class ``element``. On
        success ``message`` is printed; on timeout a notice is printed and
        whatever has been rendered so far is still parsed and returned.

        Args:
            url: Page to load.
            element: CSS class name whose presence signals that the page is ready.
            message: Text printed once the element has been found.

        Returns:
            BeautifulSoup document built from the driver's page source.
        """
        driver = webdriver.Chrome()
        # Everything after the driver exists is inside try/finally so the
        # browser is closed even when navigation or parsing raises.
        try:
            driver.get(url)
            try:
                element_present = EC.presence_of_element_located((By.CLASS_NAME, element))
                WebDriverWait(driver, 20).until(element_present)
                print(message)
            except TimeoutException:
                print("Timed out waiting for page to load")
            return BeautifulSoup(driver.page_source, "html.parser")
        finally:
            driver.quit()

    def getMatches(self):
        """Scrape the series page and return one row per finished match.

        A match is kept only when it has a link, a result span and a result
        text that names one of the known countries; rows with any missing or
        empty field are dropped.

        Returns:
            DataFrame with columns match_id, match_url, date, team_1_country,
            team_1_score, team_2_country, team_2_score, match_result. It is
            empty (same columns) when nothing could be scraped.
        """
        page_source = self.loadHTMLContent(self.url, self.web_elements["day_matches_container"],
                                           "Matches Page is Ready.")
        day_matches = page_source.find_all("div", class_=self.web_elements["day_matches_container"])
        detail_classes = [self.web_elements["team_1_country_tag"], self.web_elements["team_2_country_tag"],
                          self.web_elements["team_1_score_tag"], self.web_elements["team_2_score_tag"]]
        root = []

        for day in day_matches:
            matches = day.find("div", class_=self.web_elements["matches_info"])
            date_link = day.find("a")
            if matches is None or date_link is None:
                # Container without a match list or a date link: nothing to read.
                continue
            date = date_link.get("title")

            for each_match in matches:
                # Iterating a tag also yields the text nodes between its child
                # tags; those are str subclasses and carry no match data.
                if isinstance(each_match, str):
                    continue

                link_tag = each_match.find("a")
                outcome = each_match.find("span", class_=self._OUTCOME_CLASS)
                if link_tag is None or outcome is None:
                    # No link or no result yet. Skipping here (instead of
                    # falling through) prevents the URL/result of the previous
                    # match from being reused for this one.
                    continue

                href = link_tag.get("href")
                if href is None:
                    continue

                # Reset per match; stays None when the result text names no
                # known country, and such rows are removed by dropna() below.
                outcome_text = None
                for country, code in self._COUNTRY_CODES.items():
                    if country in outcome.text:
                        outcome_text = code

                matches_details = []
                for tag in detail_classes:
                    tag_info = each_match.find("span", class_=tag)
                    matches_details.append(tag_info.text if tag_info is not None else None)

                root.append({
                    "date": date,
                    "match_url": self.web_elements["begin_address"] + href,
                    "team_1_country": matches_details[0],
                    "team_1_score": matches_details[2],
                    "team_2_country": matches_details[1],
                    "team_2_score": matches_details[3],
                    "match_result": outcome_text,
                })

        if not root:
            # pd.DataFrame([]) has no columns, so the steps below would fail.
            return pd.DataFrame(columns=self._MATCH_COLUMNS)

        data = pd.DataFrame(root)
        # Treat empty strings like missing values and drop incomplete rows.
        final_data = data.replace("", np.nan).dropna().reset_index(drop=True)

        final_data["date"] = pd.to_datetime(final_data["date"])
        final_data["day"] = final_data["date"].dt.day
        final_data["month"] = final_data["date"].dt.strftime("%B")
        final_data["match_id"] = (final_data["team_1_country"] + "_" + final_data["team_2_country"] + "_"
                                  + final_data["day"].astype(str) + "_" + final_data["month"])

        return final_data[self._MATCH_COLUMNS]

    def getScoresSourcePage(self, matches=None):
        """Scrape the scorecard page of every match.

        Args:
            matches: Optional frame as returned by getMatches(). When omitted
                the match list is scraped again.

        Returns:
            Tuple ``(batsman, bowlers)`` of DataFrames with one row per player
            entry, indexed by player name.
        """
        data = self.getMatches() if matches is None else matches
        batsman_frames = []
        bowler_frames = []

        rows = zip(data["match_url"], data["match_id"], data["team_1_country"], data["team_2_country"])
        for each_url, match_id, t1_country, t2_country in rows:
            score_url = each_url.replace("scores", "scorecard")
            score_page = self.loadHTMLContent(score_url, self.web_elements["score_match_container"],
                                              "Score Page is Ready.")
            batsman_frames.append(self.getBatsmanScores(score_page, match_id))
            bowler_frames.append(self.getBowlerScore(score_page, match_id, t1_country, t2_country))

        # Concatenate once at the end instead of re-copying a growing frame
        # on every iteration.
        all_batsman_data = pd.concat(batsman_frames, axis=1) if batsman_frames else pd.DataFrame()
        all_bowlers_data = pd.concat(bowler_frames, axis=1) if bowler_frames else pd.DataFrame()

        return all_batsman_data.T, all_bowlers_data.T

    def _playerRows(self, score_soup, container_key, min_values):
        """Collect (batting_country, player_name, numeric_cells) per player row.

        Looks at innings 1 and 2. An innings is skipped when its scorecard
        div, its header div or the country tag inside the header is missing.
        The first row of each grid is skipped (as a header row), and so are
        rows without a player link or with fewer than ``min_values`` numeric
        cells.
        """
        collected = []
        for innings in ("1", "2"):
            scorecard = score_soup.find("div", id=self.web_elements["ing_" + innings + "_scorecard"])
            header = score_soup.find("div", id=self.web_elements["ing_" + innings + "_country"])
            if scorecard is None or header is None:
                continue
            country_tag = header.find("div", class_=self.web_elements["country_tag"])
            if country_tag is None:
                continue
            country_name = country_tag.text.strip()

            grid_rows = scorecard.find_all("div", class_=self.web_elements[container_key])
            for row in grid_rows[1:]:
                player_name = row.find("a")
                if player_name is None:
                    continue
                cell_texts = [cell.text.strip() for cell in row.find_all("div")]
                # Store the stripped text that was tested, not the raw text.
                values = [text for text in cell_texts if self._NUMERIC_CELL.search(text)]
                if len(values) < min_values:
                    continue
                collected.append((country_name, player_name.text.strip(), values))
        return collected

    def getBatsmanScores(self, score_soup, c_name):
        """Extract the batting figures from a parsed scorecard page.

        Args:
            score_soup: Parsed scorecard page.
            c_name: Match identifier stored in the "match_id" field.

        Returns:
            DataFrame with one column per batter and the rows match_id,
            country, run, ball, 4s, 6s, srike_rate. Values are strings.
        """
        score_card = {}
        for country_name, player, values in self._playerRows(score_soup, "batters_container", 5):
            # The "srike_rate" key is misspelled but kept as-is so the output
            # column name does not change.
            score_card[player] = {"match_id": c_name, "country": country_name, "run": values[0],
                                  "ball": values[1], "4s": values[2],
                                  "6s": values[3], "srike_rate": values[4]}

        return pd.DataFrame(score_card)

    def getBowlerScore(self, score_soup, match_id, t1_country, t2_country):
        """Extract the bowling figures from a parsed scorecard page.

        The innings header names the batting side, so each bowler is assigned
        the other team: ``t2_country`` when the header equals ``t1_country``,
        otherwise ``t1_country``.

        Args:
            score_soup: Parsed scorecard page.
            match_id: Match identifier stored in the "match_id" field.
            t1_country: First team of the match as listed on the matches page.
            t2_country: Second team of the match as listed on the matches page.

        Returns:
            DataFrame with one column per bowler and the rows match_id,
            country, over, maiden, run, wicket, no_ball, wide_ball,
            economy_rate. Values are strings.
        """
        score_card = {}
        for batting_country, player, values in self._playerRows(score_soup, "bowlers_container", 7):
            country_name = t2_country if batting_country == t1_country else t1_country
            score_card[player] = {"match_id": match_id, "country": country_name, "over": values[0],
                                  "maiden": values[1], "run": values[2],
                                  "wicket": values[3], "no_ball": values[4],
                                  "wide_ball": values[5], "economy_rate": values[6]}

        return pd.DataFrame(score_card)
        

In [56]:
# Matches page of the series to scrape.
SERIES_MATCHES_URL = "https://m.cricbuzz.com/cricket-series/7476/icc-mens-t20-world-cup-2024/matches"

# Build the scraper and pull the table of matches.
gs = GetScoreCard(SERIES_MATCHES_URL)
matches = gs.getMatches()

Matches Page is Ready.


In [8]:
# Visit every match's scorecard page; the call returns (batsman, bowlers).
score_tables = gs.getScoresSourcePage()
batsman, bowlers = score_tables

Matches Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page is Ready.
Score Page 

In [9]:
# Persist both scorecard tables, bowlers first and then batsmen.
for frame, csv_name in ((bowlers, "icc_2024_bowlers.csv"), (batsman, "icc_2024_batsman.csv")):
    frame.to_csv(csv_name)

In [58]:
# Persist the match table next to the scorecard files.
matches_csv = "icc_2024_matches.csv"
matches.to_csv(matches_csv)