In [None]:
# Install necessary packages (if not already installed)
!pip install selenium
!apt-get update
!apt install -y chromium-chromedriver

# Imports
import re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


Collecting selenium
  Downloading selenium-4.22.0-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Function to initialize Selenium WebDriver with Chrome
def initialize_driver():
    """Create and return a Chrome WebDriver configured with fixed command-line flags.

    Returns:
        The ``webdriver.Chrome`` instance built from the options below. The
        caller is responsible for calling ``quit()`` on it.
    """
    chrome_flags = (
        '--headless',               # run without opening a browser window
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Function to scrape table data from a page
def scrape_page_data(driver, url, timeout=10):
    """Load ``url`` in ``driver`` and return the cell text of its table rows.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        url: Address of the page to load.
        timeout: Seconds to wait for a ``<table>`` element to appear
            (defaults to 10).

    Returns:
        A list of rows, each a list of stripped ``<td>`` strings. The first
        ``<tr>`` matched by ``table tr`` is skipped, as are rows with fewer
        than two cells and rows whose first cell is the pager text.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no table appears within
        ``timeout`` seconds (a Selenium ``TimeoutException``).
    """
    driver.get(url)
    # Block until a <table> exists so page_source is read after the results are present.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
    )
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # The pager row's first cell reads like 'Page1of2018' once whitespace is
    # stripped. Matching the general shape (instead of that one literal) keeps
    # the filter working on every page and for any total page count.
    pager_pattern = re.compile(r"Page\d+of\d+")

    data = []
    for row in soup.select("table tr")[1:]:
        row_data = [cell.get_text(strip=True) for cell in row.select("td")]
        if len(row_data) > 1 and not pager_pattern.fullmatch(row_data[0]):
            data.append(row_data)
    return data

# Function to clean data by keeping only complete, all-string rows
def clean_data(data):
    """Filter scraped rows and return the survivors as a NumPy array.

    A row is kept only when every cell is a ``str`` and the row has at least
    11 cells (adjust the threshold as needed).

    Args:
        data: Iterable of rows, each a sequence of cell values.

    Returns:
        ``np.array`` built from the kept rows, in their original order.
    """
    min_columns = 11
    kept_rows = [
        record
        for record in data
        if all(isinstance(value, str) for value in record) and len(record) >= min_columns
    ]
    return np.array(kept_rows)

# Function to convert cleaned data to DataFrame
def convert_to_dataframe(cleaned_data_array):
    """Build a DataFrame with the fixed 11-column header from cleaned rows.

    Args:
        cleaned_data_array: Iterable of rows (lists or array rows). Each row
            is cut down to its first 11 cells so it matches the header.

    Returns:
        ``pd.DataFrame`` whose columns are the names listed below; the eighth
        column name is an empty string.
    """
    columns = [
        "Team",
        "ScoreDescending",
        "Overs",
        "RPO",
        "Lead",
        "Inns",
        "Result",
        "",
        "Opposition",
        "Ground",
        "Start Date",
    ]
    width = len(columns)
    trimmed_rows = []
    for record in cleaned_data_array:
        trimmed_rows.append(record[:width])
    return pd.DataFrame(trimmed_rows, columns=columns)


In [None]:
def main(num_pages_to_scrape=184):
    """Scrape consecutive result pages, clean them, and print the DataFrame.

    Starts from the ESPNcricinfo stats query URL hard-coded below and follows
    the page's "Next" link after each scrape.

    Args:
        num_pages_to_scrape: Maximum number of pages to fetch (defaults to
            184). Scraping stops earlier if a page has no "Next" link or the
            link has no ``href``.

    Returns:
        The ``pd.DataFrame`` built from all scraped rows (also printed).
    """
    base_url = "https://stats.espncricinfo.com"
    url = "/ci/engine/stats/index.html?class=1;home_or_away=1;home_or_away=2;home_or_away=3;result=1;result=2;result=3;result=4;template=results;type=team;view=innings"
    full_url = base_url + url

    # Initialize Selenium WebDriver
    driver = initialize_driver()

    all_data = []

    try:
        for _ in range(num_pages_to_scrape):
            all_data.extend(scrape_page_data(driver, full_url))

            next_links = driver.find_elements(By.LINK_TEXT, "Next")
            next_url = next_links[0].get_attribute("href") if next_links else None
            if not next_url:
                break
            # Only remember the address here: scrape_page_data() performs the
            # navigation itself, so calling driver.get() in this loop as well
            # would download every page twice.
            full_url = next_url
    finally:
        # Always release the browser, even if a page fails to load.
        driver.quit()

    # Clean and process the scraped data
    cleaned_data_array = clean_data(all_data)

    # Convert to pandas DataFrame
    df = convert_to_dataframe(cleaned_data_array)

    # Display the DataFrame
    print(df)
    return df

# Execute the main function when this code runs as the top-level module
# (skipped if the code is imported from elsewhere).
if __name__ == "__main__":
    main()


              Team ScoreDescending  Overs    RPO  Lead Inns Result     \
0        Sri Lanka          952/6d  271.0   3.51   415    2   draw      
1          England          903/7d  335.2   2.69   903    1    won      
2          England             849  258.2   3.28   849    1   draw      
3      West Indies          790/3d  208.1   3.79   462    2    won      
4         Pakistan          765/6d  248.5   3.07   121    2   draw      
...            ...             ...    ...    ...   ...  ...    ... ..   
9195     Australia            10/1    3.0   3.33     1    4    won      
9196     Australia            10/0    0.4  15.00     6    4    won      
9197  South Africa             9/0    1.1   7.71     1    4    won      
9198   New Zealand             9/0    1.4   5.40     1    4    won      
9199      Pakistan             8/0  0.6x8   8.00  -168    4   draw      

         Opposition         Ground   Start Date  
0            vIndia  Colombo (RPS)   2 Aug 1997  
1        vAustralia    