In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_table_data(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find the table element based on your website's structure
        table = soup.find_all('table')[0] # Adjust this based on the HTML structure

        if table:
            # Extract data from the table
            table_data = []
            rows = table.find_all('tr')

            for row in rows:
                # Extract data from each cell in the row
                cells = row.find_all(['td', 'th'])
                row_data = [cell.text.strip() for cell in cells]
                
                # Extract href from the last column
                last_cell = cells[-1]
                href_value = last_cell.find('a')['href'] if last_cell.find('a') else None
                row_data.append(href_value)
                
                table_data.append(row_data)

            # Create a DataFrame from the table data
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
            df.rename(columns={df.columns[-1]: 'link'}, inplace=True)

            # Concatenate the values from 'Team 1' and 'Team 2' columns with ' vs ' and create a new column 'match'
            df.insert(2, 'match', df['Team 1'] + ' vs ' + df['Team 2'])

            # Concatenate the prefix to the values in the last column
            df['link'] = 'https://www.espncricinfo.com' + df['link']

            return df.dropna()  # Drop rows with None values
        else:
            print('Table not found on the page.')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None

# Example usage:
url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-cricket-world-cup-2023-24-15338'  # Replace with the actual URL
result_df = scrape_table_data(url)

# Convert the DataFrame to JSON
if result_df is not None:
    result_df.to_json('match_info.json')
    print('match_info successfully created')

match_info successfully created


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_additional_text(url, selector):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the element based on the provided selector
        element = soup.select_one(selector)

        if element:
            # Extract text from the element
            text = element.get_text(strip=True)
            return text
        else:
            print(f'Element not found with selector: {selector}')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None



def scrape_table_data(url,table_index):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find the table element based on your website's structure
        table = soup.find_all('table')[table_index] # Adjust this based on the HTML structure

        if table:
            # Extract data from the table
            table_data = []
            rows = table.find_all('tr')

            for row in rows:
                # Extract data from each cell in the row
                cells = row.find_all(['td', 'th'])
                row_data = [cell.text.strip() for cell in cells]
                table_data.append(row_data)

            # Create a DataFrame from the table data
            df = pd.DataFrame(table_data[1:], columns=table_data[0])

            # Rename the column at index 1 to "bowler"
            df.rename(columns={df.columns[1]: 'bowler'}, inplace=True)

            return df.dropna()
            
        else:
            print('Table not found on the page.')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None
    

def process_urls_from_json(json_filename):
    # Read URLs and match data from the JSON file
    with open(json_filename, 'r') as json_file:
        data = pd.read_json(json_file, orient='records')

    # Process each URL
    output_list = []  # List to store dictionaries for each iteration
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']

        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,0)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering
            result_df.insert(2, 'BattingPos', range(1, len(result_df) + 1))

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('batting_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
    
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']
        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,2)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering
            result_df.insert(2, 'BattingPos', range(1, len(result_df) + 1))

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('batting_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
# Example usage:
json_filename = 'match_info.json'  # Replace with the actual JSON file containing URLs and match data
process_urls_from_json(json_filename)


  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-lin

ConnectionError: HTTPSConnectionPool(host='www.espncricinfo.com', port=443): Max retries exceeded with url: /series/icc-cricket-world-cup-2023-24-1367856/netherlands-vs-south-africa-15th-match-1384406/full-scorecard (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002243D910E60>: Failed to resolve 'www.espncricinfo.com' ([Errno 11001] getaddrinfo failed)"))

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_additional_text(url, selector):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the element based on the provided selector
        element = soup.select_one(selector)

        if element:
            # Extract text from the element
            text = element.get_text(strip=True)
            return text
        else:
            print(f'Element not found with selector: {selector}')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None



def scrape_table_data(url,table_index):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find the table element based on your website's structure
        table = soup.find_all('table')[table_index] # Adjust this based on the HTML structure

        if table:
            # Extract data from the table
            table_data = []
            rows = table.find_all('tr')

            for row in rows:
                # Extract data from each cell in the row
                cells = row.find_all(['td', 'th'])
                row_data = [cell.text.strip() for cell in cells]
                table_data.append(row_data)

            # Create a DataFrame from the table data
            df = pd.DataFrame(table_data[1:], columns=table_data[0])

            df.rename(columns={df.columns[0]: 'bowlerName'}, inplace=True)
            df.rename(columns={df.columns[1]: 'Overs'}, inplace=True)
            df.rename(columns={df.columns[2]: 'Maiden'}, inplace=True)
            df.rename(columns={df.columns[3]: 'Runs'}, inplace=True)
            df.rename(columns={df.columns[4]: 'Wickets'}, inplace=True)
            df.rename(columns={df.columns[5]: 'Economy'}, inplace=True)

            return df.dropna()
            
        else:
            print('Table not found on the page.')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None
    

def process_urls_from_json(json_filename):
    # Read URLs and match data from the JSON file
    with open(json_filename, 'r') as json_file:
        data = pd.read_json(json_file, orient='records')

    # Process each URL
    output_list = []  # List to store dictionaries for each iteration
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']

        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,1)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('bowling_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
    
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']
        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,3)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('bowling_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
# Example usage:
json_filename = 'match_info.json'  # Replace with the actual JSON file containing URLs and match data
process_urls_from_json(json_filename)


  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-lin

ConnectionError: HTTPSConnectionPool(host='www.espncricinfo.com', port=443): Max retries exceeded with url: /series/icc-cricket-world-cup-2023-24-1367856/india-vs-australia-final-1384439/full-scorecard (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002243CB29CA0>: Failed to resolve 'www.espncricinfo.com' ([Errno 11001] getaddrinfo failed)"))

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_additional_text(url, selector):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the element based on the provided selector
        element = soup.select_one(selector)

        if element:
            # Extract text from the element
            text = element.get_text(strip=True)
            return text
        else:
            print(f'Element not found with selector: {selector}')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None



def scrape_table_data(url,table_index):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
    
        # Find the table element based on your website's structure
        table = soup.find_all('table')[table_index] # Adjust this based on the HTML structure

        if table:
            # Extract data from the table
            table_data = []
            rows = table.find_all('tr')

            for row in rows:
                # Extract data from each cell in the row
                cells = row.find_all(['td', 'th'])
                row_data = [cell.text.strip() for cell in cells]
                table_data.append(row_data)

            # Create a DataFrame from the table data
            df = pd.DataFrame(table_data[1:], columns=table_data[0])

            df.rename(columns={df.columns[0]: 'bowlerName'}, inplace=True)
            df.rename(columns={df.columns[1]: 'Overs'}, inplace=True)
            df.rename(columns={df.columns[2]: 'Maiden'}, inplace=True)
            df.rename(columns={df.columns[3]: 'Runs'}, inplace=True)
            df.rename(columns={df.columns[4]: 'Wickets'}, inplace=True)
            df.rename(columns={df.columns[5]: 'Economy'}, inplace=True)

            return df.dropna()
            
        else:
            print('Table not found on the page.')
            return None
    else:
        print(f'Error {response.status_code}: Failed to retrieve the page.')
        return None
    

def process_urls_from_json(json_filename):
    # Read URLs and match data from the JSON file
    with open(json_filename, 'r') as json_file:
        data = pd.read_json(json_file, orient='records')

    # Process each URL
    output_list = []  # List to store dictionaries for each iteration
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']

        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,1)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('bowling_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
    
    for index, row in data.iterrows():
        url = row['link']
        match_value = row['match']
        selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
        additional_text = scrape_additional_text(url, selector)
        result_df = scrape_table_data(url,3)

        # Add a new column named 'match' at the 1st position and populate it with match values
        if result_df is not None:
            result_df.insert(0, 'match', match_value)

            # Add a new column named 'BattingPos' at the 3rd position and insert numbering

            result_df.insert(1, 'teamInnings', additional_text)

            # Convert DataFrame to list of dictionaries
            output_list.extend(result_df.dropna().to_dict(orient='records'))

    # Write the list of dictionaries to a JSON file
    with open('bowling_info.json', 'w') as json_output_file:
        json.dump(output_list, json_output_file, indent=2)
# Example usage:
json_filename = 'match_info.json'  # Replace with the actual JSON file containing URLs and match data
process_urls_from_json(json_filename)


  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(2) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
  selector = "#main-container > div.ds-relative > div.lg\:ds-container.lg\:ds-mx-auto.lg\:ds-px-5.lg\:ds-pt-4 > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div.ds-mt-3 > div:nth-child(1) > div:nth-child(3) > div > div.ds-flex.ds-px-4.ds-border-b.ds-border-line.ds-py-3.ds-bg-ui-fill-translucent-hover > div > span > span.ds-text-title-xs.ds-font-bold.ds-capitalize"
