In [0]:
import subprocess
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.parse import urljoin

In [0]:
from pyspark.sql import SparkSession

# Obtain the shared Spark session for this notebook (reuses an existing
# session if one is already active, otherwise creates it).
spark = (
    SparkSession.builder
    .appName("DataIngestionPipeline")
    .getOrCreate()
)

In [0]:
# Configure S3A access (only needed when not relying on IAM roles).
# Credentials are read from the environment rather than being written into
# the notebook. When either variable is missing, no static keys are set, so
# the session is not pinned to invalid placeholder credentials.
hadoop_conf = spark._jsc.hadoopConfiguration()
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
if aws_access_key and aws_secret_key:
    hadoop_conf.set("fs.s3a.access.key", aws_access_key)
    hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

In [0]:
def get_file_names_and_links():
    """Scrape the Cricsheet downloads page and display a name/link table.

    Fetches https://cricsheet.org/downloads/, collects the JSON archive links
    found inside ``<ul class="formatlinks">`` lists, pairs them by position
    with the text of the ``<td class="name">`` cells, and renders the result
    as a Spark DataFrame with the columns "Event/Country Name" and "Links".

    Returns:
        None. The table is shown with ``display``. If the page does not
        answer with HTTP 200, or nothing could be scraped, a message is
        printed instead.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    data_url = "https://cricsheet.org/downloads/"
    # Without a timeout an unresponsive server would block the cell forever.
    response = requests.get(data_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect hrefs from every <a> inside the <ul class="formatlinks"> lists.
    # The substring test on "male" also matches "female", so both
    # gender-specific archives are skipped; only JSON links are kept.
    filtered_links = []
    for ul in soup.find_all('ul', class_='formatlinks'):
        for a_tag in ul.find_all('a', href=True):
            href = a_tag['href']
            if "male" not in href and "json" in href:
                filtered_links.append(href)

    # Name cells; rowspan=False restricts the match to cells that carry no
    # rowspan attribute.
    td_elements = soup.find_all('td', class_='name', attrs={'rowspan': False})

    for td in td_elements:
        # Drop the <span class="withheld"> child so its text does not end up
        # in the extracted name.
        withheld_span = td.find('span', class_='withheld')
        if withheld_span:
            withheld_span.decompose()

    print(f"Number of filtered links: {len(filtered_links)}")
    print(f"Number of <td> elements: {len(td_elements)}")
    print("--------------------------------")

    # Names and links are matched purely by position and zip() stops at the
    # shorter sequence, so make a count mismatch visible instead of silently
    # dropping (and possibly misaligning) rows.
    if len(filtered_links) != len(td_elements):
        print(
            "Warning: link and name counts differ; rows are paired by "
            "position and unmatched trailing entries are dropped."
        )

    # Prepare the rows for the DataFrame: absolute URLs and cleaned names.
    data_url2 = "https://cricsheet.org/"
    absolute_links = [urljoin(data_url2, link) for link in filtered_links]
    names = [td.text.strip() for td in td_elements]
    data = list(zip(names, absolute_links))

    if not data:
        # Spark cannot infer a schema from an empty list of rows, so report
        # the empty result rather than failing inside createDataFrame.
        print("No matching names/links found on the page; nothing to display.")
        return

    # Create and show the DataFrame.
    columns = ["Event/Country Name", "Links"]
    file_names_df = spark.createDataFrame(data, columns)
    display(file_names_df)


In [0]:
# Run the scrape; the function prints the match counts and displays the
# resulting name/link table itself (it returns nothing).
get_file_names_and_links()

Number of filtered links: 434
Number of <td> elements: 434
--------------------------------


Event/Country Name,Links
Added in the previous 2 days,https://cricsheet.org/downloads/recently_added_2_json.zip
Added in the previous 7 days,https://cricsheet.org/downloads/recently_added_7_json.zip
Added in the previous 30 days,https://cricsheet.org/downloads/recently_added_30_json.zip
Played in the previous 2 days,https://cricsheet.org/downloads/recently_played_2_json.zip
Played in the previous 7 days,https://cricsheet.org/downloads/recently_played_7_json.zip
Played in the previous 30 days,https://cricsheet.org/downloads/recently_played_30_json.zip
All matches,https://cricsheet.org/downloads/all_json.zip
Test matches,https://cricsheet.org/downloads/tests_json.zip
Multi-day matches,https://cricsheet.org/downloads/mdms_json.zip
One-day internationals,https://cricsheet.org/downloads/odis_json.zip
