In [None]:
import csv
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Root of the archived site; links with more than one slash are joined onto it.
BASE_URL = "https://www-archive.mozilla.org"
# Index of the front-end test cases; each section name is joined onto this URL.
PAGE_URL_TEMPLATE = f"{BASE_URL}/quality/browser/front-end/testcases/"

def scrape_test_page(page_url, req_counter, test_counter, path, timeout=30):
    """Scrape one test-case index page plus the test pages it links to.

    Every table row with 2, 3 or 4 ``<td>`` cells becomes one requirement and
    one test. When the row contains a link, the linked page is fetched and its
    "Purpose", "Initial Conditions", "Steps/Description" (or "Description")
    and "Expected Results" sections are collected into the test record.

    Args:
        page_url: URL of the index page to download.
        req_counter: Number used for the first requirement ID (``R-<n>``).
        test_counter: Number used for the first test ID (``T-<n>``).
        path: Section name, joined onto ``PAGE_URL_TEMPLATE`` to build the
            base URL against which short relative links are resolved.
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection cannot hang the notebook.

    Returns:
        A tuple ``(requirements, tests, mappings, req_counter, test_counter)``:
        three lists of dicts (keys ``ID``/``Feature``/``Description``,
        ``ID``/``Purpose``/``TestSteps`` and ``ReqID``/``TestID``) and the two
        counters advanced by the number of rows that were recorded.

    Raises:
        requests.RequestException: If a page cannot be downloaded (including
            when ``timeout`` is exceeded).
    """
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    requirements = []
    tests = []
    mappings = []

    # Base for short relative links. It is the same for every row, so it is
    # computed once here rather than on each link.
    current_page_url = urljoin(PAGE_URL_TEMPLATE, path)
    if not current_page_url.endswith("/"):
        current_page_url += "/"

    def extract_section(soup, section_name):
        """Return the text of the section headed by an <h2> containing section_name.

        The match is case-insensitive. The result is the heading text followed
        by the text of each following sibling element up to the next <h2>,
        joined with newlines. Returns "" when no such heading exists.
        """
        h2 = soup.find("h2", string=lambda text: text and section_name.lower() in text.lower())
        if not h2:
            return ""

        parts = [h2.get_text(strip=True)]
        next_elem = h2.find_next_sibling()
        while next_elem and next_elem.name != 'h2':
            parts.append(next_elem.get_text(separator=' ', strip=True))
            next_elem = next_elem.find_next_sibling()

        return "\n".join(parts).strip()

    def build_full_link(relative_link):
        """Turn an href taken from the index page into an absolute URL ("" if none)."""
        if not relative_link:
            return ""

        # Count slashes to determine if relative_link is a short name
        if relative_link.count('/') <= 1:
            # E.g. relative_link is "Streamlined%20interface.htm", then build full path
            return urljoin(current_page_url, relative_link)

        # E.g. relative_link is /quality/browser/front-end/testcases/help/tssignon
        # then resolve it against the site root.
        # NOTE(review): a relative link with several slashes but no leading "/"
        # (e.g. "../help/x.htm") is also resolved against the site root here --
        # confirm that is what the scraped pages need.
        return urljoin(BASE_URL, relative_link)

    for row in soup.find_all("tr"):
        tds = row.find_all("td")
        cell_count = len(tds)

        # For  "drag-drop", "form-manager" page layouts
        if cell_count == 2:
            feature_td = tds[0]
            description = tds[1].get_text(strip=True)
            link_tag = feature_td.find("a")
            # Keep only the cell's direct text nodes and the text of its links.
            feature_parts = []
            for content in feature_td.contents:
                if isinstance(content, str):
                    feature_parts.append(content.strip())
                elif content.name == 'a':
                    feature_parts.append(content.get_text(strip=True))
            feature = " ".join(feature_parts).strip()
        # For  "oji", "password-manager", "printing", "plugins", "xpapps-gui", "themes" page layouts
        elif cell_count == 3:
            feature = tds[0].get_text(strip=True)
            description = tds[1].get_text(strip=True)
            link_tag = tds[2].find("a")
        # For all other page layouts
        elif cell_count == 4:
            feature = tds[1].get_text(strip=True)
            description = tds[2].get_text(strip=True)
            link_tag = tds[3].find("a")
        else:
            # Skip rows that are not 2, 3 or 4 columns. Rows with 5+ cells must
            # be skipped too; otherwise they would reuse the previous row's
            # values (or fail with NameError on the first row).
            continue

        # .get() tolerates anchors without an href (e.g. <a name="...">).
        relative_link = link_tag.get('href') if link_tag else None
        full_link = build_full_link(relative_link)

        # Save Requirement
        req_id = f"R-{req_counter}"
        requirements.append({
            "ID": req_id,
            "Feature": feature,
            "Description": description
        })

        purpose_text = ""
        test_steps_combined = ""

        if full_link:
            test_response = requests.get(full_link, timeout=timeout)
            test_soup = BeautifulSoup(test_response.content, "html.parser")

            purpose_text = extract_section(test_soup, "Purpose")
            initial_conditions = extract_section(test_soup, "Initial Conditions")
            steps_text = extract_section(test_soup, "Steps/Description")
            # Some sections are named "Description" instead of "Steps/Description"
            desc_text = steps_text if steps_text else extract_section(test_soup, "Description")
            expected_results = extract_section(test_soup, "Expected Results")

            # Empty sections are dropped so no blank paragraphs are produced.
            test_steps_combined = "\n\n".join(filter(None, [
                initial_conditions,
                desc_text,
                expected_results
            ]))

        # Save Test (numbered from test_counter, not req_counter)
        test_id = f"T-{test_counter}"
        tests.append({
            "ID": test_id,
            "Purpose": purpose_text,
            "TestSteps": test_steps_combined
        })

        # Save Mapping
        mappings.append({
            "ReqID": req_id,
            "TestID": test_id
        })

        req_counter += 1
        test_counter += 1

    return requirements, tests, mappings, req_counter, test_counter

In [48]:

# Sections of the test-case index to scrape, in output order.
TARGET_PATHS = [
    "bookmarks", "copy-paste", "drag-drop", "form-manager",
    "help", "history", "imaging", "oji",
    "password-manager", "printing", "plugins", "search",
    "selection", "sidebar", "xpapps-gui", "themes",
    "toolbars",
]

# Accumulators for the rows gathered across every section.
all_requirements, all_tests, all_mappings = [], [], []

# IDs are numbered continuously across sections, starting at 1.
req_counter = test_counter = 1

for path in TARGET_PATHS:
    page_url = urljoin(PAGE_URL_TEMPLATE, path)
    # The scraper hands back the advanced counters so the next section
    # continues the numbering where this one stopped.
    reqs, tests, maps, req_counter, test_counter = scrape_test_page(
        page_url, req_counter, test_counter, path
    )

    all_requirements += reqs
    all_tests += tests
    all_mappings += maps

In [49]:

# Write to CSV files
OUTPUT_DIR = Path('data/Mozilla2')


def write_csv(file_path, fieldnames, rows):
    """Write rows (a list of dicts) to file_path as UTF-8 CSV with a header row."""
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


# Create the target directory first so a fresh checkout does not fail with
# FileNotFoundError on the first open().
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# (file name, column order, rows) for each exported table.
OUTPUT_TABLES = [
    ('RE.csv', ['ID', 'Feature', 'Description'], all_requirements),
    ('ST.csv', ['ID', 'Purpose', 'TestSteps'], all_tests),
    ('mapping.csv', ['ReqID', 'TestID'], all_mappings),
]

for file_name, fieldnames, rows in OUTPUT_TABLES:
    write_csv(OUTPUT_DIR / file_name, fieldnames, rows)

# Report the files that were actually written.
saved_names = ", ".join(file_name for file_name, _, _ in OUTPUT_TABLES)
print(f"Files saved in {OUTPUT_DIR}: {saved_names}")

Files saved: requirements.csv, tests.csv, mapping.csv
