## Imports

In [2]:
import os
from bs4 import BeautifulSoup
import csv
import re

## Finding prioritized entry

In [6]:
def find_prioritized_entry(entries):
    """Pick the entry to report from a list of captured property rows.

    The first entry (in list order) whose ``"comment"`` text contains any of
    the preferred markers wins. Markers are tested by substring, so their
    order in the tuple below does not influence which entry is chosen.

    Args:
        entries: List of dicts, each with a ``"comment"`` string.

    Returns:
        The first entry carrying a preferred marker; otherwise the first
        entry of the list; ``None`` when the list is empty.
    """
    preferred_markers = (
        "AA; Typical",
        "Average of tension and compression",
        "Typical;",
        "Typical",
        "AA2024;",
        "AA2024",
        "AA;",
    )

    def _has_preferred_marker(candidate):
        return any(marker in candidate["comment"] for marker in preferred_markers)

    first_preferred = next(
        (candidate for candidate in entries if _has_preferred_marker(candidate)),
        None,
    )
    if first_preferred is not None:
        return first_preferred

    # Nothing carried a preferred marker: fall back to the first entry, if any
    if entries:
        return entries[0]
    return None


## Density

In [7]:
def extract_density(soup):
    """Return the first density reading (in g/cc) found in the document's tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches "Density" (case-insensitive) opens a capture section, a row with a
    blank first cell leaves the section state untouched, and any other label
    closes it. Inside an open section, a row with at least three cells whose
    second cell contains ``<number> g/cc`` yields an entry.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'`` for the first captured row, or ``None`` when no row
        was captured.
    """
    label_patterns = [
        r"Density"
    ]
    label_matcher = re.compile('|'.join(label_patterns), re.IGNORECASE)
    reading_matcher = re.compile(r"(\d+\.?\d*)\s*(g/cc)")

    readings = []
    in_section = False  # True while we are inside a run of density rows

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue

        label = tds[0].get_text(strip=True)
        if label_matcher.search(label):
            in_section = True
        elif label.strip():
            # A different, non-blank property label ends the section
            in_section = False
        # A blank label is a continuation row: the state stays as it is

        if not (in_section and len(tds) >= 3):
            continue

        found = reading_matcher.search(tds[1].get_text(strip=True))
        if found is None:
            continue

        number, unit = found.groups()
        # The comment lives in the fourth cell, which may be absent
        note = tds[3].get_text(strip=True) if len(tds) > 3 else ""
        readings.append({
            'metric_value': number,
            'metric_unit': unit,
            'comment': note
        })

    if not readings:
        return None
    return readings[0]

## Young's Modulus

In [8]:
def extract_youngs_modulus(soup):
    """Return the preferred elastic-modulus reading from the document's tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches one of the modulus labels (case-insensitive) opens a capture
    section, a blank first cell leaves the section state untouched, and any
    other label closes it. Inside an open section, a row with at least three
    cells whose second cell contains a number followed by a word yields an
    entry. The captured entries are then handed to ``find_prioritized_entry``.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'``, or ``None`` when no row was captured.
    """
    # Property labels that open a capture section
    label_patterns = [
        r"Tensile\s+Modulus", 
        r"Modulus\s+of\s+Elasticity",
        r"Elastic\s+Modulus"
    ]
    label_matcher = re.compile('|'.join(label_patterns), re.IGNORECASE)
    reading_matcher = re.compile(r"(\d+\.?\d*)\s*(\w+)")

    readings = []
    in_section = False  # True while we are inside a run of modulus rows

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue

        label = tds[0].get_text(strip=True)
        if label_matcher.search(label):
            in_section = True
        elif label.strip():
            # A different, non-blank property label ends the section
            in_section = False
        # A blank label is a continuation row: the state stays as it is

        if not (in_section and len(tds) >= 3):
            continue

        found = reading_matcher.search(tds[1].get_text(strip=True))
        if found is None:
            continue

        number, unit = found.groups()
        # The comment lives in the fourth cell, which may be absent
        note = tds[3].get_text(strip=True) if len(tds) > 3 else ""
        readings.append({
            'metric_value': number,
            'metric_unit': unit,
            'comment': note
        })

    chosen = find_prioritized_entry(readings)
    if not readings:
        return None
    return chosen

## Yield Strength

In [9]:
def extract_yield_strength(soup):
    """Return the preferred yield-strength reading from the document's tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches one of the yield/proof-strength labels (case-insensitive) opens a
    capture section, a blank first cell leaves the section state untouched,
    and any other label closes it. Inside an open section, a row with at least
    three cells whose second cell contains a number followed by a word yields
    an entry. The captured entries are then handed to
    ``find_prioritized_entry``.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'``, or ``None`` when no row was captured.
    """
    # Property labels that open a capture section
    label_patterns = [
        r"Tensile\s+Strength,\s*Yield",
        r"Tensile\s+Strength,\s*Yield\s*",
        r"Tensile\s+Strength\s*Yield",
        r"Yield\s+Strength", 
        r"0\.2%\s*Proof\s*Stress",
        r"0\.2%\s*Yield\s*Strength",
        r"Proof\s+Strength",
        r"Lower\s+Yield\s+Strength"
    ]
    label_matcher = re.compile('|'.join(label_patterns), re.IGNORECASE)
    reading_matcher = re.compile(r"(\d+\.?\d*)\s*(\w+)")

    readings = []
    in_section = False  # True while we are inside a run of yield-strength rows

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue

        label = tds[0].get_text(strip=True)
        if label_matcher.search(label):
            in_section = True
        elif label.strip():
            # A different, non-blank property label ends the section
            in_section = False
        # A blank label is a continuation row: the state stays as it is

        if not (in_section and len(tds) >= 3):
            continue

        found = reading_matcher.search(tds[1].get_text(strip=True))
        if found is None:
            continue

        number, unit = found.groups()
        # The comment lives in the fourth cell, which may be absent
        note = tds[3].get_text(strip=True) if len(tds) > 3 else ""
        readings.append({
            'metric_value': number,
            'metric_unit': unit,
            'comment': note
        })

    chosen = find_prioritized_entry(readings)
    if not readings:
        return None
    return chosen

## Tensile Strength

In [10]:
def extract_tensile_strength(soup):
    """Return the preferred ultimate-tensile-strength reading from the tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches one of the ultimate-strength labels (case-insensitive) opens a
    capture section, a blank first cell leaves the section state untouched,
    and any other label closes it. Inside an open section, a row with at least
    three cells whose second cell contains a number followed by a word yields
    an entry. The captured entries are then handed to
    ``find_prioritized_entry``.

    Note that the ``UTS`` pattern is unanchored, so it matches that letter
    sequence anywhere inside a label.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'``, or ``None`` when no row was captured.
    """
    # Property labels that open a capture section
    label_patterns = [
        r"Tensile\s+Strength,\s*Ultimate",
        r"Ultimate\s+Tensile\s+Strength",
        r"UTS"
    ]
    label_matcher = re.compile('|'.join(label_patterns), re.IGNORECASE)
    reading_matcher = re.compile(r"(\d+\.?\d*)\s*(\w+)")

    readings = []
    in_section = False  # True while we are inside a run of tensile-strength rows

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue

        label = tds[0].get_text(strip=True)
        if label_matcher.search(label):
            in_section = True
        elif label.strip():
            # A different, non-blank property label ends the section
            in_section = False
        # A blank label is a continuation row: the state stays as it is

        if not (in_section and len(tds) >= 3):
            continue

        found = reading_matcher.search(tds[1].get_text(strip=True))
        if found is None:
            continue

        number, unit = found.groups()
        # The comment lives in the fourth cell, which may be absent
        note = tds[3].get_text(strip=True) if len(tds) > 3 else ""
        readings.append({
            'metric_value': number,
            'metric_unit': unit,
            'comment': note
        })

    chosen = find_prioritized_entry(readings)
    if not readings:
        return None
    return chosen

## Fatigue Strength

In [11]:
def extract_fatigue_strength(soup):
    """Return the preferred fatigue-strength reading from the document's tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches one of the fatigue/endurance labels (case-insensitive) opens a
    capture section, a blank first cell leaves the section state untouched,
    and any other label closes it. Inside an open section, a row with at least
    three cells whose second cell contains a number followed by a word yields
    an entry. The captured entries are then handed to
    ``find_prioritized_entry``.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'``, or ``None`` when no row was captured.
    """
    # Property labels that open a capture section
    label_patterns = [
        r"Fatigue\s+Strength",
        r"Endurance\s+Limit",
        r"Fatigue\s+Limit",
        r"Endurance\s+Strength"
    ]
    label_matcher = re.compile('|'.join(label_patterns), re.IGNORECASE)
    reading_matcher = re.compile(r"(\d+\.?\d*)\s*(\w+)")

    readings = []
    in_section = False  # True while we are inside a run of fatigue-strength rows

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue

        label = tds[0].get_text(strip=True)
        if label_matcher.search(label):
            in_section = True
        elif label.strip():
            # A different, non-blank property label ends the section
            in_section = False
        # A blank label is a continuation row: the state stays as it is

        if not (in_section and len(tds) >= 3):
            continue

        found = reading_matcher.search(tds[1].get_text(strip=True))
        if found is None:
            continue

        number, unit = found.groups()
        # The comment lives in the fourth cell, which may be absent
        note = tds[3].get_text(strip=True) if len(tds) > 3 else ""
        readings.append({
            'metric_value': number,
            'metric_unit': unit,
            'comment': note
        })

    chosen = find_prioritized_entry(readings)
    if not readings:
        return None
    return chosen

## Specific Heat

In [12]:
def extract_specific_heat(soup):
    """Return the first specific-heat reading found in the document's tables.

    Every ``<tr>`` is visited in document order. A row whose first ``<td>``
    matches "Specific Heat" / "Specific Heat Capacity" (case-insensitive)
    opens a capture section, a blank first cell leaves the section state
    untouched, and any other label closes it. Inside an open section, a row
    with at least three cells whose second cell contains a number followed by
    a unit yields an entry.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``find_all('td')`` and cells ``get_text(strip=True)``.

    Returns:
        dict with string values under ``'metric_value'``, ``'metric_unit'``
        and ``'comment'`` for the first captured row, or ``None`` when no row
        was captured.
    """
    row_patterns = [
        r"Specific\s+Heat\s+Capacity",
        r"Specific\s+Heat"
    ]
    pattern_regex = re.compile('|'.join(row_patterns), re.IGNORECASE)

    # The fractional part is optional so integer values ("1 J/g-°C") are
    # captured like decimal ones. The unit must start with something that is
    # not a digit, dot or whitespace; otherwise backtracking would split a
    # bare number such as "0.896" into value "0.89" and unit "6".
    value_regex = re.compile(r"(\d+(?:\.\d*)?)\s*([^\d.\s].*)")

    specific_heat_data = []
    capturing = False  # True while we are inside a run of specific-heat rows

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue

        property_name = cells[0].get_text(strip=True)
        if pattern_regex.search(property_name):
            capturing = True
        elif property_name.strip():
            # A different, non-blank property label ends the section
            capturing = False
        # A blank label is a continuation row: the state stays as it is

        if not (capturing and len(cells) >= 3):
            continue

        metric_match = value_regex.search(cells[1].get_text(strip=True))
        if not metric_match:
            continue

        metric_value, metric_unit = metric_match.groups()
        # The comment lives in the fourth cell, which may be absent
        comment = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        specific_heat_data.append({
            'metric_value': metric_value,
            'metric_unit': metric_unit,
            'comment': comment
        })

    return specific_heat_data[0] if specific_heat_data else None

## CTE Extraction

In [13]:
def extract_cte_values(soup):
    """Pick one coefficient-of-thermal-expansion reading from the document.

    Every ``<tr>`` whose text matches one of the header patterns
    (case-insensitive) marks its enclosing ``<table>`` for scanning. Each
    row of such a table has its ``<td>`` texts joined with spaces and matched
    against ``<value> µm/m-°C @Temperature <start> - <end> °C``.

    Selection order:
      1. a reading whose range is exactly 20 to 100 °C,
      2. otherwise the first reading whose range starts or ends at 20 °C,
      3. otherwise the first reading found.

    Temperatures are compared numerically, so "20", "20.0" and "20.00" are
    treated alike in both step 1 and step 2.

    Args:
        soup: Parsed document exposing ``find_all('tr')``; rows must expose
            ``get_text(strip=True)``, ``find_parent('table')`` and
            ``find_all('td')``; tables must expose ``find_all('tr')``.

    Returns:
        A single-element list holding a dict with string values under
        ``"value"``, ``"start_temp"`` and ``"end_temp"`` (units appended), or
        ``None`` when no reading was found.
    """
    header_patterns = [
        r"CTE",
        r"CTE,\s*linear",
        r"Coefficient\s+of\s+Thermal\s+Expansion",
        # "Coeffici?ent" accepts the correct spelling and the "Coefficent" typo
        r"Thermal\s+Expansion\s+Coeffici?ent",
        r"Linear\s+Thermal\s+Expansion"
    ]
    header_regex = re.compile('|'.join(header_patterns), re.IGNORECASE)

    # Regex to extract the CTE value and temperature range
    cte_regex = re.compile(r"(\d+\.?\d*)\s*µm/m-°C\s*@Temperature\s*(-?\d+\.?\d*)\s*-\s*(-?\d+\.?\d*)\s*°C")

    candidates = []        # (entry, start, end) in order of first discovery
    prioritized_cte = None
    # id(table) -> last 20-100 °C entry of that table (or None). A table is
    # parsed once even when several of its rows match a header pattern; the
    # cached value keeps the "latest matching header's table wins" outcome of
    # re-scanning. id() is used because the tables stay alive in the tree.
    best_by_table = {}

    for header in soup.find_all('tr'):
        if not header_regex.search(header.get_text(strip=True)):
            continue
        parent_table = header.find_parent('table')
        if parent_table is None:
            continue

        table_key = id(parent_table)
        if table_key not in best_by_table:
            table_best = None
            for row in parent_table.find_all('tr'):
                cells = row.find_all('td')
                if not cells:
                    continue
                cell_text = ' '.join(cell.get_text(strip=True) for cell in cells)
                match = cte_regex.search(cell_text)
                if not match:
                    continue
                value, start_temp, end_temp = match.groups()
                cte_entry = {
                    "value": value + " µm/m-°C",
                    "start_temp": start_temp + " °C",
                    "end_temp": end_temp + " °C"
                }
                # The regex groups always form valid float literals
                start, end = float(start_temp), float(end_temp)
                candidates.append((cte_entry, start, end))
                if start == 20.0 and end == 100.0:
                    table_best = cte_entry
            best_by_table[table_key] = table_best

        if best_by_table[table_key] is not None:
            prioritized_cte = best_by_table[table_key]

    if prioritized_cte:
        return [prioritized_cte]

    # No exact 20-100 range: prefer the first range touching 20 °C
    for cte_entry, start, end in candidates:
        if start == 20.0 or end == 20.0:
            return [cte_entry]

    # Otherwise fall back to the first reading, if there is one
    return [candidates[0][0]] if candidates else None

## Element Extraction

In [14]:
def extract_element_properties(soup):
    """Collect the rows listed under the "Component Elements Properties" header.

    The function looks up the ``<th>`` whose string is exactly
    'Component Elements Properties', takes its enclosing ``<table>``, finds
    the first ``<tr>`` that directly contains that ``<th>``, and returns the
    ``<td>`` texts of every ``<tr>`` after it.

    Args:
        soup: Parsed document exposing ``find('th', string=...)``.

    Returns:
        List with one list of stripped cell texts per following row (a row
        without ``<td>`` cells contributes an empty list). Empty when the
        header, its table, or the row holding the header cannot be found.
    """
    heading = soup.find('th', string='Component Elements Properties')
    if not heading:
        return []

    table = heading.find_parent('table')
    if not table:
        return []

    table_rows = table.find_all('tr')

    # Position of the first row after the one that holds the heading cell
    first_data_row = next(
        (position + 1 for position, tr in enumerate(table_rows) if heading in tr),
        None,
    )
    if first_data_row is None:
        return []

    return [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in table_rows[first_data_row:]
    ]

## Running

In [15]:
# Function to parse a single HTML file and extract required properties
def parse_html(file_path):
    """Parse one saved HTML page and gather the properties written to the CSV.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8.

    Returns:
        dict keyed by the CSV column names. "Alloy Name" is the stripped text
        of the page's ``<title>`` (an empty string when the page has no
        ``<title>``); every other value is whatever the corresponding
        ``extract_*`` function returns for the parsed page.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # soup.title is None when the page has no <title>; fall back to an empty
    # name instead of raising AttributeError and aborting the whole export
    title_tag = soup.title
    alloy_name = title_tag.text.strip() if title_tag is not None else ""

    extracted_properties = {
        "Alloy Name": alloy_name,
        "Density": extract_density(soup),
        "Young's Modulus": extract_youngs_modulus(soup),
        "Yield Strength": extract_yield_strength(soup),
        "Tensile Strength": extract_tensile_strength(soup),
        "Fatigue Strength": extract_fatigue_strength(soup),
        "Specific Heat": extract_specific_heat(soup),
        "Coefficient of Thermal Expansion": extract_cte_values(soup),
        "Metal Chemical Notations": extract_element_properties(soup)
    }

    return extracted_properties

### Al

In [None]:
# Parse every .html page under ./data/pagesAl and write one CSV row per page
directory_path = './data/pagesAl'
# os.listdir returns names in arbitrary order; sorting keeps the CSV row
# order reproducible from run to run
html_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.html'))

csv_file_path = 'extracted_Al_properties.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    # Column names must match the keys of the dict returned by parse_html
    writer = csv.DictWriter(file, fieldnames=[
        "Alloy Name",
        "Density",
        "Young's Modulus",
        "Yield Strength",
        "Tensile Strength",
        "Fatigue Strength",
        "Specific Heat",
        "Coefficient of Thermal Expansion",
        "Metal Chemical Notations",
    ])
    writer.writeheader()
    for html_file in html_files:
        file_path = os.path.join(directory_path, html_file)
        properties = parse_html(file_path)
        writer.writerow(properties)

print(f"Data extracted and written to {csv_file_path}.")

### Ni

In [16]:
# Parse every .html page under ./data/pagesNi and write one CSV row per page
directory_path = './data/pagesNi'
# os.listdir returns names in arbitrary order; sorting keeps the CSV row
# order reproducible from run to run
html_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.html'))

csv_file_path = 'extracted_Ni_properties.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    # Column names must match the keys of the dict returned by parse_html
    writer = csv.DictWriter(file, fieldnames=[
        "Alloy Name",
        "Density",
        "Young's Modulus",
        "Yield Strength",
        "Tensile Strength",
        "Fatigue Strength",
        "Specific Heat",
        "Coefficient of Thermal Expansion",
        "Metal Chemical Notations",
    ])
    writer.writeheader()
    for html_file in html_files:
        file_path = os.path.join(directory_path, html_file)
        properties = parse_html(file_path)
        writer.writerow(properties)

print(f"Data extracted and written to {csv_file_path}.")

Data extracted and written to extracted_Ni_properties.csv.


### Ti

In [17]:
# Parse every .html page under ./data/pagesTi and write one CSV row per page
directory_path = './data/pagesTi'
# os.listdir returns names in arbitrary order; sorting keeps the CSV row
# order reproducible from run to run
html_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.html'))

csv_file_path = 'extracted_Ti_properties.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    # Column names must match the keys of the dict returned by parse_html
    writer = csv.DictWriter(file, fieldnames=[
        "Alloy Name",
        "Density",
        "Young's Modulus",
        "Yield Strength",
        "Tensile Strength",
        "Fatigue Strength",
        "Specific Heat",
        "Coefficient of Thermal Expansion",
        "Metal Chemical Notations",
    ])
    writer.writeheader()
    for html_file in html_files:
        file_path = os.path.join(directory_path, html_file)
        properties = parse_html(file_path)
        writer.writerow(properties)

print(f"Data extracted and written to {csv_file_path}.")

Data extracted and written to extracted_Ti_properties.csv.
