## Convert irradiation PDFs to TXT and use the extracted values to calculate Hp
---
It will be similar to half-life search, but I don't 

In [1]:
import numpy as np
import pdfplumber
import os
import re
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, PatternFill, Border, Alignment

### Specific for Otto_2016.pdf

#### Here is the code for Otto_2016.pdf. It converts Table 2 into four txt files.
* (Change it to "Code", and run it.)
* Includes four pieces of information (one per txt file):
  1. Hp10_100cm_shield;
  2. Hp10_100cm_unshield
  3. Hp07_10cm_shield
  4. Hp07_10cm_unshield

### Specific for RPO 814.501

#### Mini Code (process_nuclide) to split a nuclide label, e.g. "Cl-34m" into '34', "Cl_isomer"

In [33]:
# Function to split a nuclide label into mass number and element, tagging isomers
def process_nuclide(nuclide):
    """Split a nuclide label such as ``"Cl-34m"`` into ``(number, letter)``.

    The label is split on ``'-'``; the first piece is taken as the element
    symbol and the second as the mass-number part.  A nuclide is treated as
    an isomer only when the mass-number part is digits followed by ``'m'``
    (``"99m"``, ``"178m2"``).  Looking for ``'m'`` anywhere in the label
    would misclassify every element whose symbol contains that letter
    (Am, Cm, Sm, Tm, ...).

    Args:
        nuclide: Label in ``Element-Mass`` form, e.g. ``"Co-60"`` or
            ``"Tc-99m"``.

    Returns:
        A ``(number, letter)`` tuple of strings.  ``number`` is the mass
        number without the isomer marker; ``letter`` is the element symbol,
        with ``"_isomer"`` appended for isomers.

    Raises:
        IndexError: If the label contains no hyphen.
    """
    parts = nuclide.split('-')
    element, mass = parts[0], parts[1]

    # Only the mass part decides whether this is a metastable state.
    isomer = re.match(r'(\d+)m', mass)
    if isomer:
        return isomer.group(1), element + "_isomer"
    return mass, element

#### Mini Code (combine_scientific_notation)

In [34]:
# Function to merge a mantissa token with a following exponent token
def combine_scientific_notation(columns):
    """Join split scientific-notation tokens back into single strings.

    Walks through ``columns`` from left to right.  Whenever a plain decimal
    token (``"1.5"``) is immediately followed by an exponent token
    (``"E-03"``), the pair is replaced by one string in Python float syntax
    (``"1.5e-03"``).  All other tokens are copied unchanged.

    Args:
        columns: Sequence of string tokens (may be empty).

    Returns:
        A new list of strings; an empty list for empty input.
    """
    combined = []
    total = len(columns)
    pos = 0
    while pos < total:
        token = columns[pos]
        has_next = pos + 1 < total
        if (has_next
                and re.match(r'^\d+(\.\d+)?$', token)
                and re.match(r'^E[+-]\d+$', columns[pos + 1])):
            # Drop the leading "E" of the exponent token and glue with "e".
            combined.append(f"{token}e{columns[pos + 1][1:]}")
            pos += 2  # the exponent token has been consumed as well
        else:
            combined.append(token)
            pos += 1
    return combined

#### Here is the code for the RPO 814.501 table "Data for Operational Radiation Protection"
* Only extracts e_ing and converts its unit to uSv/GBq

In [54]:
# Rows parsed from the PDF: [mass number, element label, *remaining columns]
table_data = []
# A data row starts with a nuclide label such as "Co-60"
nuclide_pattern = re.compile(r'^[A-Za-z]+-\d+')
with pdfplumber.open('/Users/yiyizhu/Packages/DoesCal/example/DoesPar/814.501.2014.pdf') as pdf:
    # Zero-based page indices 64..93 (presumably the pages holding the table - confirm)
    for page_num in range(64, 94):
        page = pdf.pages[page_num]
        # Depending on the pdfplumber version, a page without a text layer
        # may yield None; treat it as an empty page instead of crashing.
        text = page.extract_text() or ""

        for line in text.split('\n'):
            # Whitespace-separated tokens form the table columns
            columns = line.split()

            # Skip anything that does not start with a nuclide label
            if not columns or not nuclide_pattern.match(columns[0]):
                continue
            # Basic check to ensure there are enough columns
            if len(columns) <= 5:
                continue

            nuclide = columns[0]
            if '/' in nuclide:
                # Several nuclides share one row ("A/B"): emit one row per
                # nuclide, each carrying the same column values.
                for isotope in nuclide.split('/'):
                    number, letter = process_nuclide(isotope)
                    table_data.append(
                        [number, letter] + combine_scientific_notation(columns[1:10])
                    )
                continue

            number, letter = process_nuclide(nuclide)
            if columns[1].isalpha():
                # A purely alphabetic second token is appended to the label
                # and excluded from the data columns.
                letter += f"_{columns[1]}"
                remaining = columns[2:11]
            else:
                remaining = columns[1:11]
            table_data.append([number, letter] + combine_scientific_notation(remaining))

# Header rows of the output table; filtered data rows are appended below.
# NOTE(review): the one-element title row is never written to the output
# file, because format_row() returns "" for rows with fewer than 3 items.
final_data = [["# From RBO 814.501.2014"],
              ["#", "unit:", "uSv/h/GBq"],
              ["#A", "isotope", "e_ing"]]

for row in table_data:
    # Locate the LAST element whose repr() starts with a \u escape (example
    # '\uf062\uf02d\uf020'); -1 means the row contains no such marker.
    marker_positions = [i for i, val in enumerate(row)
                        if repr(val).startswith("'\\u")]
    unicode_index = marker_positions[-1] if marker_positions else -1

    # The value of interest is the second element after the marker
    if unicode_index == -1 or unicode_index + 2 >= len(row):
        continue
    raw_value = row[unicode_index + 2]

    try:
        # A leading "<" is tolerated for the numeric comparison only
        numeric_value = float(raw_value.replace('<', ''))
    except ValueError:
        # Not a number: skip this row
        continue

    # Only keep rows whose value is below 1e-3
    if numeric_value < 1e-3:
        # NOTE(review): the token is stored exactly as read from the PDF
        # (including any "<").  The previous code computed value/1e3 as a
        # "Sv/Bq to uSv/GBq" conversion but never used the result, so no
        # conversion has ever been applied; the header above nevertheless
        # announces "uSv/h/GBq".  Confirm the intended unit and factor
        # before adding a conversion here.
        final_data.append([row[0], row[1], raw_value])


# Render one table row as a fixed-width text line
def format_row(row):
    """Format the first three items of ``row`` as aligned columns.

    The first item is left-aligned in 5 characters, the second left-aligned
    in 10 and the third right-aligned in 15, separated by single spaces.
    Rows with fewer than three items produce an empty string.
    """
    if len(row) < 3:
        return ""  # not enough elements to fill the three columns
    first, second, third = row[0], row[1], row[2]
    return "{:<5} {:<10} {:>15}".format(first, second, third)

# Dump final_data as fixed-width text, one formatted row per line.
# Rows for which format_row() returns an empty string are left out.
output_file_path = "example/DoesPar/PRO_814_501.txt"
with open(output_file_path, "w") as output_file:
    output_file.writelines(
        text_line + "\n"
        for text_line in map(format_row, final_data)
        if text_line
    )