## Half-Life Collection
---
### Read half-life information from NNDC:
1. Go to Nuclear Levels and Gammas Search;
2. Specify the range of Z, A, or N to search (I usually set the half-life range to > 1 sec);
3. Save the result page as PDF (Right click -> choose print -> Save as PDF);

* File and directory paths must not contain any spaces

In [7]:
import pdfplumber
import re
import os
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, PatternFill, Border, Alignment

### Convert PDF to txt

In [14]:
def Convert_PDF_to_TXT(pdf_path, flag_overwrite = False):
    """Extract the level table lines of a saved NNDC result PDF into a TXT file.

    The TXT file is written next to the PDF, with the same name and a
    ``.txt`` extension. Only lines located between a
    "Nucleus E level(keV) Jπ T 1/2" header and the following
    "Gamma Information" marker are kept; print header/footer lines and
    lines containing "≥", ">" or "eV" are dropped.

    Args:
        pdf_path: Path of the PDF file to read.
        flag_overwrite: When False (default) an already existing TXT file is
            left untouched and the function returns immediately. When True
            the TXT file is regenerated.

    Returns:
        None. The result is the TXT file written to disk.
    """
    # Swap only the final extension; str.replace would touch every ".pdf"
    # occurrence in the path and leave the path unchanged for e.g. ".PDF".
    txt_path = os.path.splitext(pdf_path)[0] + '.txt'

    # Keep an existing TXT file unless the caller explicitly asks to overwrite it
    if os.path.exists(txt_path) and not flag_overwrite:
        return

    text_to_write = []
    # Becomes True once "Nucleus E level(keV) Jπ T 1/2" is seen, i.e. where
    # the table to extract starts; the state is carried across pages.
    capture_text = False
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a page without extractable text (None or "")
            text = page.extract_text() or ""
            for line in text.split('\n'):
                # Skip the print header (timestamp with AM/PM) and footer (URL)
                if "AM" in line or "PM" in line:
                    continue
                if "https://www.nndc.bnl.gov" in line:
                    continue
                # Start capturing after the table header line
                if "Nucleus E level(keV) Jπ T 1/2" in line:
                    capture_text = True
                    continue
                # Stop capturing at the gamma section
                if "Gamma Information" in line:
                    capture_text = False
                    continue
                # Drop lines holding limits or eV-based values
                if "≥" in line or ">" in line:
                    continue
                if "eV" in line:
                    continue
                if capture_text:
                    text_to_write.append(line)

    # Write the captured lines to the TXT file named after the PDF
    with open(txt_path, "w") as output_file:
        output_file.write("\n".join(text_to_write))

### Extract A, nucleus_name, t1/2_value and t1/2_unit into an array

In [19]:
# ============= Function to parse the t_half value into t_value + t_unit ==============#
# the input "t_half" should contain exactly one non-digit part (the unit)
def parse_t_half(t_half):
    """Split a half-life string into its numeric value and its unit.

    Args:
        t_half: Text such as "12.32 y 2", "4.468E+9 y" or "STABLE".

    Returns:
        A ``(t_value, t_unit)`` tuple. "STABLE" (any case) yields
        ``(1.4e10, "y")``, i.e. the age of the universe. When no
        "number + letters" pair is found the result is ``(-1, None)``.
    """
    t_half = t_half.strip()
    # if t1/2 = stable, return the age of the universe
    if t_half.upper() == "STABLE":
        return 1.4e10, "y"
    # First number (optionally with an exponent written as "e" or "E")
    # followed by a run of letters; anything after that pair is ignored.
    match = re.search(r'(\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*([a-zA-Z]+)', t_half)
    if match is None:
        return -1, None
    return float(match.group(1)), match.group(2)

# ============= Function to extract t1/2 from txt ==============#
# extracted_data = [A,"nucleus_name", t1/2, "unit"]
def Extract_HalfLife(txt_path):
    """Read half-life entries from a TXT file.

    The first line of the file is skipped as a header. For every other
    non-empty line the first column is split into its digits (A) and its
    letters (nucleus name), and everything from the fourth column on is
    passed to ``parse_t_half``.

    Args:
        txt_path: Path of the TXT file to read.

    Returns:
        A list of ``(A, nucleus_name, t_value, t_unit)`` tuples, with ``A``
        as a float. Lines whose parsed half-life value is not positive, or
        whose first column contains no digits, are left out.
    """
    extracted_data = []
    with open(txt_path, "r") as input_file:
        # Skip the first line (header); the default keeps an empty file
        # from raising StopIteration.
        next(input_file, None)
        for line in input_file:
            columns = line.split()
            # Skip any empty lines
            if not columns:
                continue
            # Separate the nucleus into number and letters, like "1H" -> "1", "H"
            nucleus_full = columns[0]
            nucleus_number = ''.join(char for char in nucleus_full if char.isdigit())
            nucleus_letters = ''.join(char for char in nucleus_full if char.isalpha())
            # Without digits there is no mass number to convert below
            if not nucleus_number:
                continue
            # Extract and parse T 1/2
            value, unit = parse_t_half(' '.join(columns[3:]))
            if value > 0:
                extracted_data.append((float(nucleus_number), nucleus_letters, value, unit))
    return extracted_data

### Fill t1/2 info to Excel

In [26]:
#============= convert all half-lives to seconds =============#
def HalfLife_Unit_Factor(unit):
    """Return the number of seconds in one ``unit`` of half-life.

    Args:
        unit: Unit string: "fs", "ps", "ns", "us", "ms", "s", "m" (minute),
            "h", "d" or "y" (taken as 365 days).

    Returns:
        The multiplication factor to seconds, or 0 for an unknown unit
        (including None).
    """
    seconds_per_unit = {
        # Sub-second units, so short half-lives do not collapse to 0 s
        "fs": 1e-15,
        "ps": 1e-12,
        "ns": 1e-9,
        "us": 1e-6,
        "ms": 1e-3,
        "s": 1,
        "m": 60,
        "h": 60*60,
        "d": 24*60*60,
        "y": 365*24*60*60,
    }
    return seconds_per_unit.get(unit, 0)

#============= Fill Half-Life to one sheet in Excel =============#
def Fill_HalfLife_Sheet(excel_path, sheet_name):
    """Write half-life data into one sheet of an Excel workbook.

    The sheet is loaded into a DataFrame, the columns "t1/2", "unit" and
    "t1/2(s)" are created when missing, and each entry of ``extracted_data``
    is matched against the first row with the same "A" and (case-insensitive)
    "isotope". An unfilled match (t1/2 < 0) is filled in place; an already
    filled match gets a copy inserted right below it, labelled "<isotope>
    isomer". The sheet is then replaced in the workbook and every column is
    set to width 16 with right-aligned cells.

    NOTE: ``extracted_data`` is not a parameter; it is looked up as a
    module-level name when the function runs (presumably the list returned
    by Extract_HalfLife -- confirm against the calling cell).

    Args:
        excel_path: Path of the workbook to update in place.
        sheet_name: Name of the sheet to fill.
    """
    df = pd.read_excel(excel_path, sheet_name=None)[sheet_name]  # all sheets are loaded, one is used

    # Make sure the three output columns exist, with "not filled" defaults
    for column_name, default in (("t1/2", -1.0), ("unit", ""), ("t1/2(s)", -1.0)):
        if column_name not in df.columns:
            df[column_name] = default

    for record in extracted_data:
        mass_number = record[0]     # A
        symbol = record[1]          # "Isotope"
        half_life = record[2]       # t1/2
        half_life_unit = record[3]  # "unit"

        for row in range(len(df)):
            same_nuclide = (
                df.at[row, "A"] == mass_number
                and df.at[row, "isotope"].lower() == symbol.lower()
            )
            if not same_nuclide:
                continue

            half_life_seconds = half_life * HalfLife_Unit_Factor(half_life_unit)
            if df.at[row, "t1/2"] < 0:
                # First hit on a still-empty row: fill it in place
                df.at[row, "t1/2"] = half_life
                df.at[row, "unit"] = half_life_unit
                df.at[row, "t1/2(s)"] = half_life_seconds
            else:
                # Row already carries a half-life: add a copy right after it
                duplicate = df.loc[row].copy()
                duplicate["t1/2"] = half_life
                duplicate["unit"] = half_life_unit
                duplicate["t1/2(s)"] = half_life_seconds
                duplicate["isotope"] = f"{duplicate['isotope']} isomer"
                pieces = [df.iloc[:row + 1], pd.DataFrame([duplicate]), df.iloc[row + 1:]]
                df = pd.concat(pieces).reset_index(drop=True)
            # Only the first matching row is considered for each entry
            break

    # Replace the sheet inside the existing workbook
    with pd.ExcelWriter(excel_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

    # Reopen the workbook to apply column width and cell alignment
    workbook = load_workbook(excel_path)
    worksheet = workbook[sheet_name]
    for column_cells in worksheet.columns:
        letter = column_cells[0].column_letter
        worksheet.column_dimensions[letter].width = 16
        for cell in column_cells:
            cell.alignment = Alignment(horizontal='right')
    workbook.save(excel_path)

#========================== Fill Half-Life to all sheets ("MeV") in Excel =============================#
def Fill_HalfLife_Excel(excel_path):
    """Run Fill_HalfLife_Sheet on every sheet whose name contains "MeV".

    Args:
        excel_path: Path of the workbook; it is opened once to list the
            sheet names and then handed to Fill_HalfLife_Sheet per sheet.
    """
    # Collect the target sheet names up front, then process them in order
    mev_sheets = [name for name in load_workbook(excel_path).sheetnames if "MeV" in name]
    for name in mev_sheets:
        Fill_HalfLife_Sheet(excel_path, name)

In [27]:
# Fill half-lives into every sheet of the example workbook whose name contains "MeV".
# NOTE: absolute, machine-specific path -- adjust it before running elsewhere.
Fill_HalfLife_Excel("/Users/yiyizhu/Packages/DoesCal/example/DoesCal.xlsx")