### $\underline{\textbf{Orbitals data extraction}}$:
#### $\textbf{Part A}$: Extracting text from images (RHF_tables $\rightarrow$ orbitals_data.txt)

In [1]:
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import re

def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The file at ``image_path`` is opened, converted to mode ``'L'``
    (grayscale), contrast-enhanced by a factor of 2.0 and finally passed
    through PIL's ``SMOOTH`` filter.

    Returns the filtered image object.
    """
    grayscale = Image.open(image_path).convert('L')
    boosted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    return boosted.filter(ImageFilter.SMOOTH)

def postprocess_text(text):
    """Apply a fixed sequence of regex substitutions for known OCR misreads.

    The rules run in the order listed, so a later rule sees the output of
    an earlier one (for example a standalone ``is(2)`` is first rewritten
    to ``1s(2)`` and then upper-cased to ``1S(2)`` by the ``1s`` rule).

    Returns the corrected text with leading/trailing whitespace removed.
    """
    # (pattern, replacement) pairs; order matters, see docstring.
    ocr_fixes = (
        (r'2=', 'Z='),
        (r'is\(2\)', '1s(2)'),
        (r'RHOatO', 'RHOat0'),
        (r'\b38\b', '3S'),
        (r'\b3s\b', '3S'),
        (r'\b1s\b', '1S'),
        (r'\b2s\b', '2S'),
    )
    cleaned = text
    for wrong, right in ocr_fixes:
        cleaned = re.sub(wrong, right, cleaned)
    return cleaned.strip()

def extract_data_from_image(image_path):
    """Run OCR on one image file and return the corrected text.

    The image is prepared by ``preprocess_image``, read by Tesseract with
    the options ``--oem 3 --psm 6``, and the raw OCR output is cleaned by
    ``postprocess_text`` before being returned.
    """
    raw_text = pytesseract.image_to_string(
        preprocess_image(image_path),
        config=r'--oem 3 --psm 6',
    )
    return postprocess_text(raw_text)

# Directory holding the scanned RHF tables (one PNG per element).
# Kept in a single constant so the location only has to be changed here.
RHF_TABLES_DIR = "/home/konsster/Desktop/AUThPhysDepart/Computational_Quantum_Physics_and_Applications/Project_1/data/RHF_tables"

# Elements to process, in output order (add more as needed).
ELEMENT_SYMBOLS = ["He", "Li", "Be", "B", "C", "N", "O", "F", "Ne"]

# Full path of every image, derived from the directory and the symbols.
image_files = [f"{RHF_TABLES_DIR}/{symbol}.png" for symbol in ELEMENT_SYMBOLS]

# Process all images and combine results.
# Each section starts with a "----- <ELEMENT> DATA -----" marker line, which
# is what the text-to-CSV step later looks for to split the file per element.
all_data = []
for element_name, img_file in zip(ELEMENT_SYMBOLS, image_files):
    extracted_text = extract_data_from_image(img_file)
    all_data.append(f"\n\n----- {element_name.upper()} DATA -----\n{extracted_text}")

# Save to file
output_file = "orbitals_data.txt"
with open(output_file, 'w') as f:
    f.write("\n".join(all_data))

print(f"Data from {len(image_files)} images saved to {output_file}")

Data from 9 images saved to orbitals_data.txt


#### $\textbf{Part B}$: Converting text to data (orbitals_data.txt $\rightarrow$ orbitals_data.csv)

In [2]:
import re
from collections import defaultdict

def process_element_section(element_name, content):
    """Parse the OCR text of one element into ordered orbital rows.

    Parameters:
        element_name: accepted for the caller's convenience; the body does
            not use it.
        content: the element's text block, lines separated by ``'\\n'``.

    Returns:
        A list of ``(orbital, row)`` tuples where ``orbital`` is the
        lower-cased label and ``row`` is a dict with the string-valued keys
        ``'z_coeff'``, ``'1s'``, ``'2s'`` and ``'2p'``. Rows are grouped in
        the order 1s, 3s, 2s, 2p; within a group they keep the order in
        which they appeared. Rows collected under any other label are not
        returned.
    """
    rows_by_orbital = defaultdict(list)

    for raw_line in content.split('\n'):
        # Column-header lines such as "1S 2S 2P" carry no data.
        if re.match(r'^\d+[SsPp]\s+\d+[SsPp]', raw_line):
            continue

        # Drop OCR artefacts ('<', '~', '+-') before tokenising.
        tokens = re.split(
            r'\s+',
            raw_line.replace('<', ' ').replace('~', ' ').replace('+-', '-').strip(),
        )

        # A data line is "<label> <z> <coeff> [...]" with an orbital label first.
        if len(tokens) < 3 or not re.match(r'^\d+[SsPp]$', tokens[0]):
            continue

        label = tokens[0].lower()
        first = tokens[2]
        second = tokens[3] if len(tokens) >= 4 else ""

        row = {'z_coeff': tokens[1], '1s': '', '2s': '', '2p': ''}
        if label in ('1s', '3s'):
            row['1s'], row['2s'] = first, second
        elif label == '2s':
            if second:
                row['1s'], row['2s'] = first, second
            else:
                # A lone coefficient on a 2s line is stored in the 2s column.
                row['2s'] = first
        elif label == '2p':
            row['2p'] = first
        rows_by_orbital[label].append(row)

        # A p-type entry may follow on the same line, starting at token 5.
        if len(tokens) >= 6 and re.match(r'^\d+[Pp]$', tokens[4]):
            rows_by_orbital[tokens[4].lower()].append({
                'z_coeff': tokens[5],
                '1s': '',
                '2s': '',
                '2p': tokens[6] if len(tokens) >= 7 else "",
            })

    # Emit the groups in the fixed output order.
    return [
        (label, row)
        for label in ('1s', '3s', '2s', '2p')
        if label in rows_by_orbital
        for row in rows_by_orbital[label]
    ]


def _write_element_rows(outfile, element, section_lines):
    """Write one CSV row per parsed orbital entry of a finished element section.

    Does nothing when no element has been seen yet or the section has no
    lines. Rows are written in the order returned by
    ``process_element_section``.
    """
    if not (element and section_lines):
        return
    for orb, data in process_element_section(element, '\n'.join(section_lines)):
        outfile.write(
            f"{element},{orb},{data['z_coeff']},{data['1s']},{data['2s']},{data['2p']}\n"
        )


# Open the input and output files
with open('orbitals_data.txt', 'r') as infile, open('orbitals_data.csv', 'w') as outfile:
    # Write CSV header
    outfile.write("Element,orbitals,z_coeff,coeff(1s),coeff(2s),coeff(2p)\n")

    current_element = None
    element_content = []

    for line in infile:
        if line.startswith('----- '):
            # A new "----- XX DATA -----" marker closes the previous section.
            _write_element_rows(outfile, current_element, element_content)

            # Second space-separated token is the element symbol ("HE" -> "He").
            current_element = line.split(' ')[1].strip().capitalize()
            element_content = []
        elif current_element:
            # Lines before the first marker are ignored.
            element_content.append(line.strip())

    # Process last element (no marker follows it)
    _write_element_rows(outfile, current_element, element_content)

print("Data successfully saved to orbitals_data.csv")


Data successfully saved to orbitals_data.csv


### $\underline{\textbf{Entropy data extraction}}$:

In [4]:
import tabula
import pandas as pd
import sys
import os

# Input PDF, the page holding the entropy table, and the CSV to produce.
pdf_path = "/home/konsster/Desktop/AUThPhysDepart/Computational_Quantum_Physics_and_Applications/Project_1/data/information_entropy.pdf"
page_num = 9
output_csv = "entropy_data.csv"

# Silence stderr only while tabula runs, to suppress the Java warnings.
# The stream that was active beforehand is put back afterwards, even if
# read_pdf raises. Assigning sys.__stderr__ instead would replace whatever
# stream the notebook kernel had installed, and the devnull handle would
# never be closed.
previous_stderr = sys.stderr
with open(os.devnull, 'w') as devnull:
    sys.stderr = devnull
    try:
        tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True, lattice=False)
    finally:
        sys.stderr = previous_stderr

if tables:
    df = tables[0]
    df.columns = ["Z", "Element", "Sr", "Sk", "S", "Smax", "Omega", "O"]

    numeric_cols = ["Sr", "Sk", "S", "Smax", "Omega", "O"]
    for col in numeric_cols:
        # Any column that was not parsed as numeric is treated as text:
        # strip embedded spaces and convert to float. Checking "not numeric"
        # (rather than dtype == object) also covers dedicated string dtypes.
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(str).str.replace(" ", "").astype(float)

    df["Z"] = df["Z"].astype(int)
    df["Element"] = df["Element"].astype(str)

    df.to_csv(output_csv, index=False)
    print(f"Table successfully saved to {output_csv}")
    print("\nData preview:")
    print(df.head())

else:
    print("No tables found on the specified page.")

Table successfully saved to entropy_data.csv

Data preview:
   Z Element       Sr       Sk        S     Smax    Omega          O
0  1       H  4.14473  2.42186  6.56659   7.9054  0.21113  120.26700
1  2      He  2.69851  3.91342  6.61193   7.0493  0.06204  100.36100
2  3      Li  3.07144  3.99682  7.69826  10.3578  0.25677    9.15713
3  4      Be  3.62386  4.19019  7.81405  10.3950  0.24829    8.45434
4  5       B  3.40545  4.70590  8.11135  10.3738  0.21810   15.96530
