In [5]:
import pandas as pd
import os

# Element symbols to keep. The atom table is filtered on its 'symbol' column
# and the pair table on its 'element1'/'element2' columns against this list.
DESIRED_ATOMS = ['H', 'C', 'N', 'O']  # Modify according to actual requirements

# Input locations.
# NOTE(review): DATA_DIR is not referenced anywhere in the visible code, and
# the two CSV paths below point at '../data' rather than 'data' -- confirm
# whether DATA_DIR is still needed or was meant to be the base of these paths.
DATA_DIR = 'data'
ATOM_FILE = '../data/atomData.csv'
PAIR_FILE = '../data/atomPairData.csv'

# ====================== Atom Data Extraction ======================
# Load the full atom table; the CSV on disk is only read, never written.
atom_df = pd.read_csv(ATOM_FILE)

# Keep only the rows whose 'symbol' is one of the desired elements. The
# .copy() gives an independent frame instead of a slice of atom_df.
filtered_atoms = atom_df[atom_df['symbol'].isin(DESIRED_ATOMS)].copy()

# Desired symbols that matched no row. Without this check the summary below
# lists every desired element as if it had been retained.
found_symbols = set(filtered_atoms['symbol'])
missing_atoms = [symbol for symbol in DESIRED_ATOMS if symbol not in found_symbols]

# Display atom data summary
print("="*50)
print(f"Atom data extraction complete: Found {len(atom_df)} elements in total")
print(f"Retained {len(filtered_atoms)} desired elements: {', '.join(DESIRED_ATOMS)}")
if missing_atoms:
    print(f"Warning: no atom data found for: {', '.join(missing_atoms)}")
print("="*50)

# ====================== Atom Pair Data Extraction ======================
# Load the atom-pair table into memory; the CSV on disk is only read here.
# Later statements read the 'element1', 'element2', 'A', 'rho' and 'C'
# columns of this frame.
pair_df = pd.read_csv(PAIR_FILE)

# Create pair identifier (to handle order variability)
def create_pair_id(row):
    """Return an order-independent "X-Y" label for one pair row.

    The values stored under ``row['element1']`` and ``row['element2']`` are
    placed in ascending order before being joined with a dash, so a pair
    listed as (O, C) and one listed as (C, O) both map to ``"C-O"``.
    """
    first, second = sorted((row['element1'], row['element2']))
    return f"{first}-{second}"

# Add pair identifier column.
# Built with a plain comprehension rather than DataFrame.apply(axis=1):
# apply can hand back a DataFrame instead of a Series when pair_df has no
# rows, which makes the column assignment fail, and it also constructs a
# Series object for every row.
pair_df['pair_id'] = [
    create_pair_id({'element1': first, 'element2': second})
    for first, second in zip(pair_df['element1'], pair_df['element2'])
]

# Filter for desired atom pairs: both members must be desired elements
pair_condition = (
    pair_df['element1'].isin(DESIRED_ATOMS) &
    pair_df['element2'].isin(DESIRED_ATOMS)
)
# .copy() gives an independent frame instead of a slice of pair_df
filtered_pairs = pair_df[pair_condition].copy()

# Display atom pair data summary
print(f"\nAtom pair data extraction complete: Found {len(pair_df)} atom pairs in total")
print(f"Retained {len(filtered_pairs)} desired atom pairs")
print("="*50)

# List all found atom pairs (order-independent ids, so X-Y and Y-X collapse)
unique_pairs = filtered_pairs['pair_id'].unique()
print(f"Identified atom pair combinations: {', '.join(unique_pairs)}")
print("="*50)

# ====================== Save Extraction Results ======================
# Make sure the destination folder exists before anything is written to it
output_dir = 'extracted_parameters'
os.makedirs(output_dir, exist_ok=True)

# Write both filtered tables as CSV without the index column
atom_csv_name = 'atom_parameters.csv'
pair_csv_name = 'pair_parameters.csv'
filtered_atoms.to_csv(os.path.join(output_dir, atom_csv_name), index=False)
filtered_pairs.to_csv(os.path.join(output_dir, pair_csv_name), index=False)

# Assemble the report line by line; the empty first and last entries give the
# joined text its leading and trailing newline
rule = '=' * 50
report = "\n".join([
    "",
    rule,
    "Data Extraction Report",
    rule,
    f"Desired atoms: {', '.join(DESIRED_ATOMS)}",
    f"Atom parameters: {len(filtered_atoms)} elements",
    f"Atom pair parameters: {len(filtered_pairs)} pairs",
    f"Unique atom pair combinations: {len(unique_pairs)} types",
    "",
    f"Output directory: {output_dir}",
    f"  - Atom parameters: {atom_csv_name}",
    f"  - Atom pair parameters: {pair_csv_name}",
    rule,
    "",
])

print(report)

# Show the selected atom columns for every retained element
atom_columns = ['symbol', 'default_charge', 'default_A', 'default_rho', 'vdw_radius']
print("\nAtom parameters summary:")
print(filtered_atoms[atom_columns].to_string(index=False))

# Show at most the first ten retained pairs, or a notice when there are none
pair_columns = ['element1', 'element2', 'A', 'rho', 'C']
print("\nAtom pair parameters summary:")
if filtered_pairs.empty:
    print("No matching atom pairs found")
else:
    pair_preview = filtered_pairs[pair_columns].head(10)
    print(pair_preview.to_string(index=False))

Atom data extraction complete: Found 42 elements in total
Retained 4 desired elements: H, C, N, O

Atom pair data extraction complete: Found 50 atom pairs in total
Retained 10 desired atom pairs
Identified atom pair combinations: O-O, C-C, H-O, N-O, C-O, C-H, H-N, N-N, C-N, H-H

Data Extraction Report
Desired atoms: H, C, N, O
Atom parameters: 4 elements
Atom pair parameters: 10 pairs
Unique atom pair combinations: 10 types

Output directory: extracted_parameters
  - Atom parameters: atom_parameters.csv
  - Atom pair parameters: pair_parameters.csv


Atom parameters summary:
symbol  default_charge  default_A  default_rho  vdw_radius
     H               0      100.0        0.100        1.20
     C               0     3500.0        0.180        1.70
     N              -3      923.0        0.325        1.55
     O              -2    22764.0        0.149        1.52

Atom pair parameters summary:
element1 element2        A   rho     C
       O        O 22764.00 0.149 27.88
       C      