In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys
import matplotlib.pyplot as plt
import seaborn as sns
import os
from mendeleev import element
import numpy as np

#### the fragments in the linker unit calculated using RDKit descriptors

In [3]:
def compute_fr_descriptors(df, save_path):
    """
    Compute every RDKit descriptor whose name starts with `fr_` for the linker
    SMILES of each row and save the result as a CSV file.

    For each row, 'smiles1' and 'smiles2' are joined with '.' (missing or empty
    entries are skipped) and the combined string is parsed with RDKit. Rows for
    which `Chem.MolFromSmiles` returns None get empty values for all descriptors.
    The input DataFrame is left unmodified.

    Parameters:
        df: pd.DataFrame, must include columns 'mof', 'smiles1', 'smiles2'
        save_path: str, the path to save the output file

    Returns:
        None (saves the results to a CSV file with columns 'mof', fr_*)
    """
    # Combine SMILES strings row by row without adding a helper column to the
    # caller's frame (zip also copes with an empty frame, unlike apply(axis=1)).
    combined_smiles = [
        '.'.join(smiles for smiles in pair if pd.notna(smiles) and smiles != '')
        for pair in zip(df['smiles1'], df['smiles2'])
    ]

    # Get all fr_ descriptors as (name, function) pairs
    fr_functions = [
        (desc_name, desc_func)
        for desc_name, desc_func in Descriptors.descList
        if desc_name.startswith("fr_")
    ]
    fr_names = [desc_name for desc_name, _ in fr_functions]

    # Function to calculate fr_ descriptors for one combined SMILES string
    def calculate_fr_descriptors(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: keep the row but leave every descriptor empty
            return {desc_name: None for desc_name in fr_names}
        return {desc_name: desc_func(mol) for desc_name, desc_func in fr_functions}

    # Batch calculation; passing `columns` keeps the header stable even for empty input
    fr_df = pd.DataFrame(
        [calculate_fr_descriptors(smiles) for smiles in combined_smiles],
        columns=fr_names,
    )

    # Attach the MOF identifiers positionally. Concatenating on the index would
    # misalign rows whenever `df` does not carry a default 0..n-1 index
    # (e.g. after filtering), producing NaN-padded extra rows.
    fr_df.insert(0, 'mof', df['mof'].to_numpy())
    fr_df.to_csv(save_path, index=False)
    print(f"Descriptor file saved to: {save_path}")

In [None]:
# Load the linker table; compute_fr_descriptors reads the columns 'mof', 'smiles1' and 'smiles2'.
df = pd.read_csv("your_input.csv")  # placeholder path: replace with your own CSV file
compute_fr_descriptors(df, "fr_descriptors.csv")

#### the metal in the secondary building unit

In [2]:
def extract_metal_properties(df, save_path, metal_column='All_Metals'):
    """
    Extracts metal properties (e.g., atomic radius, electronegativity) for the metal
    elements of each MOF and saves the per-MOF averages to a CSV file.

    For every row, each listed element is looked up with mendeleev and the following
    values are averaged over the elements (NaN entries are ignored):
        atomic_radius      - elem.atomic_radius / 100 (presumably pm -> Angstrom; confirm units)
        ionization_energy  - first ionization energy, elem.ionenergies[1]
        electron_affinity  - elem.electron_affinity
        electronegativity  - elem.en_pauling
        atomic_weight      - elem.atomic_weight
        oxistates          - last entry of elem.oxistates
    Elements that cannot be looked up are reported with a printed message and
    contribute NaN. A row with no usable element gets NaN for every property.

    Parameters:
        df: DataFrame containing a 'mof' column and the metal element information
        save_path: Path to save the CSV file
        metal_column: Column name containing metal elements (a comma-separated string
            or a list/tuple of symbols), defaults to 'All_Metals'

    Returns:
        None (saves the results to a CSV file with columns 'mof' + the six properties)
    """
    # Output columns, in the order they are written to the CSV
    property_names = ['atomic_radius', 'ionization_energy', 'electron_affinity',
                      'electronegativity', 'atomic_weight', 'oxistates']
    all_missing = (np.nan,) * len(property_names)

    def to_float(value):
        # Only a missing value becomes NaN; a legitimate 0 is kept as 0.0
        return np.nan if value is None else float(value)

    def lookup_element(symbol):
        # Returns the property tuple (same order as property_names) for one symbol
        try:
            elem = element(symbol)
            oxidation_states = elem.oxistates
            return (
                to_float(elem.atomic_radius) / 100,
                elem.ionenergies.get(1, np.nan),
                to_float(elem.electron_affinity),
                to_float(elem.en_pauling),
                to_float(elem.atomic_weight),
                oxidation_states[-1] if oxidation_states else np.nan,
            )
        except Exception as e:
            # Best effort: an unknown symbol must not abort the whole table
            print(f"Error processing element {symbol}: {e}")
            return all_missing

    def parse_metals(raw_value):
        # Accept list-like cells as well as comma-separated strings; missing cells give []
        if isinstance(raw_value, (list, tuple, set, np.ndarray)):
            tokens = [str(item) for item in raw_value]
        elif pd.isna(raw_value):
            tokens = []
        else:
            tokens = str(raw_value).split(',')
        # Strip whitespace so "Cu, Zn" resolves both symbols; drop empty tokens
        return [token.strip() for token in tokens if token.strip()]

    def nan_mean(values):
        # Mean over non-NaN entries; NaN (without a RuntimeWarning) when none are left
        arr = np.asarray(values, dtype=float)
        valid = arr[~np.isnan(arr)]
        return valid.mean() if valid.size else np.nan

    # Each distinct symbol is looked up once and reused across rows
    element_cache = {}

    # Extract metal properties and assemble into DataFrame
    result_rows = []
    for mof_name, raw_metals in zip(df['mof'], df[metal_column]):
        per_element = []
        for symbol in parse_metals(raw_metals):
            if symbol not in element_cache:
                element_cache[symbol] = lookup_element(symbol)
            per_element.append(element_cache[symbol])

        # Transpose [(props of metal 1), (props of metal 2), ...] into one sequence per property
        per_property = list(zip(*per_element)) if per_element else [()] * len(property_names)
        result_row = {'mof': mof_name}
        for name, values in zip(property_names, per_property):
            result_row[name] = nan_mean(values)
        result_rows.append(result_row)

    # Explicit columns keep the header intact even when df has no rows
    result_df = pd.DataFrame(result_rows, columns=['mof', *property_names])
    result_df.to_csv(save_path, index=False)
    print(f"Metal descriptor file saved to: {save_path}")

In [None]:
# The input CSV needs 'mof', 'metal1' and 'metal2' columns; 'All_Metals' is derived from them here.
df = pd.read_csv("your_input_file.csv")  # placeholder path: replace with your own CSV file
# Build a comma-separated metal string: metal1, plus ",metal2" only when metal2 is present.
# NOTE(review): a missing metal1 makes the concatenation NaN for that row - confirm metal1 is always set.
df['All_Metals'] = df['metal1'] + df['metal2'].fillna('').apply(lambda x: '' if x == '' else ',' + x)
extract_metal_properties(df, "metal_properties.csv", metal_column='All_Metals')

#### structural characteristics of the MOFs

Calculated using zeo++ (column names exactly as they appear in the input file):
- 'LCD', 'PLD', 'desity(g/cm^3)', 'VSA(m^2/cm^3)', 'GSA(m^2/g)', 'Vp(cm^3/g)', 'void_fraction'

In [None]:
# Columns to keep: the 'mof' key followed by the zeo++ geometric descriptors
columns_to_extract = [
    'mof',
    'LCD',
    'PLD',
    'desity(g/cm^3)',
    'VSA(m^2/cm^3)',
    'GSA(m^2/g)',
    'Vp(cm^3/g)',
    'void_fraction',
]

# Location of the CSV holding the geometric properties calculated by zeo++
file_path = 'input_file.csv'

# Load the full table, then slice out only the requested columns
df = pd.read_csv(file_path)
selected_df = df.loc[:, columns_to_extract]

# Write the reduced table to a new file, without the index column
selected_df.to_csv('global.csv', index=False)

#### other physical conditions

- gastype (encoded as the molecular weight of the gas)
- temperature

In [16]:
# Lookup table: chemical formula of each supported gas -> its molecular weight
# (the values correspond to molar masses in g/mol). The weight is used as the
# numeric encoding of the gas type. Keys are case-sensitive and some are
# mixed-case ('Ar', 'Xe', 'Kr', 'Ne').
gas_weights = {
    'CH4': 16.04,
    'CO2': 44.01,
    'C2H6': 30.07,
    'C2H4': 28.05,
    'C3H8': 44.10,
    'C3H6': 42.08,
    'N2': 28.01,
    'O2': 32.00,
    'H2': 2.02,
    'Ar': 39.95,
    'SF6': 146.06,
    'Xe': 131.29,
    'Kr': 83.80,
    'Ne': 20.18,
    'C4H10': 58.12,
}

In [15]:
# Load the experimental conditions; the columns 'mof', 'gastype' and 'temperature' are used below.
df = pd.read_csv('input.csv')  # Contains gas type and temperature

# Replace each gas name by its molecular weight. The column values are stripped and
# upper-cased before the lookup, so the lookup keys must be upper-cased as well:
# otherwise mixed-case keys such as 'Ar', 'Xe', 'Kr' and 'Ne' can never match and
# those gases silently become NaN. Unknown or missing gases still map to NaN.
df['gastype'] = df['gastype'].str.strip().str.upper().map(
    {gas_name.upper(): weight for gas_name, weight in gas_weights.items()}
)
df = df[['mof','gastype','temperature']]
df.to_csv('label.csv', index=False)

### Data Processing Pipeline

We have computed the following descriptor sets separately:
1. **Organic ligand descriptors** - Molecular features of the linker components
2. **Metal descriptors** - Physical/chemical properties of metal nodes
3. **Structural descriptors** - Geometric properties calculated by zeo++
4. **Experimental parameters** - Gas species and temperature conditions

#### File Requirements:
- All output files must contain a `mof` column (case-sensitive) as the merge key
- Files should be saved in CSV format for consistency


#### Important Notes:
⚠️ **Data Quality Check**:
- Metal descriptors may contain null values (particularly for uncommon elements)
- Recommended approaches for handling missing data:
  - Manual curation using periodic table references