In [1]:
from pymatgen.core import Structure
from mendeleev import element
import itertools
import pandas as pd
from collections import Counter

class BondAnalyzer:
    """Detect bonds in a structure and summarise the shortest bonds per atom.

    Two sites are considered bonded when their distance does not exceed the
    sum of their bonding radii plus ``tolerance``.  Metals (see ``self.metals``)
    use the metallic radius, everything else the covalent radius.

    Attributes:
        struct: The structure being analysed (iterable of sites).
        atom_labels: One label per site, e.g. ``Li1``, ``Li2``, ``O1``.
        bonds_list: List of bond records (dicts) produced at construction.
    """

    # Column name used for bond lengths in every bond record / DataFrame.
    DISTANCE_COLUMN = "Distance (√Ö)"

    # mendeleev tabulates radii in picometres while the site distances are in
    # angstrom, so radii must be divided by this factor before comparing.
    PM_PER_ANGSTROM = 100.0

    def __init__(self, cif_file, structure, tolerance=0.4, n_shortest=4):
        """Load the structure and pre-compute atom labels and bonds.

        Parameters:
            cif_file (str | None): Path to a CIF file. Takes priority over
                ``structure`` when not None.
            structure: Structure object used when ``cif_file`` is None.
            tolerance (float): Extra distance (angstrom) allowed on top of the
                sum of the two bonding radii.
            n_shortest (int): Number of shortest bonds kept per atom in
                ``analyze``.

        Raises:
            ValueError: If the CIF file cannot be read, or if neither a CIF
                file nor a structure is supplied.
        """
        self.cif_file = cif_file
        if cif_file is not None:
            try:
                self.struct = Structure.from_file(cif_file)
            except Exception as e:
                # Without a structure nothing below can work, so fail here with
                # a clear message instead of a later AttributeError.
                raise ValueError(f"Reading CIF file failed for {cif_file}: {e}") from e
        elif structure is not None:
            self.struct = structure
        else:
            raise ValueError("You must provide either a CIF file or a pymatgen Structure object.")
        self.tolerance = tolerance
        self.n_shortest = n_shortest

        # Metals list
        self.metals = [
            "Li", "Be", "Na", "Mg", "K", "Ca", "Al", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
            "Ag", "Au", "Ga", "In", "Sn", "Tl", "Pb", "Bi",
            "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
            "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
        ]

        # Assign atom labels
        self.atom_labels = self._assign_atom_labels()
        self.bonds_list = self._generate_bonds()

    @staticmethod
    def _symbol_of(label):
        """Return the element part of a label ('Li12' -> 'Li', 'Li' -> 'Li')."""
        return str(label).rstrip("0123456789")

    def _assign_atom_labels(self):
        """Assign numbered labels like Ag1, Ag2, O1, etc. in site order."""
        element_counts = Counter()
        labels = []
        for site in self.struct:
            symbol = site.specie.symbol
            element_counts[symbol] += 1
            labels.append(f"{symbol}{element_counts[symbol]}")
        return labels

    def _bonding_radius(self, el, is_metal):
        """Return the bonding radius of ``el`` in angstrom, or None if unknown.

        Metals prefer the metallic radius; when that is not tabulated (or for
        non-metals) the covalent radius is used instead.
        """
        radius_pm = el.metallic_radius if is_metal else None
        if radius_pm is None:
            radius_pm = el.covalent_radius
        if radius_pm is None:
            return None
        return radius_pm / self.PM_PER_ANGSTROM

    @staticmethod
    def _classify_bond(el_i, el_j, both_metals):
        """Classify a bond from metal membership and Pauling electronegativity."""
        if both_metals:
            return "metallic"
        en_i = el_i.electronegativity("pauling")
        en_j = el_j.electronegativity("pauling")
        if en_i is None or en_j is None:
            return "unknown"
        delta_en = abs(en_i - en_j)
        if delta_en < 0.4:
            return "covalent"
        if delta_en < 1.7:
            return "polar covalent"
        return "ionic"

    def _generate_bonds(self):
        """Generate the list of bonds based on distance and bonding radii.

        Returns:
            list[dict]: One record per bonded pair with keys ``Atom 1``,
            ``Atom 2``, the distance column and ``Bond type``.
        """
        metals = frozenset(self.metals)
        symbols = [site.specie.symbol for site in self.struct]

        # Look every element up once instead of once per atom pair.
        elements = {}
        radii = {}
        for symbol in set(symbols):
            el = element(symbol)
            elements[symbol] = el
            radii[symbol] = self._bonding_radius(el, symbol in metals)

        bonds_list = []
        for (i, site_i), (j, site_j) in itertools.combinations(enumerate(self.struct), 2):
            sym_i, sym_j = symbols[i], symbols[j]
            r_i, r_j = radii[sym_i], radii[sym_j]
            if r_i is None or r_j is None:
                continue

            d = site_i.distance(site_j)
            if d > r_i + r_j + self.tolerance:
                continue

            bonds_list.append({
                "Atom 1": self.atom_labels[i],
                "Atom 2": self.atom_labels[j],
                self.DISTANCE_COLUMN: round(d, 3),
                "Bond type": self._classify_bond(
                    elements[sym_i], elements[sym_j], sym_i in metals and sym_j in metals
                ),
            })
        return bonds_list

    def analyze(self, target_species):
        """Analyze shortest bonds separately for each target species.

        A target without digits (``'Li'``) selects every atom of that element;
        a target with a trailing number (``'Li1'``) selects exactly that atom.
        For each selected atom the ``n_shortest`` shortest bonds are kept, and
        the result for each species is written to ``shortest_bonds_<species>.csv``.

        Parameters:
            target_species (iterable[str]): Element symbols and/or atom labels.

        Returns:
            final_dfs (dict): DataFrame of kept bonds for each species.
            avg_nn_bl_dict (dict): Average kept bond length for each species.
            neighbor_summary_all (dict): Neighbor atoms (other elements only) per atom.
            neighbor_counts_all (dict): Neighbor element counts per atom.
        """
        dist_col = self.DISTANCE_COLUMN
        df = pd.DataFrame(self.bonds_list)
        if df.empty:
            print("‚ö†Ô∏è No bonds found. Check tolerance or CIF structure.")
            return {}, {}, {}, {}

        # Consider atoms from BOTH columns: the last site of a structure can
        # only ever appear as "Atom 2".  Site order is kept for stable output.
        bonded = set(df["Atom 1"]) | set(df["Atom 2"])
        candidates = [label for label in self.atom_labels if label in bonded]

        final_dfs = {}
        avg_nn_bl_dict = {}
        neighbor_summary_all = {}
        neighbor_counts_all = {}

        for species in target_species:
            target = str(species)
            target_symbol = self._symbol_of(target)
            exact_label = target_symbol != target

            results = []
            neighbor_summary = {}
            for atom in candidates:
                # Compare whole symbols so that 'C' does not match 'Ca1' or 'Cl1'.
                matches = atom == target if exact_label else self._symbol_of(atom) == target
                if not matches:
                    continue

                atom_bonds = df[(df["Atom 1"] == atom) | (df["Atom 2"] == atom)]
                # Stable sort keeps ties in bond-list order, making head() deterministic.
                shortest = atom_bonds.sort_values(dist_col, kind="mergesort").head(self.n_shortest)
                results.append(shortest)

                partners = shortest["Atom 2"].where(shortest["Atom 1"] == atom, shortest["Atom 1"])
                neighbor_summary[atom] = [
                    other for other in partners if self._symbol_of(other) != target_symbol
                ]

            if not results:
                print(f"‚ö†Ô∏è No bonds found for species {species}.")
                continue

            # A bond between two selected atoms shows up twice; count it once.
            final_df = pd.concat(results).drop_duplicates().reset_index(drop=True)
            avg_nn_bl_dict[species] = final_df[dist_col].mean()

            neighbor_counts = {
                atom: dict(Counter(self._symbol_of(other) for other in neigh_list))
                for atom, neigh_list in neighbor_summary.items()
            }

            output_file = f"shortest_bonds_{species}.csv"
            final_df.to_csv(output_file, index=False)

            final_dfs[species] = final_df
            neighbor_summary_all[species] = neighbor_summary
            neighbor_counts_all[species] = neighbor_counts

        print("\nüîπ Average NN bond lengths per species:")

        return final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all


In [2]:
# ========================
# Example usage
# ========================

cif_file = "./hubbard_structures_cifs/mp-6031.cif"
# The structure argument is only consulted when no CIF path is given.  Pass a
# real None rather than the string "None", which would otherwise be mistaken
# for a structure object.
structure = None
analyzer = BondAnalyzer(cif_file, structure, tolerance=0.4, n_shortest=4)
target_species = ["Li"]  # multiple entries possible
final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all = analyzer.analyze(target_species)
print(final_dfs)
print(avg_nn_bl_dict)
print(neighbor_summary_all)
print(neighbor_counts_all)

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
'''for sp, df in final_dfs.items():
    print(f"\nüîπ Bonds for {sp}:\n", df)
    print("\nAverage NN bond lengths:", avg_nn_bl_dict)'''



üîπ Average NN bond lengths per species:
{'Li':    Atom 1 Atom 2  Distance (√Ö) Bond type
0     Li1     N3         2.159     ionic
1     Li1     N5         2.159     ionic
2     Li1     N4         2.159     ionic
3     Li1     N2         2.313     ionic
4     Li2     N7         2.159     ionic
..    ...    ...           ...       ...
59   Li15     N3         2.338     ionic
60   Li16     N3         2.058     ionic
61   Li16     N7         2.125     ionic
62   Li16     N4         2.176     ionic
63   Li16     N5         2.338     ionic

[64 rows x 4 columns]}
{'Li': np.float64(2.157375)}
{'Li': {'Li1': ['N3', 'N5', 'N4', 'N2'], 'Li2': ['N7', 'N6', 'N8', 'N1'], 'Li3': ['O1', 'N5', 'N4', 'N3'], 'Li4': ['O1', 'N7', 'N6', 'N8'], 'Li5': ['N2', 'O1', 'N5', 'N1'], 'Li6': ['N2', 'O1', 'N3', 'N1'], 'Li7': ['N2', 'O1', 'N4', 'N1'], 'Li8': ['N1', 'O1', 'N8', 'N2'], 'Li9': ['N1', 'O1', 'N6', 'N2'], 'Li10': ['N1', 'O1', 'N7', 'N2'], 'Li11': ['N5', 'N6', 'N3', 'N4'], 'Li12': ['N7', 'N5', 'N8', 'N6'

'for sp, df in final_dfs.items():\n    print(f"\nüîπ Bonds for {sp}:\n", df)\n    print("\nAverage NN bond lengths:", avg_nn_bl_dict)'

In [None]:
# Export this notebook (bonds.ipynb) to a plain Python script.
# The leading `!` is an IPython shell escape, so this line only works inside a notebook/IPython session.
!jupyter nbconvert --to python bonds.ipynb

In [5]:
'''from pymatgen.core import Structure
from mendeleev import element
import itertools
import pandas as pd
from collections import Counter
import re

class BondAnalyzer2:
    def __init__(self, cif_file=None, structure=None, tolerance=0.4, n_shortest=4):
        """
        Initialize the bond analyzer.
        Parameters:
            cif_file (str): Path to the CIF file.
            structure (Structure): Optional pymatgen Structure object.
            tolerance (float): Bond length tolerance.
            n_shortest (int): Number of shortest bonds to consider.
        """
        if cif_file is not None:
            try:
                self.cif_file = cif_file
                self.struct = Structure.from_file(cif_file)
            except Exception as e:
                print(f"‚ö†Ô∏è Reading CIF file failed: {e}")
        elif structure is not None:
            self.struct = structure
        else:
            raise ValueError("You must provide either a CIF file or a pymatgen Structure object.")

        self.tolerance = tolerance
        self.n_shortest = n_shortest

        # Metals list
        self.metals = [
            "Li", "Be", "Na", "Mg", "K", "Ca", "Al", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
            "Ag", "Au", "Ga", "In", "Sn", "Tl", "Pb", "Bi",
            "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
            "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr"
        ]

        # Assign atom labels and bonds
        self.atom_labels = self._assign_atom_labels()
        self.bonds_list = self._generate_bonds()

    def _assign_atom_labels(self):
        """Assign numbered labels like Ag1, Ag2, O1, etc."""
        element_counts = {}
        labels = []
        for site in self.struct:
            symbol = site.specie.symbol
            element_counts[symbol] = element_counts.get(symbol, 0) + 1
            labels.append(f"{symbol}{element_counts[symbol]}")
        return labels

    def _generate_bonds(self):
        """Generate all possible bonds based on distance and bonding radii."""
        bonds_list = []
        seen = set()

        for i, j in itertools.combinations(range(len(self.struct)), 2):
            site_i, site_j = self.struct[i], self.struct[j]
            label_i, label_j = self.atom_labels[i], self.atom_labels[j]
            d = site_i.distance(site_j)
            d_round = round(d, 3)

            el_i = element(site_i.specie.symbol)
            el_j = element(site_j.specie.symbol)

            r_i = el_i.metallic_radius if el_i.symbol in self.metals else el_i.covalent_radius
            r_j = el_j.metallic_radius if el_j.symbol in self.metals else el_j.covalent_radius
            if r_i is None or r_j is None:
                continue

            if d <= (r_i + r_j + self.tolerance):
                # Determine bond type
                if el_i.symbol in self.metals and el_j.symbol in self.metals:
                    bond_type = "metallic"
                else:
                    en_i = el_i.electronegativity("pauling")
                    en_j = el_j.electronegativity("pauling")
                    if en_i is None or en_j is None:
                        bond_type = "unknown"
                    else:
                        delta_en = abs(en_i - en_j)
                        if delta_en < 0.4:
                            bond_type = "covalent"
                        elif delta_en < 1.7:
                            bond_type = "polar covalent"
                        else:
                            bond_type = "ionic"

                key = tuple(sorted([label_i, label_j]))
                if key in seen:
                    continue
                seen.add(key)

                bonds_list.append({
                    "Atom 1": label_i,
                    "Atom 2": label_j,
                    "Distance (√Ö)": d_round,
                    "Bond type": bond_type
                })
        return bonds_list

    def analyze(self, target_species):
        """
        Analyze shortest bonds for one or more target species.

        Behavior:
          - If a target species includes a digit (e.g. 'Li1'), it matches only that atom.
          - If it includes only letters (e.g. 'Li'), it matches all atoms of that element ('Li1', 'Li2', ...).
          - You can mix both (e.g., ['Li', 'Li1', 'Al2', 'Al']).
        """
        df = pd.DataFrame(self.bonds_list)
        if df.empty:
            print("‚ö†Ô∏è No bonds found. Check tolerance or CIF structure.")
            return {}, {}, {}, {}

        final_dfs = {}
        avg_nn_bl_dict = {}
        neighbor_summary_all = {}
        neighbor_counts_all = {}

        for species in target_species:
            results = []
            neighbor_summary = {}

            has_digit = any(char.isdigit() for char in species)

            for atom in df["Atom 1"].unique():
                # Exact match (Li1) or element-wide match (Li ‚Üí Li1, Li2, ...)
                if (has_digit and atom == species) or (not has_digit and re.match(rf"^{species}\d+$", atom)):
                    atom_bonds = df[(df["Atom 1"] == atom) | (df["Atom 2"] == atom)]
                    shortest = atom_bonds.sort_values("Distance (√Ö)").head(self.n_shortest)
                    results.append(shortest)

                    neighbors = []
                    for _, row in shortest.iterrows():
                        other_atom = row["Atom 2"] if row["Atom 1"] == atom else row["Atom 1"]
                        if not other_atom.startswith(species):
                            neighbors.append(other_atom)
                    neighbor_summary[atom] = neighbors

            if not results:
                print(f"‚ö†Ô∏è No bonds found for species {species}.")
                continue

            final_df = pd.concat(results).drop_duplicates().reset_index(drop=True)
            avg_nn_bl = final_df["Distance (√Ö)"].mean()
            avg_nn_bl_dict[species] = avg_nn_bl

            neighbor_counts = {}
            for atom, neigh_list in neighbor_summary.items():
                symbols = [''.join(filter(str.isalpha, x)) for x in neigh_list]
                neighbor_counts[atom] = dict(Counter(symbols))

            output_file = f"shortest_bonds_{species}.csv"
            final_df.to_csv(output_file, index=False)

            final_dfs[species] = final_df
            neighbor_summary_all[species] = neighbor_summary
            neighbor_counts_all[species] = neighbor_counts

        # ---- Improved Summary Output ----
        print("\nüîπ Average nearest-neighbor bond lengths:")
        grouped_summary = {}
        for sp in avg_nn_bl_dict:
            base_el = ''.join(filter(str.isalpha, sp))
            grouped_summary.setdefault(base_el, []).append((sp, avg_nn_bl_dict[sp]))

        for el, values in grouped_summary.items():
            print(f"\n  üß© Element: {el}")
            for sp, avg_val in values:
                print(f"     {sp:<8} ‚Üí {avg_val:.3f} √Ö")
                avg_val = "{:.3f}".format(avg_val)

        return final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all, avg_val
'''
# ----------------------------------------ORIGINAL---------------------------------------

In [4]:
# ========================
# Example usage
# ========================

'''cif_file = "./hubbard_structures_cifs/mp-6031.cif"
structure ="None"
analyzer = BondAnalyzer2(cif_file,structure, tolerance=0.4, n_shortest=4)
target_species = ["Li"]  # multiple entries possible
final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all, avg_val = analyzer.analyze(target_species)
#print(final_dfs)
#print(avg_nn_bl_dict)
#print(neighbor_summary_all)
#print(neighbor_counts_all)
print(avg_val)


#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
    for sp, df in final_dfs.items():
    print(f"\nüîπ Bonds for {sp}:\n", df)
    print("\nAverage NN bond lengths:", avg_nn_bl_dict)'''


'cif_file = "./hubbard_structures_cifs/mp-6031.cif"\nstructure ="None"\nanalyzer = BondAnalyzer2(cif_file,structure, tolerance=0.4, n_shortest=4)\ntarget_species = ["Li"]  # multiple entries possible\nfinal_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all, avg_val = analyzer.analyze(target_species)\n#print(final_dfs)\n#print(avg_nn_bl_dict)\n#print(neighbor_summary_all)\n#print(neighbor_counts_all)\nprint(avg_val)\n\n\n#pd.set_option(\'display.max_columns\', None)\n#pd.set_option(\'display.max_rows\', None)\n    for sp, df in final_dfs.items():\n    print(f"\nüîπ Bonds for {sp}:\n", df)\n    print("\nAverage NN bond lengths:", avg_nn_bl_dict)'

In [35]:
from pymatgen.core import Structure
from mendeleev import element
import itertools
import pandas as pd
from collections import Counter
import re

class BondAnalyzer2:
    """Detect bonds in a structure and summarise the shortest bonds per atom.

    Two sites are considered bonded when their distance does not exceed the
    sum of their bonding radii plus ``tolerance``.  Metals (see ``self.metals``)
    use the metallic radius, everything else the covalent radius.

    Attributes:
        struct: The structure being analysed (iterable of sites).
        atom_labels: One label per site, e.g. ``Li1``, ``Li2``, ``O1``.
        bonds_list: List of bond records (dicts) produced at construction.
    """

    # Column name used for bond lengths in every bond record / DataFrame.
    DISTANCE_COLUMN = "Distance (√Ö)"

    # mendeleev tabulates radii in picometres while the site distances are in
    # angstrom, so radii must be divided by this factor before comparing.
    PM_PER_ANGSTROM = 100.0

    def __init__(self, cif_file=None, structure=None, tolerance=0.4, n_shortest=4):
        """
        Initialize the bond analyzer.

        Parameters:
            cif_file (str): Path to the CIF file. Takes priority over
                ``structure`` when not None.
            structure (Structure): Optional pymatgen Structure object.
            tolerance (float): Bond length tolerance in angstrom, added to the
                sum of the two bonding radii.
            n_shortest (int): Number of shortest bonds to consider.

        Raises:
            ValueError: If the CIF file cannot be read, or if neither a CIF
                file nor a structure is supplied.
        """
        self.cif_file = cif_file
        if cif_file is not None:
            try:
                self.struct = Structure.from_file(cif_file)
            except Exception as e:
                # Without a structure nothing below can work, so fail here with
                # a clear message instead of a later AttributeError.
                raise ValueError(f"Reading CIF file failed for {cif_file}: {e}") from e
        elif structure is not None:
            self.struct = structure
        else:
            raise ValueError("You must provide either a CIF file or a pymatgen Structure object.")

        self.tolerance = tolerance
        self.n_shortest = n_shortest

        # Metals list
        self.metals = [
            "Li", "Be", "Na", "Mg", "K", "Ca", "Al", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
            "Ag", "Au", "Ga", "In", "Sn", "Tl", "Pb", "Bi",
            "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
            "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
        ]

        # Assign atom labels and bonds
        self.atom_labels = self._assign_atom_labels()
        self.bonds_list = self._generate_bonds()

    @staticmethod
    def _symbol_of(label):
        """Return the element part of a label ('Li12' -> 'Li', 'Li' -> 'Li')."""
        return str(label).rstrip("0123456789")

    def _assign_atom_labels(self):
        """Assign numbered labels like Ag1, Ag2, O1, etc. in site order."""
        element_counts = Counter()
        labels = []
        for site in self.struct:
            symbol = site.specie.symbol
            element_counts[symbol] += 1
            labels.append(f"{symbol}{element_counts[symbol]}")
        return labels

    def _bonding_radius(self, el, is_metal):
        """Return the bonding radius of ``el`` in angstrom, or None if unknown.

        Metals prefer the metallic radius; when that is not tabulated (or for
        non-metals) the covalent radius is used instead.
        """
        radius_pm = el.metallic_radius if is_metal else None
        if radius_pm is None:
            radius_pm = el.covalent_radius
        if radius_pm is None:
            return None
        return radius_pm / self.PM_PER_ANGSTROM

    @staticmethod
    def _classify_bond(el_i, el_j, both_metals):
        """Classify a bond from metal membership and Pauling electronegativity."""
        if both_metals:
            return "metallic"
        en_i = el_i.electronegativity("pauling")
        en_j = el_j.electronegativity("pauling")
        if en_i is None or en_j is None:
            return "unknown"
        delta_en = abs(en_i - en_j)
        if delta_en < 0.4:
            return "covalent"
        if delta_en < 1.7:
            return "polar covalent"
        return "ionic"

    def _generate_bonds(self):
        """Generate all bonds based on distance and bonding radii.

        Returns:
            list[dict]: One record per bonded pair with keys ``Atom 1``,
            ``Atom 2``, the distance column and ``Bond type``.
        """
        metals = frozenset(self.metals)
        symbols = [site.specie.symbol for site in self.struct]

        # Look every element up once instead of once per atom pair.
        elements = {}
        radii = {}
        for symbol in set(symbols):
            el = element(symbol)
            elements[symbol] = el
            radii[symbol] = self._bonding_radius(el, symbol in metals)

        bonds_list = []
        for (i, site_i), (j, site_j) in itertools.combinations(enumerate(self.struct), 2):
            sym_i, sym_j = symbols[i], symbols[j]
            r_i, r_j = radii[sym_i], radii[sym_j]
            if r_i is None or r_j is None:
                continue

            d = site_i.distance(site_j)
            if d > r_i + r_j + self.tolerance:
                continue

            bonds_list.append({
                "Atom 1": self.atom_labels[i],
                "Atom 2": self.atom_labels[j],
                self.DISTANCE_COLUMN: round(d, 3),
                "Bond type": self._classify_bond(
                    elements[sym_i], elements[sym_j], sym_i in metals and sym_j in metals
                ),
            })
        return bonds_list

    def analyze(self, target_species):
        """
        Analyze shortest bonds for one or more target species.

        Behavior:
          - If a target species includes a digit (e.g. 'Li1'), it matches only that atom.
          - If it includes only letters (e.g. 'Li'), it matches all atoms of that element ('Li1', 'Li2', ...).
          - You can mix both (e.g., ['Li', 'Li1', 'Al2', 'Al']).

        Parameters:
            target_species (iterable[str]): Element symbols and/or atom labels.

        Returns:
            final_dfs (dict): DataFrame of kept bonds for each target.
            avg_nn_bl_dict (dict): Average kept bond length for each target.
            neighbor_summary_all (dict): Neighbor atoms (other elements only) per atom.
            neighbor_counts_all (dict): Neighbor element counts per atom.
            The same 4-tuple shape (of empty dicts) is returned when no bonds exist.
        """
        dist_col = self.DISTANCE_COLUMN
        df = pd.DataFrame(self.bonds_list)
        if df.empty:
            print("‚ö†Ô∏è No bonds found. Check tolerance or CIF structure.")
            return {}, {}, {}, {}

        # Consider atoms from BOTH columns: the last site of a structure can
        # only ever appear as "Atom 2".  Site order is kept for stable output.
        bonded = set(df["Atom 1"]) | set(df["Atom 2"])
        candidates = [label for label in self.atom_labels if label in bonded]

        final_dfs = {}
        avg_nn_bl_dict = {}
        neighbor_summary_all = {}
        neighbor_counts_all = {}

        for species in target_species:
            target = str(species)
            target_symbol = self._symbol_of(target)
            exact_label = target_symbol != target

            results = []
            neighbor_summary = {}
            for atom in candidates:
                # Exact match (Li1) or element-wide match (Li -> Li1, Li2, ...),
                # comparing whole symbols so 'C' never matches 'Ca1'.
                matches = atom == target if exact_label else self._symbol_of(atom) == target
                if not matches:
                    continue

                atom_bonds = df[(df["Atom 1"] == atom) | (df["Atom 2"] == atom)]
                # Stable sort keeps ties in bond-list order, making head() deterministic.
                shortest = atom_bonds.sort_values(dist_col, kind="mergesort").head(self.n_shortest)
                results.append(shortest)

                partners = shortest["Atom 2"].where(shortest["Atom 1"] == atom, shortest["Atom 1"])
                neighbor_summary[atom] = [
                    other for other in partners if self._symbol_of(other) != target_symbol
                ]

            if not results:
                print(f"‚ö†Ô∏è No bonds found for species {species}.")
                continue

            # A bond between two selected atoms shows up twice; count it once.
            final_df = pd.concat(results).drop_duplicates().reset_index(drop=True)
            avg_nn_bl_dict[species] = final_df[dist_col].mean()

            neighbor_counts = {
                atom: dict(Counter(self._symbol_of(other) for other in neigh_list))
                for atom, neigh_list in neighbor_summary.items()
            }

            final_dfs[species] = final_df
            neighbor_summary_all[species] = neighbor_summary
            neighbor_counts_all[species] = neighbor_counts

        return final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all


In [37]:
cif_file = "./hubbard_structures_cifs/mp-2940.cif"
# The structure argument is only consulted when no CIF path is given.  Pass a
# real None rather than the string "None", which would otherwise be mistaken
# for a structure object.
structure = None
analyzer = BondAnalyzer2(cif_file, structure, tolerance=0.4, n_shortest=4)
target_species = ["P"]  # multiple entries possible
final_dfs, avg_nn_bl_dict, neighbor_summary_all, neighbor_counts_all = analyzer.analyze(target_species)
#print(final_dfs)
print(avg_nn_bl_dict)
#print(neighbor_summary_all)
#print(neighbor_counts_all)
#print(avg_val)


#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
for sp, df in final_dfs.items():
    print(f"\nüîπ Bonds for {sp}:\n", df)
    print("\nAverage NN bond lengths:", avg_nn_bl_dict)

{'P': np.float64(1.536)}

üîπ Bonds for P:
   Atom 1 Atom 2  Distance (√Ö)       Bond type
0     P1     O2         1.536  polar covalent
1     P1     O3         1.536  polar covalent
2     P1     O6         1.536  polar covalent
3     P1     O7         1.536  polar covalent
4     P2     O1         1.536  polar covalent
5     P2     O4         1.536  polar covalent
6     P2     O5         1.536  polar covalent
7     P2     O8         1.536  polar covalent

Average NN bond lengths: {'P': np.float64(1.536)}


In [21]:
from mendeleev import element
# Print the metallic and covalent radii mendeleev reports for Lu.
# In the recorded run this printed `None 162.0`: Lu has no metallic radius,
# while its covalent radius is given on a picometre-sized scale.
print(element("Lu").metallic_radius, element("Lu").covalent_radius)


None 162.0


In [None]:
import os
import pandas as pd
from tqdm import tqdm
from pymatgen.core import Structure

# Make sure BondAnalyzer2 is already imported or defined before this class

class NNAnalyzer:
    """
    Extracts nearest-neighbor features (e.g., average NN bond length)
    for each entry in a dataset of CIF files.

    Parameters:
        input_csv (str): Path to input CSV file containing 'identifier' and 'Species' columns.
        cif_folder (str): Folder containing CIF files (named <identifier>.cif).
        output_csv (str): Path to save the extracted features.
        tolerance (float): Bond tolerance for BondAnalyzer2.
        n_shortest (int): Number of shortest bonds to consider for NN.

    Raises:
        FileNotFoundError: If the input CSV cannot be read.
    """

    # Columns of the output CSV; fixed so the header is written even when no
    # entry could be processed.
    FEATURE_COLUMNS = ["identifier", "Species", "avg_nn_dist"]

    def __init__(self, input_csv, cif_folder, output_csv,
                 tolerance=0.4, n_shortest=4):
        self.input_csv = input_csv
        self.cif_folder = cif_folder
        self.output_csv = output_csv
        self.tolerance = tolerance
        self.n_shortest = n_shortest

        # Load dataset
        try:
            self.df = pd.read_csv(input_csv)
        except Exception as e:
            # Keep the exception type callers already see, but preserve the cause.
            raise FileNotFoundError(f"‚ùå Failed to read {input_csv}: {e}") from e

        # Feature list to accumulate results
        self.features_list = []

    def run(self):
        """Main execution loop: processes all structures in the dataset.

        For every row, the CIF named ``<identifier>.cif`` is loaded, analysed
        with BondAnalyzer2 for the row's species, and the average bond length
        is recorded.  Rows whose CIF is missing, unreadable, or whose analysis
        fails are reported and skipped.  The collected features are written to
        ``self.output_csv``.
        """
        # Start from a clean slate so calling run() twice does not duplicate rows.
        self.features_list = []

        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Nearest-neighbor feature extraction"):
            identifier = str(row["identifier"])
            species = row["Species"]
            cif_path = os.path.join(self.cif_folder, f"{identifier}.cif")

            # --- Check CIF existence ---
            if not os.path.exists(cif_path):
                print(f"‚ö†Ô∏è CIF not found for {identifier}, skipping...")
                continue

            # --- Load structure ---
            try:
                structure = Structure.from_file(cif_path)
            except Exception as e:
                print(f"‚ùå Failed to read CIF for {identifier}: {e}")
                continue

            # --- Run nearest-neighbor analysis ---
            try:
                # Hand over the already-parsed structure only; also passing the
                # path would make BondAnalyzer2 read the CIF a second time.
                analyzer = BondAnalyzer2(
                    cif_file=None,
                    structure=structure,
                    tolerance=self.tolerance,
                    n_shortest=self.n_shortest
                )
                # Index instead of unpacking a fixed number of values: the
                # average bond lengths are always the second returned item.
                results = analyzer.analyze([species])
                avg_nn_bl = results[1].get(species, None)
            except Exception as e:
                print(f"‚ö†Ô∏è NN analysis failed for {identifier}: {e}")
                continue

            # --- Store results ---
            self.features_list.append({
                "identifier": identifier,
                "Species": species,
                "avg_nn_dist": avg_nn_bl
            })

        # --- Save features to CSV ---
        features_df = pd.DataFrame(self.features_list, columns=self.FEATURE_COLUMNS)
        features_df.to_csv(self.output_csv, index=False)
        print(f"\n‚úÖ NN feature extraction completed. Results saved to ‚Üí {self.output_csv}")
