<a href="https://colab.research.google.com/github/RMaarefdoust/Crystal-Structure-Prediction/blob/main/Preprocess_cif_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymatgen

Collecting pymatgen
  Downloading pymatgen-2024.7.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matplotlib>=3.8 (from pymatgen)
  Downloading matplotlib-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting monty>=2024.5.24 (from pymatgen)
  Downloading monty-2024.7.12-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting palettable>=3.1.1 (from pymatgen)
  Downloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.3/332.3 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting pybtex>=0.24.0 (from pymatge

In [13]:
import os
import json
from pymatgen.io.cif import CifParser

def extract_structure_data(cif_path):
    """
    Extracts structural data from a CIF file.

    The file is parsed with pymatgen's ``CifParser`` and the first structure
    it yields is summarised (each CIF file is assumed to hold one structure).

    Args:
        cif_path: Path to the CIF file to parse.

    Returns:
        dict: Lattice lengths and angles, cell volume, full and reduced
        formulas, space-group info and atom count, keyed by CIF-style tags.
    """
    parser = CifParser(cif_path)

    # ``get_structures`` is deprecated in recent pymatgen releases. Its
    # deprecation notice states that ``parse_structures(primitive=True)``
    # reproduces the old behaviour, so prefer it and only fall back to the
    # old method on pymatgen versions that do not provide the new one.
    if hasattr(parser, "parse_structures"):
        structures = parser.parse_structures(primitive=True)
    else:
        structures = parser.get_structures()
    structure = structures[0]  # Assuming there's only one structure in each CIF file

    lattice = structure.lattice
    composition = structure.composition

    data = {
        # NOTE(review): get_space_group_info() presumably returns a
        # (symbol, number) pair rather than just the H-M symbol, so this
        # entry serialises as a two-element list - confirm downstream use.
        "_symmetry_space_group_name_H-M": structure.get_space_group_info(),
        "_cell_length_a": lattice.a,
        "_cell_length_b": lattice.b,
        "_cell_length_c": lattice.c,
        "_cell_angle_alpha": lattice.alpha,
        "_cell_angle_beta": lattice.beta,
        "_cell_angle_gamma": lattice.gamma,
        "_chemical_formula_structural": structure.formula,
        "_chemical_formula_sum": composition.reduced_formula,
        "_cell_volume": structure.volume,
        # NOTE(review): this stores the composition's atom count under the
        # "formula units Z" tag - verify that is the intended quantity.
        "_cell_formula_units_Z": composition.num_atoms,
    }

    return data

def extract_atom_site_data(cif_path):
    """
    Extracts atom site data from a CIF file.

    The file is scanned for the ``loop_`` whose header declares
    ``_atom_site_fract_x``. Each data row of that loop is split on whitespace
    and its values are assigned to output columns by header tag, so the
    column order in the file does not matter and rows of unrelated loops
    (for example a 7-column anisotropic displacement loop) are never
    mistaken for atom sites.

    Args:
        cif_path: Path to the CIF file to read.

    Returns:
        dict: Maps each ``atom_site_*`` key to a list with one entry per
        atom-site row. Entries are the raw strings from the file, or ``None``
        when the loop does not declare the corresponding column. All lists
        have the same length.
    """
    # Output key -> CIF tag of the loop column that feeds it.
    column_tags = {
        "atom_site_type_symbol": "_atom_site_type_symbol",
        "atom_site_label": "_atom_site_label",
        "atom_site_symmetry_multiplicity": "_atom_site_symmetry_multiplicity",
        "atom_site_fract_x": "_atom_site_fract_x",
        "atom_site_fract_y": "_atom_site_fract_y",
        "atom_site_fract_z": "_atom_site_fract_z",
        "atom_site_occupancy": "_atom_site_occupancy",
    }
    atom_site_data = {key: [] for key in column_tags}

    headers = []       # tags declared by the loop currently being read
    in_loop = False    # True from a "loop_" line until the loop ends
    in_header = False  # True while the loop's tag lines are still being read

    with open(cif_path, 'r') as cif_file:
        for raw_line in cif_file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue  # blank lines and comments carry no data

            lowered = line.lower()
            if lowered.startswith("loop_"):
                # A new loop starts: forget the previous loop's columns.
                headers = []
                in_loop = True
                in_header = True
                continue
            if not in_loop:
                continue

            if line.startswith("_"):
                if in_header:
                    headers.append(line.split()[0].lower())
                else:
                    # A data name after the rows terminates the loop.
                    in_loop = False
                continue
            if lowered.startswith("data_"):
                in_loop = False
                continue

            # First non-tag line: the header is complete, rows follow.
            in_header = False
            if "_atom_site_fract_x" not in headers:
                continue  # rows of some other loop

            values = line.split()
            if len(values) != len(headers):  # Ensure line contains the expected number of elements
                continue

            row = dict(zip(headers, values))
            for key, tag in column_tags.items():
                atom_site_data[key].append(row.get(tag))

    return atom_site_data

def save_combined_data_to_json(cif_file, structure_data, atom_site_data, output_folder):
    """
    Saves combined structural and atom site data to a JSON file in the specified output folder.

    The output file is named after the CIF file with its extension replaced
    by ``_combined_data.json``. The output folder is created if it does not
    exist yet.

    Args:
        cif_file: File name of the source CIF file (used to build the output name).
        structure_data: JSON-serialisable data stored under ``"structure_data"``.
        atom_site_data: JSON-serialisable data stored under ``"atom_site_data"``.
        output_folder: Folder the JSON file is written to.

    Returns:
        None. The path of the written file is printed.
    """
    # Create the destination on demand so a fresh run does not fail with
    # FileNotFoundError. An empty folder name means the current directory,
    # for which os.makedirs("") would itself raise, hence the guard.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_json_path = os.path.join(output_folder, os.path.splitext(cif_file)[0] + "_combined_data.json")

    with open(output_json_path, 'w') as jsonfile:
        json.dump({
            "structure_data": structure_data,
            "atom_site_data": atom_site_data
        }, jsonfile, indent=4)

    print(f"Combined data saved to {output_json_path}")

def process_cif_files(folder_path, output_folder):
    """
    Converts every CIF file found in a folder into a combined JSON file.

    For each entry of ``folder_path`` whose name ends with ``.cif``, the
    structural data and the atom site data are extracted and written together
    to one JSON file in ``output_folder``. A summary line with the number of
    CIF files is printed at the end.
    """
    # Keep only the directory entries that carry the .cif extension.
    cif_names = list(filter(lambda name: name.endswith('.cif'), os.listdir(folder_path)))

    for name in cif_names:
        source_path = os.path.join(folder_path, name)
        # Structure summary first, then the atom-site table, then the dump.
        save_combined_data_to_json(
            name,
            extract_structure_data(source_path),
            extract_atom_site_data(source_path),
            output_folder,
        )

    print(f"JSON files created for {len(cif_names)} CIF files.")

# Path to the folder containing CIF files (relative to the working directory)
folder_path = "cif"

# Output folder path; one "<name>_combined_data.json" file is written here per CIF file
output_folder = "cif3"

# Process CIF files and save output in the new folder
process_cif_files(folder_path, output_folder)

The only difference is that primitive defaults to False in the new parse_structures method.So parse_structures(primitive=True) is equivalent to the old behavior of get_structures().
  structure = parser.get_structures()[0]  # Assuming there's only one structure in each CIF file


Combined data saved to cif3/mp-1219097_combined_data.json
Combined data saved to cif3/mp-1219088_combined_data.json
Combined data saved to cif3/mp-1219066_combined_data.json
Combined data saved to cif3/mp-1219094_combined_data.json
Combined data saved to cif3/mp-1219078_combined_data.json
Combined data saved to cif3/mp-1219065_combined_data.json
Combined data saved to cif3/mp-1219058_combined_data.json
Combined data saved to cif3/mp-1219096_combined_data.json
Combined data saved to cif3/mp-1219095_combined_data.json
Combined data saved to cif3/mp-1219080_combined_data.json
Combined data saved to cif3/mp-1219076_combined_data.json
Combined data saved to cif3/mp-1219063_combined_data.json
Combined data saved to cif3/mp-1219067_combined_data.json
Combined data saved to cif3/mp-1219082_combined_data.json
Combined data saved to cif3/mp-1219079_combined_data.json
Combined data saved to cif3/mp-1219056_combined_data.json
Combined data saved to cif3/mp-1219059_combined_data.json
Combined data 

In [14]:
import os
import json
import pandas as pd

def process_json_files(json_dir, output_csv):
    """
    Collects the combined JSON files of a directory into two CSV files.

    Every ``*.json`` file in ``json_dir`` must contain the keys
    ``"structure_data"`` and ``"atom_site_data"``. Each file contributes one
    row to ``<output_csv>_structure.csv`` and one row to
    ``<output_csv>_atom_site.csv``. Files are read in sorted name order so
    that the row order of both CSV files is reproducible.

    Args:
        json_dir: Directory containing the JSON files.
        output_csv: Output path prefix (without extension) for the CSV files.

    Returns:
        None. The two CSV files are written as a side effect.

    Raises:
        KeyError: If a JSON file lacks one of the two expected keys.
    """
    # Initialize empty lists to store data
    structure_data_list = []
    atom_site_data_list = []

    # os.listdir returns entries in arbitrary order; sort them so repeated
    # runs (and runs on other machines) produce identically ordered rows.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(json_dir, filename), 'r') as f:
            data = json.load(f)
        structure_data_list.append(data['structure_data'])
        atom_site_data_list.append(data['atom_site_data'])

    # Convert lists of dictionaries to DataFrames
    structure_df = pd.DataFrame(structure_data_list)
    atom_site_df = pd.DataFrame(atom_site_data_list)

    # Write DataFrames to CSV
    structure_df.to_csv(output_csv + "_structure.csv", index=False)
    atom_site_df.to_csv(output_csv + "_atom_site.csv", index=False)

if __name__ == "__main__":
    # Directory containing JSON files (the same "cif3" folder the CIF-to-JSON step writes to)
    json_directory = "cif3"

    # Output CSV file name (without extension); "_structure.csv" and
    # "_atom_site.csv" are appended to it by process_json_files
    output_csv_file = "output"

    # Process JSON files and create CSV
    process_json_files(json_directory, output_csv_file)
