Feature extraction (altogether)

1. Metal feature - Find the metal types from MOF.key, look each one up in the elemental_descriptors CSV file, and build a descriptor vector. If there are several metals, concatenate their vectors.
2. Linker feature - Find the linker SMILES from MOF.key, run RDKit, and compute the RDKit 2D descriptors and the Morgan fingerprint. If there are several linkers, concatenate their vectors.
3. Global feature - Using pymatgen, parse the .cif files and compute composition features with Matminer (Meredig featurizer).

In [None]:
import csv
import json
from pathlib import Path
import os

from pymatgen.io.cif import CifParser
from pymatgen.transformations.standard_transformations import SupercellTransformation

from matminer.featurizers.composition import Meredig
from matminer.featurizers.conversions import StrToComposition
import pandas as pd

from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, DataStructs
import numpy as np
from tqdm import tqdm

In [None]:
import math

csv_file_path = './elemental_descriptors.csv'


def _parse_cell(value):
    """Return ``value`` as a float when it is a finite number, else unchanged.

    Unlike a ``str.isdigit`` based check, this also converts signed values
    and exponent notation (``-1.5``, ``1e-3``), which would otherwise stay
    strings and end up as text inside the numeric feature vectors.
    """
    try:
        number = float(value)
    except ValueError:
        return value
    # Keep textual placeholders such as "nan" / "inf" as text so they are not
    # silently turned into non-finite floats.
    return number if math.isfinite(number) else value


# Maps the value in the second CSV column to the list of that row's
# remaining cells (from the third column onwards).
element_dict = {}

# newline='' is the mode the csv module documents for files passed to reader().
with open(csv_file_path, 'r', newline='') as file:
    csv_reader = csv.reader(file)

    # Skip the header row; the default keeps an empty file from raising.
    next(csv_reader, None)

    for row in csv_reader:
        # Blank or truncated lines have no key column to index by.
        if len(row) < 2:
            continue
        element_dict[row[1]] = [_parse_cell(value) for value in row[2:]]


In [None]:
# RDKit 2D descriptors (210 according to the original note; the count comes
# from Descriptors._descList)
def get_desc_2D(mols):
    """Compute every descriptor named in ``Descriptors._descList``.

    Args:
        mols: List of RDKit molecule objects.

    Returns:
        NumPy array built from one tuple of descriptor values per molecule,
        in the same order as ``mols``.
    """
    names = [entry[0] for entry in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(names)
    return np.array([calculator.CalcDescriptors(mol) for mol in mols])

    
# Atom level - binary Morgan fingerprint
def get_fp(mols, radius = 3, nBits = 1024):
    """Compute a Morgan bit-vector fingerprint for each molecule.

    Args:
        mols: List of RDKit molecule objects.
        radius: Morgan radius forwarded to RDKit.
        nBits: Fingerprint length forwarded to RDKit.

    Returns:
        NumPy array stacking one converted fingerprint per molecule, in the
        same order as ``mols``.
    """
    fingerprints = []

    for mol in mols:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        # Destination buffer for the conversion; presumably resized by RDKit
        # to hold all nBits entries.
        bits = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, bits)
        fingerprints.append(bits)

    return np.array(fingerprints)

In [None]:
# Load the per-MOF descriptor inputs.
with open('./descriptor_input.json','r') as handle:
    descriptor_input = json.load(handle)

# Index the entries by refcode for direct lookup while walking the CIF files.
input_dict = {entry['refcode']: entry for entry in descriptor_input}

In [None]:
# Build one feature vector per CIF file that has an entry in input_dict:
#   [metal descriptors | Meredig composition features | linker features]

# Fixed number of metal / linker slots per MOF. Shorter lists are padded so
# every vector shares one layout.
# NOTE(review): entries with more than 3 metals or linkers are not truncated
# and therefore yield a longer vector - confirm the input never has them.
MAX_METALS = 3
MAX_LINKERS = 3

# Sorted so the row order of x_vector / refcode_list is reproducible; glob
# order otherwise depends on the file system.
cif_files = sorted(Path('.').glob("*.cif"))

# Loop-invariant objects, created once instead of once per CIF file.
supercell_transform = SupercellTransformation([[1,0,0],[0,2,0],[0,0,4]])
str_to_composition = StrToComposition(target_col_id='composition_obj')
meredig_feat = Meredig()

# Width of one linker's feature slice, measured on a probe molecule rather
# than hard-coded: the number of RDKit descriptors depends on the installed
# RDKit version, so a fixed 1234 can make padded slots a different width
# from real ones.
_probe_mol = Chem.MolFromSmiles('C')
linker_feature_len = (
    len(get_desc_2D(mols=[_probe_mol])[0]) + len(get_fp(mols=[_probe_mol])[0])
)

x_vector = []
refcode_list = []

for cif_file_path in cif_files:
    # The refcode is the file name up to its first dot.
    refcode = os.path.basename(cif_file_path).split('.')[0]

    if refcode not in input_dict:
        continue
    entry = input_dict[refcode]

    vector = []

    # 1. Metal features. Pad a copy so the list stored in input_dict is not
    #    mutated; "None" is looked up in element_dict like any other key.
    metals = list(entry['metals'])
    metals.extend(["None"] * (MAX_METALS - len(metals)))
    for metal in metals:
        vector.extend(element_dict[metal])

    # 2. Global features: Meredig featurizer on the formula of the 1x2x4
    #    supercell of the first structure in the CIF.
    structure = CifParser(str(cif_file_path)).parse_structures(primitive=False)[0]
    new_structure = supercell_transform.apply_transformation(structure)

    df = pd.DataFrame([new_structure.composition.formula], columns=['composition'])
    df = str_to_composition.featurize_dataframe(df, 'composition', pbar=False)
    df = meredig_feat.featurize_dataframe(df, col_id='composition_obj', pbar=False)

    # Columns 0 and 1 are the formula string and the Composition object;
    # everything after them is featurizer output.
    vector.extend(df.iloc[:, 2:].values.flatten().tolist())

    # 3. Linker features: RDKit descriptors followed by the Morgan
    #    fingerprint, for each linker in turn.
    smiles = entry['linkers']
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles returns None for unparsable input; fail here with
            # the offending entry named instead of deep inside RDKit.
            raise ValueError(
                f"{refcode}: RDKit could not parse linker SMILES {smile!r}"
            )

        vector.extend(get_desc_2D(mols=[mol])[0])
        vector.extend(get_fp(mols=[mol])[0])

    # Zero-fill the unused linker slots.
    for _ in range(MAX_LINKERS - len(smiles)):
        vector.extend([0] * linker_feature_len)

    refcode_list.append(refcode)
    x_vector.append(vector)


In [None]:
# Write the feature matrix and the matching refcode order as JSON files.
for out_name, payload in (('x_vector.json', x_vector), ('refcode_list.json', refcode_list)):
    with open(out_name, 'w') as file:
        json.dump(payload, file)