In [None]:
#import packages and set paths
import os
import re
import sys
import traceback

import dask
import numpy as np
import pandas as pd
from dask import delayed
from dask.distributed import Client, LocalCluster, progress, wait
path_to_prn = os.path.abspath(
    '.../PoroNet')
sys.path.append(path_to_prn)
import poronet_functions as prn
from ase.io import read

In [None]:
# Load the total-adsorption table collected from the GCMC simulation results
# (the extreme outliers tobmof-4092 and tobmof-5740 were removed beforehand).
tot_ads = np.loadtxt('160K_5bar_cccc_new.txt', dtype='str')

# Convert the adsorption values in columns 1 and 2 to g/L.
# NOTE(review): tot_ads is a fixed-width string array, so the converted numbers are
# written back as text and may be cut to the array's item width - confirm this is acceptable.
for ads_column in (1, 2):
    tot_ads[:, ads_column] = tot_ads[:, ads_column].astype(np.float64) / 22.4139757476 * 2.01588

In [None]:
#Randomly shuffle the tot_ads
# The global NumPy RNG is seeded first so the in-place row shuffle is reproducible.
# The hard-coded row indices deleted in the following cells refer to this shuffled
# order, so changing the seed, or re-running this cell on an already shuffled array,
# makes those indices point at different MOFs.
np.random.seed(14)
np.random.shuffle(tot_ads)

In [None]:
# Drop the MOFs for which no pore graph could be generated.
# The indices are row positions in the shuffled tot_ads, so this cell must run exactly once.
rows_without_pore_graph = [648, 1557, 1670, 1685]
tot_ads = np.delete(tot_ads, rows_without_pore_graph, axis=0)

In [None]:
# Drop the MOFs that ran out of memory during label extraction at 77 K / 100 bar.
# NOTE(review): this notebook processes the 160 K / 5 bar data - confirm the condition named above.
# The indices are row positions in tot_ads as it stands after the previous removal,
# so this cell must run exactly once and only after that cell.
rows_out_of_memory = [855, 1277, 1652, 1807]
tot_ads = np.delete(tot_ads, rows_out_of_memory, axis=0)

Extract pore-level labels from GCMC trajectories according to indices in tot_ads

In [None]:
def pore_level_label_extraction(i):
    """Extract per-pore adsorbate labels for the MOF stored in row ``i`` of ``tot_ads``.

    Steps:
      1. Read the number of unit cells along a, b and c from the first
         ``output_*.data`` file found in the MOF's ``Output/System_0`` folder.
      2. Locate the all-components movie PDB in ``Movies/System_0``.
      3. Build the distance grid, segment it into pore regions, apply periodic
         boundary conditions, and map the adsorbate atoms of the movie onto the regions.
      4. Divide every per-region ``*_mean`` value (``total_mean`` excluded) by the
         total number of unit cells.

    Parameters
    ----------
    i : int
        Row index into the module-level ``tot_ads`` array; column 0 holds the MOF id.

    Returns
    -------
    list
        One value per pore region, in the key order of the dictionary returned by
        ``prn.apply_region_map_to_raspa_pdb_wrapped``.

    Raises
    ------
    ValueError
        If no ``output_*.data`` file exists, if a unit-cell count cannot be parsed
        from it, or if the movie PDB file is missing.
    """
    mofid = tot_ads[i, 0]

    # define the path of gcmc folders
    path_to_160K_5bar = '.../gcmc/160K_5bar'
    path_to_mof = os.path.join(path_to_160K_5bar, mofid)
    path_to_output = os.path.join(path_to_mof, 'Output', 'System_0')
    path_to_cif = os.path.join(path_to_mof, mofid + '.cif')
    path_to_movie = os.path.join(path_to_mof, 'Movies', 'System_0')

    # Total number of unit cells = product of the counts along a, b and c.
    # Checked up front so a missing or malformed output file fails here with a clear
    # message instead of as an unbound name after the expensive grid computation.
    number_of_unitcells = None
    for file_name in os.listdir(path_to_output):
        if file_name.startswith('output_') and file_name.endswith('.data'):
            path_to_data = os.path.join(path_to_output, file_name)
            with open(path_to_data, 'r') as file:
                content = file.read()
            number_of_unitcells = 1
            for axis in 'abc':
                match = re.search(rf"Number of unitcells \[{axis}\]:\s*(\d+)", content)
                if match is None:
                    raise ValueError(
                        f'could not read the number of unit cells [{axis}] from {path_to_data}'
                    )
                number_of_unitcells *= int(match.group(1))
            # only the first matching output file is used
            break
    if number_of_unitcells is None:
        raise ValueError(f'no output_*.data file found in {path_to_output}')

    # find the movie file in Movies/System_0
    movie_prefix = f'Movie_{mofid}_'
    movie_suffix = '_160.000000_500000.000000_allcomponents.pdb'
    path_to_pdb = None
    for file_name in os.listdir(path_to_movie):
        if file_name.startswith(movie_prefix) and file_name.endswith(movie_suffix):
            path_to_pdb = os.path.join(path_to_movie, file_name)
            break
    if path_to_pdb is None:
        raise ValueError(
            f'no movie file {movie_prefix}*{movie_suffix} found in {path_to_movie}'
        )

    # read cif
    mat_atoms = read(path_to_cif)

    # compute distance grid
    dgrid = prn.dgrid_from_atoms_cpu_no_aabb(mat_atoms, spacing=0.5)

    # compute regions and maxima; the distance-grid value at each maximum is used as its radius
    regions, maxima = prn.regions_from_dgrid_with_threshold_abs(
        dgrid, mask_thickness=0, h=0.5, threshold_abs=1
    )
    maxima_radii = dgrid[tuple(maxima.T)]

    # apply pbc
    regions_pbc = prn.apply_pbc(regions, maxima, maxima_radii, mat_atoms)

    # dictionary with the number of atoms in each cluster type and the total number of atoms
    pore_dic = prn.apply_region_map_to_raspa_pdb_wrapped(
        path_to_pdb, regions_pbc, mat_atoms, symbol_map=dict(Hc='H')
    )

    # keep the per-region means (not the overall total) and normalise per unit cell
    return [
        value / number_of_unitcells
        for key, value in pore_dic.items()
        if '_mean' in key and 'total_mean' not in key
    ]
    

In [None]:
# Build one lazy task per row of tot_ads; nothing is computed in this cell.
delayed_pore_labels_future = [
    delayed(pore_level_label_extraction)(row_index)
    for row_index in range(len(tot_ads))
]

In [None]:
# Create a Dask client with 64 CPUs for parallel computation
# No scheduler address is passed, so this starts a local cluster on this machine.
# n_workers=64 is hard-coded; presumably it matches the available cores - TODO confirm.
client = Client(n_workers=64)

In [None]:
# Show the client summary as the cell output.
client

In [None]:
#Start the delayed tasks 
# persist() hands every delayed task to the active scheduler and returns one persisted
# object per input, in the same order as delayed_pore_labels_future.
futures = dask.persist(*delayed_pore_labels_future) 

In [None]:
#Check the progress
# Displays a progress bar for the persisted tasks.
progress(futures)

In [None]:
#Stop the task
# WARNING: shutdown() stops the scheduler and workers that `client` is connected to.
# The results-collection cell that follows still calls client.compute(futures), so run
# this cell only to abort the run, or after the results have been collected.
client.shutdown()

In [None]:
#Collecting results from the completed delayed task
results = client.compute(futures)
# Block until every task has either finished or errored. Without this the status
# check below races against the scheduler and records None for tasks that are
# merely still pending.
wait(results)
# A task that did not finish successfully is stored as None, which keeps the list
# aligned row-for-row with tot_ads.
pore_labels = [r.result() if r.status == 'finished' else None for r in results]

In [None]:
#Make pore_labels a 1d list
# A None entry marks a MOF whose extraction task did not finish. Dropping it silently
# would shift every later label, so stop and report the affected tot_ads rows instead
# of failing with an opaque "'NoneType' object is not iterable".
failed_rows = [row for row, labels in enumerate(pore_labels) if labels is None]
if failed_rows:
    raise ValueError(f'pore label extraction failed for tot_ads rows: {failed_rows}')
pore_labels_1d = [item for sublist in pore_labels for item in sublist]

In [None]:
# Write the flattened pore labels to an Excel file with a single column and no index.
pore_label_columns = ['160K_5bar-pore-molecule']
pore_label_xlsx = '160K_5bar-pore-molecule.xlsx'
df_pore = pd.DataFrame(pore_labels_1d, columns=pore_label_columns)
df_pore.to_excel(pore_label_xlsx, index=False)