In [12]:
import ast
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import rdkit.Chem.AllChem as Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt

In [2]:
# cleanup
df_info = pd.read_csv('data/hMOF_CO2_info.csv')
df_info = df_info.dropna() # drop entries containing 'NaN'
df_info = df_info[df_info.CO2_wc_001>0] # only keep entries with positive CO2 working capacity
df_info = df_info[~df_info.MOFid.str.contains('ERROR')] # drop entries with error
df_info = df_info[~df_info.MOFid.str.contains('NA')] # drop entries with NA

In [6]:
# get node and linker information
metal_eles = ['Zn', 'Cu', 'Mn', 'Zr', 'Co', 'Ni', 'Fe', 'Cd', 'Pb', 'Al', 'Mg', 'V',
       'Tb', 'Eu', 'Sm', 'Tm', 'Gd', 'Nd', 'Dy', 'La', 'Ba', 'Ga', 'In',
       'Ti', 'Be', 'Ce', 'Li', 'Pd', 'Na', 'Er', 'Ho', 'Yb', 'Ag', 'Pr',
       'Cs', 'Mo', 'Lu', 'Ca', 'Pt', 'Ge', 'Sc', 'Hf', 'Cr', 'Bi', 'Rh',
       'Sn', 'Ir', 'Nb', 'Ru', 'Th', 'As', 'Sr']

# get a list of metal nodes & create a new column named "metal_nodes"
metal_nodes = []
organic_linkers = []
for i,mofid in tqdm(enumerate(df_info.MOFid)):
    sbus = mofid.split()[0].split('.')
    metal_nodes.append([c for c in sbus if any(e in c for e in metal_eles)][0])
    organic_linkers.append([c for c in sbus if any(e in c for e in metal_eles)==False])

df_info['metal_node'] = metal_nodes
df_info['organic_linker'] = organic_linkers

107449it [00:02, 39452.56it/s]


In [7]:
unique_nodes = [n for n in list(df_info['metal_node'].unique()) if len(n)<=30] # node smiles should be shorter then 30 strings
df_info = df_info[df_info['metal_node'].isin(unique_nodes)] # filter df_info based on unique_nodes
freq = [df_info['metal_node'].value_counts()[value] for value in list(df_info.metal_node.unique())] # get frequency of unique nodes
df_freq = pd.DataFrame({'node':list(df_info.metal_node.unique()),'freq':freq})
print('most frequently occuring node:')
print(df_freq)
unique_node_select = ['[Zn][Zn]', '[Cu][Cu]', '[Zn][O]([Zn])([Zn])[Zn]', '[V]'] # manually select the most occuring nodes
df_info_select = df_info[df_info['metal_node'].isin(unique_node_select)] # select df_info with node only in list(unique_node_select)
# output df for each node to a separate csv files
for n in unique_node_select:
    df_info_select_node = df_info[df_info.metal_node == n]
    df_info_select_node.to_csv(f'data/data_by_node/{n}.csv',index=False)

most frequently occuring node:
                        node   freq
0                   [Zn][Zn]  28529
1                   [Cu][Cu]  29714
2    [Zn][O]([Zn])([Zn])[Zn]  43874
3                        [V]   5197
4                 [O][V]O[V]     46
5                  [V]1[V]O1     50
6                    [V]O[V]     32
7            [V]O[V]O[V]O[V]      1
8                       [Zn]      2
9          [V]1O[V][V]O[V]O1      1
10  F[Zn][O]([Zn])([Zn])[Zn]      1
11                 [Cu][CuH]      1
12         [Zr][O]([Zr])[Zr]      1


In [13]:
# load data
for node in unique_node_select:
    input_data_path = f'data/data_by_node/{node}.csv' 
    output_data_path = f'data/data_high_wc/{node}.csv'

    df = pd.read_csv(input_data_path)

    # select entries with high working capactiy
    df_high_wc = df[df['CO2_wc_01'] >=2]

    # select entries with three parsed linker
    len_linkers = [len(ast.literal_eval(df_high_wc['organic_linker'].iloc[i])) for i in range(len(df_high_wc['organic_linkers']))]
    df_high_wc['len_linkers'] = len_linkers
    df_high_wc_select = df_high_wc[df_high_wc.len_linkers==3]
    df_high_wc_select.to_csv(output_data_path,index=False)

    # get list of smiles strings for all linkers

    list_smiles = [ast.literal_eval(i) for i in df_high_wc_select['organic_linkers']]
    all_smiles = list(itertools.chain(*list_smiles))
    print(len(all_smiles))

    # output to sdf
    conformer_sdf_path = f'data/conformers_{node}.sdf'

    writer = Chem.SDWriter(conformer_sdf_path)
    for smile in tqdm(all_smiles):
        try:
            mol = Chem.AddHs(Chem.MolFromSmiles(smile))
            conformers = AllChem.EmbedMultipleConfs(mol, numConfs=1)
            conformer = mol.GetConformer(0)
            for cid in range(mol.GetNumConformers()):
                writer.write(mol, confId=cid)
        except:
            pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_high_wc['len_linkers'] = len_linkers


2502


  0%|          | 0/2502 [00:00<?, ?it/s][22:02:34] Explicit valence for atom # 3 N, 4, is greater than permitted
  1%|          | 27/2502 [00:00<01:00, 40.88it/s] [22:02:35] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:02:35] Explicit valence for atom # 15 N, 4, is greater than permitted
  5%|▍         | 116/2502 [00:01<00:29, 81.36it/s][22:02:36] Explicit valence for atom # 1 N, 4, is greater than permitted
  7%|▋         | 164/2502 [00:02<00:22, 104.65it/s][22:02:36] Explicit valence for atom # 1 N, 4, is greater than permitted
  7%|▋         | 175/2502 [00:02<00:23, 97.74it/s] [22:02:36] Explicit valence for atom # 10 N, 4, is greater than permitted
 12%|█▏        | 290/2502 [00:03<00:18, 118.84it/s][22:02:37] Explicit valence for atom # 4 N, 4, is greater than permitted
 13%|█▎        | 328/2502 [00:03<00:19, 112.96it/s][22:02:38] Explicit valence for atom # 2 N, 4, is greater than permitted
[22:02:38] Explicit valence for atom # 3 N, 4, is greater than permitt

3048


  0%|          | 0/3048 [00:00<?, ?it/s][22:03:03] Explicit valence for atom # 3 N, 4, is greater than permitted
  4%|▍         | 122/3048 [00:01<00:29, 99.49it/s] [22:03:04] Explicit valence for atom # 5 N, 4, is greater than permitted
  5%|▌         | 167/3048 [00:01<00:26, 108.51it/s][22:03:05] Explicit valence for atom # 11 N, 4, is greater than permitted
 11%|█         | 325/3048 [00:03<00:28, 95.79it/s] [22:03:06] Explicit valence for atom # 14 N, 4, is greater than permitted
 16%|█▌        | 490/3048 [00:04<00:24, 106.13it/s][22:03:08] Explicit valence for atom # 13 N, 4, is greater than permitted
 19%|█▉        | 579/3048 [00:05<00:24, 100.75it/s][22:03:09] Explicit valence for atom # 5 N, 4, is greater than permitted
 24%|██▎       | 717/3048 [00:07<00:25, 90.12it/s] [22:03:11] Explicit valence for atom # 3 N, 4, is greater than permitted
 25%|██▍       | 755/3048 [00:08<00:26, 85.40it/s][22:03:12] Explicit valence for atom # 17 N, 4, is greater than permitted
 29%|██▊       |

4164


  3%|▎         | 118/4164 [00:00<00:30, 133.97it/s][22:03:39] Explicit valence for atom # 8 N, 4, is greater than permitted
  9%|▊         | 355/4164 [00:02<00:33, 112.69it/s][22:03:41] Explicit valence for atom # 9 N, 4, is greater than permitted
  9%|▉         | 388/4164 [00:02<00:28, 133.66it/s][22:03:41] Explicit valence for atom # 10 N, 4, is greater than permitted
 34%|███▎      | 1400/4164 [00:09<00:14, 188.49it/s][22:03:48] Explicit valence for atom # 11 N, 4, is greater than permitted
 37%|███▋      | 1555/4164 [00:10<00:14, 178.14it/s][22:03:49] Explicit valence for atom # 14 N, 4, is greater than permitted
[22:03:49] Explicit valence for atom # 14 N, 4, is greater than permitted
 38%|███▊      | 1580/4164 [00:11<00:13, 193.26it/s][22:03:49] Explicit valence for atom # 6 C, 5, is greater than permitted
 63%|██████▎   | 2617/4164 [00:18<00:10, 151.93it/s][22:03:57] Explicit valence for atom # 11 N, 4, is greater than permitted
 73%|███████▎  | 3046/4164 [00:45<00:13, 84.09it/s

1911


  1%|          | 10/1911 [00:00<00:29, 65.00it/s][22:04:41] Explicit valence for atom # 19 N, 4, is greater than permitted
  2%|▏         | 46/1911 [00:00<00:14, 127.37it/s][22:04:41] Explicit valence for atom # 20 N, 4, is greater than permitted
  6%|▌         | 109/1911 [00:00<00:13, 136.58it/s][22:04:41] Explicit valence for atom # 1 N, 4, is greater than permitted
 12%|█▏        | 232/1911 [00:01<00:13, 120.85it/s][22:04:42] Explicit valence for atom # 32 N, 4, is greater than permitted
 14%|█▍        | 275/1911 [00:02<00:22, 71.91it/s] [22:04:43] Explicit valence for atom # 21 N, 4, is greater than permitted
[22:04:43] Explicit valence for atom # 11 N, 4, is greater than permitted
[22:04:43] Explicit valence for atom # 32 N, 4, is greater than permitted
 29%|██▉       | 550/1911 [00:05<00:19, 70.55it/s] [22:04:46] Explicit valence for atom # 8 N, 4, is greater than permitted
 30%|███       | 577/1911 [00:05<00:16, 78.77it/s][22:04:46] Explicit valence for atom # 6 N, 4, is greater