In [1]:
import os
import re
from collections import Counter

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm

from mofgraph2vec.utils.saving import save_embedding

In [2]:
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction
    """
    import re
    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    assert smi == ''.join(tokens)
    return ' '.join(tokens)

In [13]:
cif_path = ["../data/bbs", "../data/bbs_qmof/"]

ori_mofids = []
cif_names = []
for path in cif_path:
    cif_list = os.listdir(path)
    for c in tqdm(cif_list):
        with open(os.path.join(path, c, "python_mofid.txt")) as file:
            ori_mofids.append(file.readline().split()[0])
            cif_names.append(c.rstrip(".cif"))

100%|██████████| 3254/3254 [00:00<00:00, 3368.94it/s]
100%|██████████| 20375/20375 [00:07<00:00, 2652.88it/s]


In [14]:
tok_mofids = [smi_tokenizer(id).split() for id in ori_mofids]

In [15]:
len(tok_mofids), len(cif_names)

(23629, 23629)

In [19]:
documents = []
for i, tok in enumerate(tok_mofids):
    documents.append(TaggedDocument(words=tok, tags=[cif_names[i].rstrip(".cif")]))

In [26]:
model = Doc2Vec(
    documents, vector_size=500, window=50, min_count=0, workers=4, alpha=3e-2
)

model.build_vocab(documents)


In [27]:
model.train(documents, total_examples=model.corpus_count, epochs=150)

In [28]:
save_embedding(
    True,
    "./", 
    model, 
    documents, 
    500, 
    None,
    None
)

2023-06-09 11:58:41.777 | DEBUG    | mofgraph2vec.utils.saving:save_embedding:50 - Pass to DataFrame
2023-06-09 11:58:44.830 | DEBUG    | mofgraph2vec.utils.saving:save_embedding:52 - Sort values
2023-06-09 11:58:44.867 | DEBUG    | mofgraph2vec.utils.saving:save_embedding:54 - Save to csv
2023-06-09 11:58:52.844 | DEBUG    | mofgraph2vec.utils.saving:save_embedding:56 - Finished
