In [1]:
# Prepare data root (Output folder)
import os

# Put the parent directory first on sys.path so the local library can be imported.
import sys
sys.path.insert(0, "../")

# Everything this notebook produces lives under the user's home directory;
# the generated data files go into the "data" sub-folder of the experiment root.
expr_root = os.path.expanduser("~/expr/unsup-seq2seq/")
data_root = os.path.join(expr_root, "data")

if os.path.exists(data_root):
    pass  # Output folder is already there (e.g. from an earlier run).
else:
    # First run: create the folder together with any missing parents.
    os.makedirs(data_root)

In [2]:
# Train/test data sources.
#
# Absolute locations of the raw ".smi" input files. Both are read through
# smi_data_iter() below: pm2.smi is used as the training set and logp.smi as
# the test set.
# NOTE(review): these are hard-coded, machine-specific paths; the notebook
# only runs where this mount is available.

train_data_path = "/smile/nfs/projects/nih_drug/data/pm2/pm2.smi"
test_data_path = "/smile/nfs/projects/nih_drug/data/logp/logp.smi"

def smi_data_iter(smi_path):
    """Iterate over the SMILES strings stored in a ``.smi`` text file.

    Every non-blank line contributes its first whitespace-delimited field;
    any further columns on that line are ignored. Lines that are empty or
    contain only whitespace are skipped.

    Args:
        smi_path: Path of the text file to read.

    Yields:
        The leading field of each non-blank line, in file order.
    """
    with open(smi_path) as smi_file:
        for raw_line in smi_file:
            # split() without a separator ignores surrounding whitespace and
            # gives an empty list for blank lines, so one call covers both
            # the blank-line check and the field extraction.
            fields = raw_line.split()
            if fields:
                yield fields[0]

In [3]:
# Write all smiles to files.

def build_data_tmp(data_iter, data_path):
    """Dump every item of ``data_iter`` to ``data_path``, one item per line.

    The target file is created, or truncated if it already exists. Per the
    original note, this on-disk copy exists because the TensorFlow-side
    helpers used later work on files rather than in-memory iterables.

    Args:
        data_iter: Iterable of items (SMILES strings here) to write.
        data_path: Destination file path.
    """
    with open(data_path, "w+") as out_file:
        # Each item is formatted with "%s" and terminated by a newline.
        out_file.writelines("%s\n" % entry for entry in data_iter)
            
# Plain one-SMILES-per-line copies of the raw inputs, kept in the data folder.
train_smile_path = os.path.join(data_root, "pm2.smiles")
test_smile_path = os.path.join(data_root, "logp.smiles")

# Extract the SMILES column of each raw file into its copy: train first, then test.
for _raw_path, _smile_path in (
    (train_data_path, train_smile_path),
    (test_data_path, test_smile_path),
):
    build_data_tmp(smi_data_iter(_raw_path), _smile_path)

In [5]:
# Build vocabulary from train_data


from unsupervised.utils import true_smile_tokenizer, get_vocabulary

# The vocabulary file is kept next to the other generated data files.
vocab_path = os.path.join(data_root, "pm2.vocab")

# Build the vocabulary from the training SMILES only, tokenised with
# true_smile_tokenizer. The call is deliberately the last expression of the
# cell so the notebook displays its return value; the recorded output is a
# pair (token -> id dict, list of tokens ordered by id).
# NOTE(review): presumably this also writes the vocabulary to vocab_path,
# since that path is handed to data_to_token_ids in a later cell; confirm in
# unsupervised.utils.
get_vocabulary(train_smile_path, vocab_path, tokenizer=true_smile_tokenizer)

({'#': 27,
  '%': 37,
  '(': 7,
  ')': 6,
  '+': 22,
  '-': 24,
  '.': 30,
  '/': 25,
  '0': 39,
  '1': 10,
  '2': 11,
  '3': 12,
  '4': 14,
  '5': 21,
  '6': 28,
  '7': 32,
  '8': 34,
  '9': 36,
  '=': 5,
  '@': 20,
  'Br': 26,
  'C': 4,
  'Cl': 19,
  'F': 18,
  'H': 23,
  'I': 35,
  'N': 9,
  'O': 8,
  'P': 31,
  'S': 13,
  '[': 15,
  '\\': 17,
  ']': 16,
  '_EOS': 2,
  '_GO': 1,
  '_PAD': 0,
  '_UNK': 3,
  'c': 29,
  'n': 33,
  'o': 40,
  's': 38},
 ['_PAD',
  '_GO',
  '_EOS',
  '_UNK',
  'C',
  '=',
  ')',
  '(',
  'O',
  'N',
  '1',
  '2',
  '3',
  'S',
  '4',
  '[',
  ']',
  '\\',
  'F',
  'Cl',
  '@',
  '5',
  '+',
  'H',
  '-',
  '/',
  'Br',
  '#',
  '6',
  'c',
  '.',
  'P',
  '7',
  'n',
  '8',
  'I',
  '9',
  '%',
  's',
  '0',
  'o'])

In [6]:
from unsupervised.utils import true_smile_tokenizer, data_to_token_ids

# Token-id versions of the SMILES files, written next to them in the data folder.
train_token_path = os.path.join(data_root, "pm2.tokens")
test_token_path = os.path.join(data_root, "logp.tokens")

# Both splits are converted with the vocabulary built from the training set:
# train first, then test.
for _smile_path, _token_path in (
    (train_smile_path, train_token_path),
    (test_smile_path, test_token_path),
):
    data_to_token_ids(_smile_path, _token_path, vocab_path, true_smile_tokenizer)

Tokenizing data in /home/zhengxu/expr/unsup-seq2seq/data/pm2.smiles
  tokenizing line 100000
  tokenizing line 200000
  tokenizing line 300000
Tokenizing data in /home/zhengxu/expr/unsup-seq2seq/data/logp.smiles
