In [None]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.contrib.layers.python.layers import layers

from collections import Counter
from sklearn.manifold import TSNE
from utils import make_dir, read_or_create

## Read data

In [None]:
def make_codones(sseq):
    """Split a sequence into consecutive, non-overlapping chunks of three.

    Trailing symbols that do not fill a complete triple are discarded.

    Args:
        sseq: Sliceable sequence (e.g. a string).

    Returns:
        List of length-3 slices of ``sseq``, in their original order.
    """
    # Largest multiple of three that fits into the sequence.
    usable_length = len(sseq) - len(sseq) % 3

    triples = []
    for start in range(0, usable_length, 3):
        triples.append(sseq[start:start + 3])
    return triples

def seq_to3(seq):
    """Split ``seq`` into triples for each of the three reading frames.

    Args:
        seq: Sliceable sequence (e.g. a string).

    Returns:
        A list with three entries: ``make_codones`` applied to ``seq`` with
        its first 0, 1 and 2 symbols removed, in that order.
    """
    frames = []
    for offset in range(3):
        frames.append(make_codones(seq[offset:]))
    return frames

def create_all_codones(df):
    """Collect the codon lists of every sequence in the first column of ``df``.

    Each sequence contributes three lists (one per reading frame, see
    ``seq_to3``), appended in row order.

    Args:
        df: DataFrame whose first column holds the sequences.

    Returns:
        Flat list of codon lists, three per row of ``df``.
    """
    codones = []

    if df.shape[0] == 0:
        return codones

    # Select the first column positionally once. The previous per-row
    # ``df.iloc[i, :][0]`` built a Series for every row and relied on the
    # deprecated integer-as-position fallback of label-indexed Series.
    for sequence in df.iloc[:, 0]:
        codones.extend(seq_to3(sequence))

    return codones

In [None]:
# Load the tab-separated sequence table and preview its first rows.
seq_df = pd.read_table("data/family_classification_sequences.tab")
seq_df.head()

In [None]:
# Obtain the codon lists through the project helper `read_or_create`, giving it
# a pickle path and a producer that builds the lists from `seq_df`.
# NOTE(review): presumably the helper loads the pickle when it exists and
# otherwise calls the producer and stores the result -- confirm in utils.
all_codones = read_or_create(read_path="data/all_codones.pickle",
                             producer=lambda: create_all_codones(seq_df))

## Process data

In [None]:
def generate_sample(index_words_list, context_window_size):
    """Yield (center, target) training pairs according to the skip-gram model.

    For every position of every sequence a window radius is drawn uniformly
    from ``1..context_window_size``. Every element within that radius before
    the position, and then every element within it after the position, is
    paired with the center element.

    Args:
        index_words_list: Iterable of sliceable sequences of word indices.
        context_window_size: Largest window radius that may be drawn.

    Yields:
        ``(center, target)`` tuples.
    """
    for sequence in index_words_list:
        for position, center in enumerate(sequence):
            radius = random.randint(1, context_window_size)

            # Neighbours to the left; the window is clipped at the sequence start.
            left_edge = max(0, position - radius)
            for target in sequence[left_edge:position]:
                yield center, target

            # Neighbours to the right; slicing clips at the sequence end.
            right_edge = position + radius + 1
            for target in sequence[position + 1:right_edge]:
                yield center, target

def get_batch(iterator, batch_size):
    """Group a stream of (center, target) pairs into fixed-size Numpy batches.

    Args:
        iterator: Iterator (or any iterable) yielding ``(center, target)``
            integer pairs.
        batch_size: Number of pairs per batch.

    Yields:
        ``(center_batch, target_batch)`` where ``center_batch`` is an int32
        array of shape ``(batch_size,)`` and ``target_batch`` an int32 array
        of shape ``(batch_size, 1)``.

    The generator finishes when the input stream is exhausted. A trailing,
    incomplete batch is dropped so that every yielded batch has exactly
    ``batch_size`` rows.
    """
    pair_stream = iter(iterator)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        try:
            for index in range(batch_size):
                center_batch[index], target_batch[index] = next(pair_stream)
        except StopIteration:
            # Under PEP 479 a StopIteration escaping a generator body becomes
            # a RuntimeError, so exhaustion has to be handled explicitly.
            return
        yield center_batch, target_batch

def flatten(x):
    """Concatenate the items of every sub-iterable of ``x`` into one list."""
    flat = []
    for sublist in x:
        flat.extend(sublist)
    return flat

def cod_to_dict(cod, dictionary):
    """Translate every key in ``cod`` to its value in ``dictionary``.

    Args:
        cod: Iterable of keys.
        dictionary: Mapping from key to value.

    Returns:
        List of looked-up values, in the order of ``cod``.

    Raises:
        KeyError: If a key is not present in ``dictionary``.
    """
    indices = []
    for key in cod:
        indices.append(dictionary[key])
    return indices

def make_dictionary(all_codones):
    """Map every distinct codon to an index ordered by descending frequency.

    Args:
        all_codones: Iterable of codon lists.

    Returns:
        Dict ``{codon: index}`` where index 0 is the most common codon.
    """
    # Count list by list instead of first building one flattened copy of the
    # whole corpus; the resulting counts and their ordering are the same.
    counter = Counter()
    for codones in all_codones:
        counter.update(codones)

    return {cod: rank for rank, (cod, _count) in enumerate(counter.most_common())}

def process_data(all_codones, dictionary, batch_size, skip_window):
    """Chain index mapping, pair sampling and batching into one lazy stream.

    Args:
        all_codones: Iterable of codon lists.
        dictionary: Mapping from codon to integer index.
        batch_size: Number of (center, target) pairs per batch.
        skip_window: Largest context window radius used when sampling pairs.

    Returns:
        The batch generator produced by ``get_batch``.
    """
    indexed_sequences = (cod_to_dict(codones, dictionary) for codones in all_codones)
    pair_stream = generate_sample(indexed_sequences, skip_window)
    return get_batch(pair_stream, batch_size)

In [None]:
# Build the codon -> integer index vocabulary (index 0 is the most frequent codon).
dictionary = make_dictionary(all_codones)

In [None]:
# Batching hyper-parameters for the skip-gram training stream.
BATCH_SIZE = 128   # (center, target) pairs per batch
SKIP_WINDOW = 12   # the context window

batch_gen = process_data(all_codones, dictionary,
                         batch_size=BATCH_SIZE, skip_window=SKIP_WINDOW)

## SkipGramModel

In [None]:
from lazy import lazy

class SkipGramModel:
    """Build the graph for the word2vec skip-gram model.

    The graph pieces (placeholders, embedding, loss, optimizer, summary) are
    exposed as ``@lazy`` attributes.
    NOTE(review): presumably ``lazy`` evaluates each of them on first access
    and caches the result -- confirm against the ``lazy`` package.

    Call ``build()`` inside the target graph context before training; the
    ``saver`` attribute is only available afterwards.
    """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        """Store the hyper-parameters and create the global step counter.

        Args:
            vocab_size: Number of rows of the embedding matrix / NCE classes.
            embed_size: Dimension of the embedding vectors.
            batch_size: Fixed batch size of the input placeholders.
            num_sampled: Number of negative samples for the NCE loss.
            learning_rate: Step size of the gradient descent optimizer.
        """
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
        # The saver is created in build(). A tf.train.Saver only tracks the
        # variables that exist when it is constructed; creating it here, before
        # the lazy graph pieces are built, would checkpoint global_step alone.
        self.saver = None

    @lazy
    def placeholders(self):
        """Return the ``(center_words, target_words)`` int32 input placeholders."""
        with tf.name_scope("data"):
            center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name="center_words")
            target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name="target_words")
            return center_words, target_words

    @lazy
    def embedding(self):
        """Return the ``[vocab_size, embed_size]`` embedding matrix variable."""
        with tf.name_scope("embed"):
            return tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], 0, 2), name="embed_matrix")

    @lazy
    def loss(self):
        """Return the mean NCE loss over the batch."""
        center_words, target_words = self.placeholders

        with tf.name_scope("loss"):
            embed = tf.nn.embedding_lookup(self.embedding, center_words, name="embed")

            # Hidden layer on top of the looked-up embeddings; the activation
            # is left at the library default.
            layer = layers.fully_connected(inputs=embed,
                                           num_outputs=self.embed_size,
                                           weights_initializer=tf.truncated_normal_initializer(mean=0,
                                                                                               stddev=1.0 / (self.embed_size ** 0.5)),
                                           biases_initializer=tf.constant_initializer(value=0.1))

            # Output weights and biases used by the NCE loss.
            nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                         stddev=1.0 / (self.embed_size ** 0.5)),
                                     name="nce_weight")
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name="nce_bias")

            # Loss is the batch mean of the noise-contrastive estimation loss.
            return tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                 biases=nce_bias,
                                                 labels=target_words,
                                                 inputs=layer,
                                                 num_sampled=self.num_sampled,
                                                 num_classes=self.vocab_size), name="loss")

    @lazy
    def optimizer(self):
        """Return the training op: plain gradient descent that also advances ``global_step``."""
        return tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)

    @lazy
    def summary(self):
        """Return the merged summary op (scalar and histogram of the loss)."""
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram loss", self.loss)
            return tf.summary.merge_all()

    def build(self):
        """Materialise the whole graph, then create the checkpoint saver."""
        self.optimizer
        self.summary
        # Created last so that every model variable is tracked; guarded so a
        # repeated build() does not add a second set of save/restore ops.
        if self.saver is None:
            self.saver = tf.train.Saver()

## Train model

In [None]:
# Size the model from the vocabulary that was actually built (previously
# hard-coded as 9424). The embedding / NCE tables must have exactly one row per
# dictionary entry: fewer rows make codon ids fall out of range, and any
# mismatch breaks the later one-label-per-embedding-row t-SNE table.
VOCAB_SIZE = len(dictionary)
EMBED_SIZE = 100 # dimension of the word embedding vectors
NUM_SAMPLED = 5  # Number of negative examples to sample.
LEARNING_RATE = 0.9
NUM_TRAIN_STEPS = 200000
SKIP_STEP = 2000  # steps between loss reports / checkpoints

# Build the model in its own graph so re-running this cell starts clean.
graph = tf.Graph()
with graph.as_default():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build()

In [None]:
from utils import make_dir

def train_model(model, batch_gen, num_train_steps, learning_rate, skip_step):
    """Train ``model`` on batches from ``batch_gen`` and return the embedding matrix.

    Args:
        model: A built ``SkipGramModel`` (graph constructed, ``saver`` set).
        batch_gen: Iterator yielding ``(centers, targets)`` batches matching
            the model placeholders.
        num_train_steps: Number of optimisation steps to run in this call.
        learning_rate: Only used to name the summary log directory.
        skip_step: Interval (in steps) between loss reports and checkpoints.

    Returns:
        The value of ``model.embedding`` after training.

    If a checkpoint exists under ``checkpoints/`` it is restored through
    ``model.saver`` before training, and the step counter continues from the
    restored ``global_step``. A checkpoint written by a saver that tracked a
    different set of variables cannot be restored; delete the ``checkpoints``
    directory in that case.
    """
    checkpoint_dir = "checkpoints"
    make_dir(checkpoint_dir)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    # Run in the graph the model lives in rather than in a notebook global.
    model_graph = model.global_step.graph

    with tf.Session(graph=model_graph, config=session_config) as sess:
        sess.run(tf.global_variables_initializer())

        # latest_checkpoint returns the checkpoint *prefix* or None. The prefix
        # is not itself a file in the V2 checkpoint format, so testing it with
        # os.path.isfile would never find an existing checkpoint.
        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if latest_checkpoint:
            model.saver.restore(sess, latest_checkpoint)

        # Placeholders do not change between steps; fetch them once.
        center_words, target_words = model.placeholders

        total_loss = 0.0  # accumulates the loss over the last skip_step steps
        writer = tf.summary.FileWriter("improved_graph/lr" + str(learning_rate), sess.graph)
        try:
            initial_step = sess.run(model.global_step)
            for index in range(initial_step, initial_step + num_train_steps):
                centers, targets = next(batch_gen)

                feed_dict = {center_words: centers, target_words: targets}
                loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary],
                                                  feed_dict=feed_dict)

                writer.add_summary(summary, global_step=index)

                total_loss += loss_batch
                if (index + 1) % skip_step == 0:
                    print("Average loss at step {:>5}: {:5.1f}".format(index, total_loss / skip_step))
                    total_loss = 0.0
                    model.saver.save(sess, "checkpoints/skip-gram", index)
        finally:
            # Flush pending summaries and release the event file even when
            # training is interrupted or the batch stream ends early.
            writer.close()

        final_embed_matrix = sess.run(model.embedding)
        return final_embed_matrix

In [None]:
# Train for NUM_TRAIN_STEPS steps and keep the embedding matrix returned by train_model.
final_embed_matrix = train_model(model, batch_gen, NUM_TRAIN_STEPS, LEARNING_RATE, SKIP_STEP)

## Results

### Embedding

In [None]:
# Project the learned embedding matrix to two dimensions with t-SNE,
# using a fixed random_state so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=42)
X_embeded = tsne.fit_transform(final_embed_matrix)

In [None]:
# Codons ordered by their vocabulary index, so entry i labels row i of the
# projected embedding.
unique_codones = sorted(dictionary, key=dictionary.get)

tsne_df = pd.DataFrame(X_embeded, columns=["x0", "x1"])
tsne_df["codone"] = unique_codones
tsne_df.head()

In [None]:
# Load the acid property table from CSV.
filename = "data/acid_properties.csv"
props = pd.read_csv(filename)

In [None]:
def acid_dict(some_c, props):
    """Average the property rows of every letter in ``some_c``.

    For each letter the rows of ``props`` whose ``acid`` value equals that
    letter are selected (all columns except the first), the selections are
    stacked, and the column-wise mean is taken. A repeated letter therefore
    contributes once per occurrence.

    Args:
        some_c: Iterable of letters (e.g. a codon string).
        props: DataFrame with an ``acid`` column.
            NOTE(review): assumes ``acid`` is the first column, since the
            first column is dropped positionally -- confirm against the CSV.

    Returns:
        Dict of column name -> mean value, plus ``"acid"`` -> ``some_c``.
    """
    per_letter = []
    for letter in some_c:
        matching_rows = props[props.acid == letter]
        per_letter.append(matching_rows.iloc[:, 1:])

    averaged = pd.concat(per_letter).mean()

    result = dict(averaged)
    result["acid"] = some_c
    return result

In [None]:
save_path = "data/all_acid_dicts.pickle"


def producer():
    """Build one averaged-property dict per codon listed in ``tsne_df``."""
    return [acid_dict(some_c, props) for some_c in tsne_df.codone]


all_acid_dicts = read_or_create(save_path, producer)

In [None]:
# Turn the list of per-codon property dicts into a DataFrame and preview it.
all_acid_df = pd.DataFrame(all_acid_dicts)
all_acid_df.head()

In [None]:
# Attach the t-SNE coordinates to each property row by matching its "acid"
# value against the "codone" values of tsne_df.
final_df = all_acid_df.join(tsne_df.set_index("codone"), on="acid")
final_df.head()

### Visualization

In [None]:
def plot_embedding_properties(final_df):
    """Scatter the 2-D embedding four times, each coloured by one property.

    Args:
        final_df: DataFrame with columns ``x0``, ``x1`` and the four property
            columns ``hydrophobicity``, ``mass``, ``number_of_atoms``, ``volume``.
    """
    property_names = ["hydrophobicity", "mass", "number_of_atoms", "volume"]

    fig = plt.figure(figsize=(25, 20))
    for panel, prop in enumerate(property_names, start=1):
        # One panel of the 2x2 grid per property.
        ax = fig.add_subplot(2, 2, panel)
        ax.set_title(prop, fontsize=25)
        ax.scatter(final_df.x0, final_df.x1, c=final_df[prop], s=10)
    plt.show()

In [None]:
# Render the four property-coloured views of the embedding.
plot_embedding_properties(final_df)