### Doc2Vec Model
---------------------------------------
From this data set we will fit a Doc2Vec model to obtain document vectors.
While the model trains, we monitor progress by printing, for a few validation words, their nearest neighbours by cosine similarity of the learned word embeddings.

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import random
import string
import collections

from time import time
from tensorflow.python.framework import ops
# Reset TensorFlow's global default graph before any ops are created in this notebook
ops.reset_default_graph()

  from ._conv import register_converters as _register_converters


In [2]:
# Show the installed TensorFlow version (the bare expression is displayed as the cell output)
tf.__version__

'1.5.0'

In [3]:
# Cleanse text
def cleanse_text(texts):
    """Normalise raw strings before vocabulary building.

    Every string is lower-cased, stripped of each character listed in
    ``string.punctuation`` and has its whitespace collapsed to single spaces
    (leading/trailing whitespace is dropped). Digits are kept on purpose
    because most product versions need the number.

    Args:
        texts: iterable of strings.

    Returns:
        list of cleaned strings, in input order.
    """
    # Translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for raw in texts:
        without_punct = raw.lower().translate(strip_punctuation)
        # split() without arguments drops leading/trailing whitespace and
        # collapses internal runs, so re-joining normalises the spacing
        cleaned.append(' '.join(without_punct.split()))

    return cleaned

In [4]:
# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    """Map the most frequent words of a corpus to consecutive integer ids.

    Index 0 is reserved for the unknown-word token ``'RARE'``; the remaining
    ``vocabulary_size - 1`` slots go to the most frequent words, numbered in
    order of decreasing frequency.

    Args:
        sentences: iterable of whitespace-separated strings.
        vocabulary_size: maximum number of entries in the returned dict,
            including the ``'RARE'`` slot.

    Returns:
        dict mapping word -> integer id, with ``'RARE'`` always mapped to 0.
    """
    unknown_token = 'RARE'

    # Count every word except a literal 'RARE' token. Counting it like a
    # normal word would later re-assign word_dict['RARE'] to a non-zero id,
    # leaving index 0 (the id used for out-of-vocabulary words) without an
    # entry. Such tokens simply share the reserved unknown slot.
    word_counts = collections.Counter(
        word
        for sentence in sentences
        for word in sentence.split()
        if word != unknown_token
    )

    # Reserved slot first, then the N-1 most frequent words; each new word
    # takes the current dictionary length as its id.
    word_dict = {unknown_token: 0}
    for word, _ in word_counts.most_common(vocabulary_size - 1):
        word_dict[word] = len(word_dict)

    return word_dict

In [5]:
# Create a TensorFlow session; `sess` is used by later cells to initialise
# variables, run training steps and save the checkpoint
sess = tf.Session()

In [6]:
# ---- Model / training parameters ----
batch_size = 500              # training examples per step
vocabulary_size = 10000       # number of rows in the word-embedding matrix
generations = 500000          # number of training iterations
model_learning_rate = 0.001   # learning rate handed to the optimizer

# Embedding widths; the word and document vectors are concatenated later
embedding_size = 50           # Word embedding size
doc_embedding_size = 50       # Document embedding size
concatenated_size = embedding_size + doc_embedding_size

window_size = 3                 # Words to consider to the left
num_sampled = batch_size // 2   # Number of negative examples to sample

# ---- Reporting / checkpoint cadence (in training iterations) ----
print_loss_every = 1000
print_valid_every = 10000
save_embeddings_every = 10000

In [7]:
from pandas import read_csv

# Location of the training CSV. The default is the original author's local
# file; set the ICS_TRAIN_CSV environment variable to run the notebook on
# another machine without editing this cell.
train_csv_path = os.environ.get(
    "ICS_TRAIN_CSV",
    "/Users/i337036/Documents/Data/ICS/Train_ICS_65K_Filt.csv",
)
df = read_csv(train_csv_path, encoding='utf-8')

# Bare expression: shown as the cell output
"Read %d lines from file" % len(df.index)

'Read 65813 lines from file'

In [8]:
# Pull the raw queries out of the `Query` column and normalise them with
# cleanse_text (lower-case, punctuation removed, whitespace collapsed)
texts = cleanse_text(df["Query"].tolist())

In [9]:
# Keep only texts with MORE than `window_size` words (i.e. at least
# window_size + 1): every training example needs `window_size` context words
# plus one target word
texts = [x for x in texts if len(x.split()) > window_size]    

In [10]:
# Build the word -> index vocabulary and its index -> word inverse
print('Creating Dictionary')
word_dictionary = build_dictionary(texts, vocabulary_size)
word_dictionary_rev = {index: word for word, index in word_dictionary.items()}

Creating Dictionary


In [11]:
# Convert text data into lists of integers from dictionary
def sent_text_to_sent_int(sentences, word_dict):
    """Encode whitespace-tokenised sentences as lists of vocabulary indices.

    Args:
        sentences: iterable of strings.
        word_dict: mapping word -> integer index.

    Returns:
        list with one list of ints per sentence, in input order. Words that
        are missing from ``word_dict`` are encoded as 0, the rare-word index.
    """
    rare_index = 0
    return [
        [word_dict.get(token, rare_index) for token in sentence.split()]
        for sentence in sentences
    ]

In [12]:
# Encode every cleaned text as a list of vocabulary indices
# (words outside the dictionary become 0)
text_data = sent_text_to_sent_int(texts, word_dictionary)

In [13]:
# Words whose nearest neighbours are printed during training
valid_words = ['problem', 'time', 'unix', 'issue', 'database', 'file']
# Vocabulary index of each validation word
# (a KeyError here means the word is not in word_dictionary)
valid_examples = [word_dictionary[word] for word in valid_words]

In [14]:
print('Creating Model')
# Define Embeddings:
#   embeddings     - one row per vocabulary word
#   doc_embeddings - one row per document in `texts`
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size],
                                           -1.0, 1.0))
doc_embeddings = tf.Variable(tf.random_uniform([len(texts), doc_embedding_size],
                                               -1.0, 1.0))

# NCE loss parameters (one weight row / bias per vocabulary word, sized for
# the concatenated word+document vector)
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, concatenated_size],
                                               stddev=1.0 / np.sqrt(concatenated_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Create data/target placeholders
# Each x_inputs row holds `window_size` word indices followed by one doc index
x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])
y_target = tf.placeholder(tf.int32, shape=[None, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Lookup the word embeddings and add together the embeddings in the window.
# The sum is taken over the looked-up window instead of accumulating into
# tf.zeros([batch_size, ...]), so the graph follows the placeholders' dynamic
# batch dimension rather than being pinned to exactly `batch_size` rows.
window_embed = tf.nn.embedding_lookup(embeddings, x_inputs[:, :window_size])
embed = tf.reduce_sum(window_embed, axis=1)

# The last column is the document index; a slice size of -1 keeps every row
# of whatever batch is fed.
doc_indices = tf.slice(x_inputs, [0, window_size], [-1, 1])
doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)

# concatenate embeddings
# Squeeze only axis 1 (the singleton left by the [batch, 1] doc index) so a
# batch of one keeps its batch dimension.
final_embed = tf.concat([embed, tf.squeeze(doc_embed, axis=[1])], 1)

Creating Model


In [15]:
# Noise-contrastive estimation loss per example, then averaged into the
# scalar training loss
nce_per_example = tf.nn.nce_loss(weights=nce_weights,
                                 biases=nce_biases,
                                 labels=y_target,
                                 inputs=final_embed,
                                 num_sampled=num_sampled,
                                 num_classes=vocabulary_size)
loss = tf.reduce_mean(nce_per_example)

In [16]:
# Create optimizer
# Gradient descent with the fixed learning rate `model_learning_rate`;
# running `train_step` applies one update that minimises `loss`
optimizer = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)
train_step = optimizer.minimize(loss)

In [18]:
# Cosine similarity between words
# L2 norm of every embedding row; keepdims leaves a trailing axis of size 1
# so the division below broadcasts row-wise
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / norm
# Unit-length vectors of the validation words only
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# Dot products of unit vectors = cosine similarities:
# one row per validation word, one column per vocabulary word
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Create model saving operation
# Only the two embedding matrices are checkpointed, under these names;
# the NCE weights/biases are not part of the saver
saver = tf.train.Saver({"embeddings": embeddings, "doc_embeddings": doc_embeddings})

In [19]:
# Add variable initializer and run it in the session so the graph's
# variables receive their initial values before training
init = tf.global_variables_initializer()
sess.run(init)

In [20]:
# Generate doc2vec training data randomly: (N words behind + doc index) -> target word
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    """Sample a random batch of (left context + document index, target) pairs.

    Sentences are drawn uniformly at random with replacement. For a drawn
    sentence, every run of ``window_size`` consecutive words that is followed
    by another word becomes one example: the run is the context, the following
    word is the target, and the sentence's position in ``sentences`` is
    appended to the context as the document index. Sentences are drawn until
    ``batch_size`` examples are collected; the surplus is trimmed.

    Args:
        sentences: sequence of sentences, each a list of integer word ids.
        batch_size: number of examples to return.
        window_size: number of left-context words per example.
        method: unused; accepted so existing callers that pass it keep working.

    Returns:
        Tuple ``(batch_data, label_data)`` of numpy arrays. ``batch_data`` has
        one row per example holding ``window_size`` word ids followed by the
        document index; ``label_data`` is a single column of target word ids.

    Raises:
        ValueError: if no sentence has more than ``window_size`` words, so no
            example can ever be produced.
    """
    batch_data = []
    label_data = []
    usable_sentence_confirmed = False

    while len(batch_data) < batch_size:
        # select random sentence to start; index the size-1 draw explicitly
        # because int() on a non-scalar array is deprecated in NumPy >= 1.25
        rand_sentence_ix = int(np.random.choice(len(sentences), size=1)[0])
        rand_sentence = sentences[rand_sentence_ix]

        # A sentence needs window_size context words plus one target word.
        # Skip shorter ones instead of failing on an empty unpack.
        if len(rand_sentence) <= window_size:
            if not usable_sentence_confirmed:
                # Scan once, and only when needed, so a corpus with no usable
                # sentence raises instead of looping forever.
                if not any(len(s) > window_size for s in sentences):
                    raise ValueError(
                        'No sentence has more than window_size={} words'.format(window_size))
                usable_sentence_confirmed = True
            continue

        # For doc2vec we keep LHS window only to predict target word.
        # The document index goes LAST in each row; the model reads it from
        # that position.
        for start in range(len(rand_sentence) - window_size):
            context = rand_sentence[start:start + window_size]
            batch_data.append(context + [rand_sentence_ix])
            label_data.append(rand_sentence[start + window_size])

    # Trim batch and label at the end (the last sentence may overshoot)
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]

    # Convert to numpy arrays; labels become a column vector
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))

    return batch_data, label_data

In [21]:
# Run the doc2vec training loop.
print('Starting Training')
loss_vec = []      # sampled loss values
loss_x_vec = []    # iteration number of each sampled loss
top_k = 5  # number of nearest neighbors
for i in range(generations):
    step = i + 1   # 1-based iteration number used for all reporting

    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size,
                                                     window_size, method='doc2vec')
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

    # One optimisation step on this batch
    sess.run(train_step, feed_dict=feed_dict)

    # Periodically record and print the loss on the batch just trained on
    if step % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(step)
        print('Loss at step {} : {}'.format(step, loss_val))

    # Validation: for each validation word print its top_k related words
    if step % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j, valid_ix in enumerate(valid_examples[:len(valid_words)]):
            valid_word = word_dictionary_rev[valid_ix]
            # Sort by descending similarity and drop rank 0
            # (presumably the word itself, whose cosine similarity is 1)
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            neighbours = ''.join(' {},'.format(word_dictionary_rev[nearest[k]])
                                 for k in range(top_k))
            print("Nearest to {}:".format(valid_word) + neighbours)

    # Save dictionary + embeddings
    if step % save_embeddings_every == 0:
        # Vocabulary dictionary, pickled next to the notebook
        with open('./ics_vocab.pkl', 'wb') as vocab_file:
            pickle.dump(word_dictionary, vocab_file)

        # Embedding checkpoint in the current working directory
        model_checkpoint_path = os.path.join(os.getcwd(),
                                             'doc2vec_ics_embeddings.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))

Starting Training
Loss at step 1000 : 602.367431640625
Loss at step 2000 : 574.9002075195312
Loss at step 3000 : 498.781494140625
Loss at step 4000 : 423.64117431640625
Loss at step 5000 : 379.9668884277344
Loss at step 6000 : 332.863037109375
Loss at step 7000 : 299.1253662109375
Loss at step 8000 : 172.3739013671875
Loss at step 9000 : 215.79055786132812
Loss at step 10000 : 182.01170349121094
Nearest to problem: change, scheduler, unixodbc, nuc, 24994errrte,
Nearest to time: system, error, oracle, sap, i,
Nearest to unix: hence, cldb6calendaraction, messagetypex, 0x7ef2993edb30, tid,
Nearest to issue: system, RARE, sap, server, upgrade,
Nearest to database: we, RARE, error, system, hana,
Nearest to file: please, RARE, i, start, system,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 11000 : 186.07894897460938
Loss at step 12000 : 179.88137817382812
Loss at step 13000 : 1

Loss at step 96000 : 22.755002975463867
Loss at step 97000 : 26.230710983276367
Loss at step 98000 : 13.560073852539062
Loss at step 99000 : 22.027772903442383
Loss at step 100000 : 19.120948791503906
Nearest to problem: change, failed, oracle, note, system,
Nearest to time: oracle, user, system, i, find,
Nearest to unix: hence, cldb6calendaraction, 0x7ef2993edb30, tid, messagetypex,
Nearest to issue: system, server, information, database, data,
Nearest to database: we, server, data, hana, error,
Nearest to file: start, i, please, oracle, table,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 101000 : 18.033479690551758
Loss at step 102000 : 18.002376556396484
Loss at step 103000 : 19.78308868408203
Loss at step 104000 : 18.29255485534668
Loss at step 105000 : 17.760053634643555
Loss at step 106000 : 16.59227180480957
Loss at step 107000 : 16.932479858398438
Loss at step 10

Loss at step 185000 : 13.175252914428711
Loss at step 186000 : 13.629258155822754
Loss at step 187000 : 19.806350708007812
Loss at step 188000 : 29.10578155517578
Loss at step 189000 : 14.32315444946289
Loss at step 190000 : 12.766911506652832
Nearest to problem: change, failed, performance, oracle, systems,
Nearest to time: user, oracle, find, connection, phase,
Nearest to unix: hence, cldb6calendaraction, tid, 0x7ef2993edb30, text,
Nearest to issue: information, server, data, database, system,
Nearest to database: server, data, we, hana, need,
Nearest to file: start, oracle, i, db2, memory,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 191000 : 12.77200984954834
Loss at step 192000 : 11.619488716125488
Loss at step 193000 : 14.649134635925293
Loss at step 194000 : 20.206497192382812
Loss at step 195000 : 14.413437843322754
Loss at step 196000 : 11.237857818603516
Loss a

Loss at step 274000 : 8.51225757598877
Loss at step 275000 : 9.966686248779297
Loss at step 276000 : 11.615030288696289
Loss at step 277000 : 10.180996894836426
Loss at step 278000 : 10.462092399597168
Loss at step 279000 : 8.66679859161377
Loss at step 280000 : 11.11754035949707
Nearest to problem: change, performance, failed, long, systems,
Nearest to time: user, oracle, find, connection, cannot,
Nearest to unix: hence, cldb6calendaraction, tid, 0x7ef2993edb30, text,
Nearest to issue: information, server, data, node, database,
Nearest to database: data, server, need, hana, upgrade,
Nearest to file: start, db2, memory, systems, available,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 281000 : 10.913414001464844
Loss at step 282000 : 11.128129005432129
Loss at step 283000 : 9.447534561157227
Loss at step 284000 : 9.165542602539062
Loss at step 285000 : 8.650343894958496
L

Loss at step 363000 : 9.721260070800781
Loss at step 364000 : 11.393345832824707
Loss at step 365000 : 10.795470237731934
Loss at step 366000 : 9.941866874694824
Loss at step 367000 : 8.96379280090332
Loss at step 368000 : 7.543361186981201
Loss at step 369000 : 12.855720520019531
Loss at step 370000 : 10.84609603881836
Nearest to problem: change, performance, long, failed, systems,
Nearest to time: user, find, oracle, connection, available,
Nearest to unix: hence, tid, cldb6calendaraction, text, 0x7ef2993edb30,
Nearest to issue: information, server, node, data, get,
Nearest to database: data, server, need, hana, upgrade,
Nearest to file: start, db2, memory, systems, available,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 371000 : 11.803038597106934
Loss at step 372000 : 11.885308265686035
Loss at step 373000 : 9.29813289642334
Loss at step 374000 : 13.583810806274414
Lo

Loss at step 453000 : 8.303462982177734
Loss at step 454000 : 8.13504409790039
Loss at step 455000 : 9.12993335723877
Loss at step 456000 : 8.547344207763672
Loss at step 457000 : 11.06844425201416
Loss at step 458000 : 8.91787338256836
Loss at step 459000 : 10.96448802947998
Loss at step 460000 : 9.514132499694824
Nearest to problem: change, performance, long, cache, failed,
Nearest to time: user, find, connection, oracle, available,
Nearest to unix: hence, tid, cldb6calendaraction, text, 0x7ef2993edb30,
Nearest to issue: information, node, server, get, data,
Nearest to database: data, server, need, hana, end,
Nearest to file: start, db2, memory, systems, required,
Model saved in file: /Users/i337036/Documents/OneDrive - SAP SE/Code/Solution-Recommendation-Test-Scripts/D2V_TensorFlow/doc2vec_ics_embeddings.ckpt
Loss at step 461000 : 8.14846420288086
Loss at step 462000 : 8.872955322265625
Loss at step 463000 : 9.379284858703613
Loss at step 464000 : 8.874483108520508
Loss at step 4650