In [1]:
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1 Physical GPUs, 1 Logical GPU


## Dataset Preparation

In [6]:
class Dataset:
    def __init__(self):
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
    
    def create_dataset(self, path, num_examples):
        # path : path to spa-eng.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
        word_pairs = [[w for w in l.split('\t')]  for l in lines[:num_examples]]
        print(word_pairs[:5])

        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, lang):
        # lang = list of sentences in a language
        
        # print(len(lang), "example sentence: {}".format(lang[0]))
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        lang_tokenizer.fit_on_texts(lang)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        tensor = lang_tokenizer.texts_to_sequences(lang) 

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
        return tensor, lang_tokenizer

    def load_dataset(self, path, num_examples=None):
        # creating cleaned input, output pairs
        inp_lang, targ_lang = self.create_dataset(path, num_examples)

        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)
        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)

        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

    def call(self, num_examples, BUFFER_SIZE, BATCH_SIZE):
        file_path = "../Data/StackExchange/final_data/training/pairs_full.txt"
        
        input_tensor, target_tensor, self.inp_lang_tokenizer, self.targ_lang_tokenizer = self.load_dataset(file_path, num_examples)
        
        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return train_dataset, val_dataset, self.inp_lang_tokenizer, self.targ_lang_tokenizer

In [7]:
BUFFER_SIZE = 600000
BATCH_SIZE = 64
# Let's limit the #training examples for faster training
num_examples = 600000

dataset_creator = Dataset()
train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(num_examples, BUFFER_SIZE, BATCH_SIZE)

[['<start> what is the differ between intel and ppc what is the hardwar and softwar differ between intel and ppc mac <end>', '<start> hardware mac powerpc macos <end>'], ['<start> turn on back to my mac via a script or command line the vpn softwar i us for work ipsecurita requir me to turn off back to my mac to start it s connect so i frequent turn off back to my mac in order to us my vpn connect the program doe thi for me i forget to turn it back on howev and i d love to know if there wa someth i could run script command to turn it back <end>', '<start> macos mobileme terminal back-to-my-mac script <end>'], ['<start> why doesn t microsoft offic 2008 later support rtl languag i have microsoft offic 2008 on my macbook offic doesn t support rtl languag like farsi and arab and i know that offic 2010 for window also ha the same do you think the lack of support is becaus of busi competit or some other reason <end>', '<start> software microsoft-office <end>'], ['<start> repair disk start up 

In [8]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 7192]), TensorShape([64, 8]))

In [9]:
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 64
units = 256
steps_per_epoch = num_examples//BATCH_SIZE

In [10]:
print("max_length_text, max_length_tag, vocab_size_text, vocab_size_tag")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size

max_length_text, max_length_tag, vocab_size_text, vocab_size_tag


(7192, 8, 1262455, 42250)

## Seq2Seq Model

In [11]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                   dropout=0.2, 
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    


    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state = hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))] 

In [12]:
## Test Encoder Stack

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 7192, 256)
Encoder h vecotr shape: (batch size, units) (64, 256)
Encoder c vector shape: (batch size, units) (64, 256)


In [13]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type
    
        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    
        #Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
   


        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    
    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
        return outputs

In [14]:
# Test decoder stack

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 7, 42250)


In [15]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss  

In [16]:
checkpoint_dir = './tagger_checkpoints_223'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Train the model

In [17]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [18]:
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 1000 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/decoder/basic_decoder/decoder/while/gradients/decoder/basic_decoder/decoder/while/attention_wrapper/LuongAttention/MatMul_grad/MatMul_1' defined at (most recent call last):
    File "C:\Users\wangs\AppData\Local\Programs\Python\Python38\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\wangs\AppData\Local\Programs\Python\Python38\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
      self.io_loop.start()
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\wangs\AppData\Local\Programs\Python\Python38\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "C:\Users\wangs\AppData\Local\Programs\Python\Python38\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "C:\Users\wangs\AppData\Local\Programs\Python\Python38\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\ipykernel\zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\wangs\Documents\GitFiles\Respondent_Recommendation\Model\.env\seq2seq\lib\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\wangs\AppData\Local\Temp\ipykernel_30204\14857745.py", line 11, in <module>
      batch_loss = train_step(inp, targ, enc_hidden)
    File "C:\Users\wangs\AppData\Local\Temp\ipykernel_30204\442984770.py", line 22, in train_step
      gradients = tape.gradient(loss, variables)
Node: 'gradient_tape/decoder/basic_decoder/decoder/while/gradients/decoder/basic_decoder/decoder/while/attention_wrapper/LuongAttention/MatMul_grad/MatMul_1'
OOM when allocating tensor with shape[64,7192,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/decoder/basic_decoder/decoder/while/gradients/decoder/basic_decoder/decoder/while/attention_wrapper/LuongAttention/MatMul_grad/MatMul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_step_5007]

## Evaluation

In [17]:
def evaluate_sentence(sentence):
    #sentence = dataset_creator.preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ') if i in inp_lang.word_index.keys()]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
    end_token = targ_lang.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)
    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function
    decoder_embedding_matrix = decoder.embedding.variables[0]
  
    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()


def translate(sentence):
    result = evaluate_sentence(sentence)
    print(result)
    result = targ_lang.sequences_to_texts(result)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

def translate_batch(sentence):
    result = []
    try:
        result = evaluate_sentence(sentence)
        #print(result)
        result = targ_lang.sequences_to_texts(result)
        #print('Input: %s' % (sentence))
        #print('Predicted translation: {}'.format(result))
    except:
        result.append("Failed!!!!!!")
    
    return result

In [57]:
def f1_calc(pred_tag, true_tag):
    
    TP = 0
    FP = 0
    FN = 0
    precision_avg = 0
    recall_avg = 0
    precision = []
    recall = []
    
    for ptag, ttag in zip(pred_tag, true_tag):
        pred_set = set(ptag)
        true_set = set(ttag)
    
        TP = len(pred_set.intersection(true_set))
        FP = len(pred_set.difference(true_set))
        FN = len(true_set.difference(pred_set))
    
        precision_avg += TP / (TP + FP)
        recall_avg += TP / (TP + FN)
        precision.append(TP / (TP + FP))
        recall.append(TP / (TP + FN))
        
    precision_final = precision_avg / len(true_tag)
    recall_final = recall_avg / len(true_tag)
    
    F1 = 2 * (precision_final * recall_final) / (precision_final + recall_final)
    
    print("precision:",precision_final,"recall:",recall_final,"f1:",F1)
    return precision, recall

In [15]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2452020e1c0>

In [16]:
translate(u'rxj subject someon background doc subject href htt github com reactiv extens rxj blob master doc subject rel nofollow doc lanat background someon conc com someth')

[[ 109  438 4337    3]]
Input: rxj subject someon background doc subject href htt github com reactiv extens rxj blob master doc subject rel nofollow doc lanat background someon conc com someth
Predicted translation: ['webpack rxjs flutter-animation <end>']


In [18]:
translate(u'angular js directiv control code in case')

[[  34  443  126   34 3529    3]]
Input: angular js directiv control code in case
Predicted translation: ['angularjs gruntjs internet-explorer angularjs zone.js <end>']


path = "evaluation/posdata_preprocessed/"
#file_list = os.listdir(path)
file_list = [int(filename) for filename in os.listdir(path)]
file_list = sorted(file_list)

issue_preprocessed = []

for filename in file_list:
    with open(path+str(filename), 'r') as f:
        issue_preprocessed.append(f.readline())
    

tags_predicted = [translate_batch(u''+issue) for issue in issue_preprocessed]

In [19]:
import pandas as pd
df = pd.read_csv("evaluation/validation_stack.csv")


In [20]:
tags_predicted = [translate_batch(u''+issue) for issue in list(df["processed_new"].values)]

In [21]:
df["Tags_new"] = tags_predicted

In [22]:
#df.to_csv("evaluation/validation_stack_result.csv",index = None)

In [37]:
len(tags_predicted)

108489

In [38]:
pred_tag = [tags_predicted[i][0][:-5].strip().split(' ') for i in range(len(tags_predicted))]

In [39]:
pred_tag[:10]

[['iphone', 'email', 'exchange-activesync', 'xpc', 'javascript-debugger'],
 ['macbook-pro', 'time-machine', 'macos'],
 ['iphone', 'passwords', 'encryption', 'pc'],
 ['macos', 'software-recommendation', 'software-rec', 'client-relations'],
 ['macbook-pro', 'encryption', '3rd-party'],
 ['iphone', 'software-recommendation', 'applications', 'layer', 'display'],
 ['macbook-pro', 'video', 'television', 'video-adapter'],
 ['macos', 'cocoa', 'lock-screen', 'unauthorizedaccessexcepti'],
 ['macos', 'snow-leopard', 'crash', 'hang'],
 ['macbook-pro', 'power']]

In [43]:
true_tag = [item.strip().split(' ') for item in list(df["Tag_True"].values)]

In [44]:
true_tag[:5]

[['iphone', 'software-recommendation'],
 ['macbook-pro'],
 ['iphone', 'software-recommendation'],
 ['macos', 'software-recommendation', 'snow-leopard'],
 ['macbook-pro']]

In [45]:
len(true_tag)

108489

In [58]:
prec,recall = f1_calc(pred_tag,true_tag)

precision: 0.36040735312646655 recall: 0.3820356595292487 f1: 0.3709064763323706


In [59]:
prec[:10]

[0.2,
 0.3333333333333333,
 0.25,
 0.5,
 0.3333333333333333,
 0.6,
 0.25,
 0.0,
 0.25,
 0.5]

In [60]:
recall[:10]

[0.5, 1.0, 0.5, 0.6666666666666666, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0]

In [61]:
df["precision"] = prec
df["recall"] = recall

In [62]:
df.head()

Unnamed: 0,Input,Tag_True,processed_new,Tags_new,precision,recall
0,iPhone App for Displaying Email on a Locked Sc...,iphone software-recommendation,iphon app displai email screen exchang support...,[iphone email exchange-activesync xpc javascri...,0.2,0.5
1,How can I tell when it's a good time to buy a ...,macbook-pro,tell time bui macbook want date instanc date g...,[macbook-pro time-machine macos <end>],0.333333,1.0
2,"Password keeper for iPhone, Mac and Windows? <...",iphone software-recommendation,password keeper iphon mac window solut sync ac...,[iphone passwords encryption pc <end>],0.25,0.5
3,Good Newsgroup Client for OS X <p>I've just sw...,macos software-recommendation snow-leopard,newsgroup client x ve switch os x struggl find...,[macos software-recommendation software-rec cl...,0.5,0.666667
4,"is there a slipcase for 17"" MacBook Pros which...",macbook-pro,slipcas macbook pro open side carri mbp pannie...,[macbook-pro encryption 3rd-party <end>],0.333333,1.0


In [63]:
df.to_csv("evaluation/validation_stack_result.csv",index = None)

In [77]:
def beam_evaluate_sentence(sentence, beam_width=3):
  #sentence = dataset_creator.preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]
  result = ''

  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  dec_h = enc_h
  dec_c = enc_c

  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
  end_token = targ_lang.word_index['<end>']

  # From official documentation
  # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:
  # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).
  # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.
  # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.

  enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
  decoder.attention_mechanism.setup_memory(enc_out)
  print("beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] :", enc_out.shape)

  # set decoder_inital_state which is an AttentionWrapperState considering beam_width
  hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
  decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)
  decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)

  # Instantiate BeamSearchDecoder
  decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc)
  decoder_embedding_matrix = decoder.embedding.variables[0]

  # The BeamSearchDecoder object's call() function takes care of everything.
  outputs, final_state, sequence_lengths = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
  # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object. 
  # The final beam predictions are stored in outputs.predicted_id
  # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keep tracks of beam_scores and parent_ids while performing a beam decoding step
  # final_state = tfa.seq2seq.BeamSearchDecoderState object.
  # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated

  
  # outputs.predicted_id.shape = (inference_batch_size, time_step_outputs, beam_width)
  # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)
  # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)
  final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))
  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))
  
  return final_outputs.numpy(), beam_scores.numpy()

In [78]:
def beam_translate(sentence):
  result, beam_scores = beam_evaluate_sentence(sentence)
  print(result.shape, beam_scores.shape)
  for beam, score in zip(result, beam_scores):
    print(beam.shape, score.shape)
    output = targ_lang.sequences_to_texts(beam)
    output = [a[:a.index('<end>')] for a in output]
    beam_score = [a.sum() for a in score]
    print('Input: %s' % (sentence))
    for i in range(len(output)):
      print('{} Predicted translation: {}  {}'.format(i+1, output[i], beam_score[i]))

In [79]:
beam_translate(u'angular js directiv control code in case')

KeyError: 'directiv'