In [None]:
# !pip install "tensorflow-text"
# !pip install einops

In [None]:
#EXP
# import pandas as pd
# convert csv to txt
def csv_to_txt():
  import pandas as pd
  path = '/content/captions_csv.csv'
  data = pd.read_csv(path)
  data.dropna()
  print(data.head())
  column_contents = data['Caption'].astype(str).values.tolist()
  column_contents
  # Write the column contents to a text file
  with open('captions.txt', 'w', encoding='utf-8') as f:
    for item in column_contents:
        item = item.encode('utf-8').decode('unicode_escape')
        f.write("%s\n" % item)
# csv_to_txt()

In [None]:
import numpy as np

import typing
from typing import Any, Tuple

import einops
# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker

import tensorflow as tf
import tensorflow_text as tf_text
import pathlib
import pickle

In [None]:
path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', extract=True)


In [None]:
path_to_file = pathlib.Path('./bookstxt.txt')
# path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'
DATA_LIMIT = 5
def load_data(path):
  text = path.read_text(encoding='utf-8')
  lines = text.splitlines()
  print(type(lines))
  pairs = [line.split('\t') for line in lines]
  pairs = pairs[:DATA_LIMIT]
  # text = np.array(pairs)

  text = np.array([target for target in pairs])

  return text

data = load_data(path_to_file)
print(data.shape)

data = data[:, 0]
print(data.shape)
print(data)

In [None]:
len(data)

In [None]:
BUFFER_SIZE = len(data)
BATCH_SIZE = 16
is_train = np.random.uniform(size=(len(data),)) < 0.8
train_raw = tf.data.Dataset.from_tensor_slices(data[is_train]).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
val_raw = tf.data.Dataset.from_tensor_slices(data[~is_train] ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
len(train_raw)

In [None]:
for context_string in train_raw.take(1):
  print(context_string[:])
  break

In [None]:
example_text = tf.constant("You have been invited to think of the two systems as agents within the mind.")
print(example_text.numpy())
print(tf_text.normalize_utf8(example_text, 'NFKD').numpy())

In [None]:
@tf.keras.utils.register_keras_serializable(package='Custom', name=None)
def tf_lower_and_split_punct(text):
  # Split accented characters
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # add space arround punctuation
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # remove non-desplayable characters
  text = tf.strings.regex_replace(text, '[^\x00-\x7F]+', '')
  #Strip white space
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text


In [None]:
print(example_text.numpy().decode())
print(tf_lower_and_split_punct(example_text).numpy().decode())

In [None]:
#Text Vectorization for the context text data (spainish)
max_vocab_size = 30000
context_text_processor = tf.keras.layers.TextVectorization(standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size, ragged=True)

In [None]:
context_text_processor.adapt(train_raw)
print(context_text_processor.get_vocabulary()[:20])


In [None]:
pk.dump(context_text_processor.get_config(), open('text_processor_config.pkl', 'wb'))

In [None]:
# now layers can convert batch of strings to batch of token ids
example_tokens = context_text_processor(context_string)
print(example_tokens[:3, :])

In [None]:
#The get_vocabulary method can be used to convert token IDs back to text:
context_vocab = np.array(context_text_processor.get_vocabulary())
tokens = context_vocab[example_tokens[0].numpy()]
tokens = ' '.join(tokens)
print(tokens)

In [None]:

def process_text(context):
  target = context_text_processor(context)
  context = context_text_processor(context).to_tensor()
  # print(type(context))
  # targ_in = target[:, :-1].to_tensor() #take everthing in axiz = 0 and take everything except the last in axis = 2
  # targ_in = target[:, 1:].to_tensor()
  # targ_out = target[:, :-1].to_tensor()
  target = target[:, 1:]
  targ_in = target[:, :-1].to_tensor() #take everthing in axiz = 0 and take everything except the last in axis = 2
  targ_out = target[:, 1:].to_tensor()

  

  print('process_text')

  return (context, targ_in), targ_out

train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

In [None]:
len(train_ds)


In [None]:
for (example_context_tokens, target_in_tokens), target_out_tokens in train_ds.take(2):

  print(example_context_tokens[0, :10].numpy(), example_context_tokens[0, :].numpy().shape)
  print(target_in_tokens[0, :10].numpy(), target_in_tokens[0, :].numpy().shape)
  print(target_out_tokens[0, :10].numpy(), target_out_tokens[0, :].numpy().shape)
  print()

In [None]:
# target_in_tokens = target_in_tokens + 1
# print(target_in_tokens[0, :10].numpy(),'\n', target_out_tokens[0, :10].numpy())

In [None]:
'''
The encoder:
  1. Takes a list of token IDs (from context_text_processor).
  2. Looks up an embedding vector for each token (Using a layers.Embedding).
  3. Processes the embeddings into a new sequence (Using a bidirectional layers.GRU).
  4. Returns the processed sequence. This will be passed to the attention head.
'''


class Encoder(tf.keras.layers.Layer):
  def __init__(self, text_processor, units):
    super(Encoder, self).__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.units = units

    #The embedding layer converts tokens into vectors
    self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.units, mask_zero=True)

    #The RNN layer processes those vectors sequentially
    self.rnn = tf.keras.layers.Bidirectional(merge_mode='sum', 
                                             layer=tf.keras.layers.GRU(self.units, return_sequences=True, recurrent_initializer='glorot_uniform' ))
  
  def call(self, x):
    x = self.embedding(x)
    x = self.rnn(x)
    # print('call')
    return x

  def convert_input(self, texts):
    texts = tf.convert_to_tensor(texts)
    if len(texts.shape) == 0:
      texts = tf.convert_to_tensor(texts)[tf.newaxis]
    context = self.text_processor(texts).to_tensor()
    context = self(context)
    return context


In [None]:
UNITS = 256

In [None]:
# Encode the input sequence.
encoder = Encoder(context_text_processor, UNITS)
ex_context = encoder(example_context_tokens)

print(f'Context tokens, shape (batch, s): {example_context_tokens.shape}')
print(f'Encoder output, shape (batch, s, units): {ex_context.shape}')

In [None]:
class CrossAttention(tf.keras.layers.Layer):
  def __init__(self, units, **kwargs):
    super().__init__()

    self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

  def call(self, x, context):
    atten_output, atten_score = self.mha(query=x, value=context, return_attention_scores=True)
    x = self.add([x, atten_output])
    x = self.layernorm(x)
    return x

In [None]:
attention_layer = CrossAttention(UNITS)
# Attend to the encoded tokens
embed = tf.keras.layers.Embedding(context_text_processor.vocabulary_size(),
                                  output_dim=UNITS, mask_zero=True)


In [None]:
'''
The decoder's job is to generate predictions for the next token at each location in the target sequence.
  1. It looks up embeddings for each token in the target sequence.
  2. It uses an RNN to process the target sequence, and keep track of what it has generated so far.
  3. It uses RNN output as the "query" to the attention layer, when attending to the encoder's output.
  4. At each location in the output it predicts the next token.
'''

class Decoder(tf.keras.layers.Layer):
  @classmethod
  def add_method(cls, fun):
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self, text_processor, units):
    super(Decoder, self).__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.word_to_id = tf.keras.layers.StringLookup(vocabulary=text_processor.get_vocabulary(), mask_token='', oov_token='[UNK]')
    self.id_to_word = tf.keras.layers.StringLookup(vocabulary=text_processor.get_vocabulary(), mask_token='', oov_token='[UNK]', invert=True)
    self.start_token = self.word_to_id('[START]')
    self.end_token = self.word_to_id('[END]')

    self.units = units

    # 1. The embedding layer converts token IDs to vectors
    self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.units, mask_zero=True)
    # 2. The RNN keeps track of what's been generated so far.
    self.rnn = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    #3. The RNN output will be the query for the attention layer.
    self.attention = CrossAttention(self.units)

    # self.fc1 = tf.keras.layers.Dense(self.units, activation='relu')
    # 4. This fully connected layer produces the logits for each output token.
    self.output_layer = tf.keras.layers.Dense(self.vocab_size)

  def call(self, context, x, state=None, return_state=False):
    #Lookup for embeddings
    x = self.embedding(x)
    #Process the target sequence
    x, state = self.rnn(x, initial_state=state)
    #Use the rnn output as the query for the attention over the context
    x = self.attention(x, context)

    # x = self.fc1(x)

    #generate logit predictyions for the next token
    logits = self.output_layer(x)


    if return_state:
      return logits, state
    return logits




In [None]:
decoder = Decoder(context_text_processor, UNITS)

In [None]:
@Decoder.add_method
def get_initial_state(self, context):
  batch_size = tf.shape(context)[0]
  print(batch_size)
  start_tokens = tf.fill([batch_size, 1], self.start_token)

  # print(start_tokens, 'satrt token')
  # print(tf.constant(context), 'context')
  done = tf.zeros([batch_size, 1], dtype=tf.bool)
  embedded = self.embedding(start_tokens)
  return start_tokens, done, self.rnn.get_initial_state(embedded)[0]

# print(example_context_tokens, 'ex_context_tokens')
# ex_context = encoder(example_context_tokens)
# next_token, done, state = decoder.get_initial_state(ex_context)
# print(next_token, state)

In [None]:
@Decoder.add_method
def tokens_to_text(self, tokens):
  words = self.id_to_word(tokens)
  result = tf.strings.reduce_join(words, axis=-1, separator=' ')
  result = tf.strings.regex_replace(result, '^ *\[START\] *', '')
  result = tf.strings.regex_replace(result, ' *\[END\] *$', '')
  return result

# decoder.tokens_to_text(example_context_tokens)

In [None]:
@Decoder.add_method
def get_next_token(self, context, next_token, done, state, temperature = 0.0):

  logits, state = self(context, next_token, state = state, return_state=True) 

  if temperature == 0.0:
    next_token = tf.argmax(logits, axis=-1)
  else:
    logits = logits[:, -1, :]/temperature
    next_token = tf.random.categorical(logits, num_samples=1)

  # If a sequence produces an `end_token`, set it `done`
  # done = done | (next_token == self.end_token)
  # # Once a sequence is done it only produces 0-padding.
  # next_token = tf.where(done, tf.constant(0, dtype=tf.int64), next_token)

  return next_token, done, state

In [None]:
# Setup the loop variables.
next_token, done, state = decoder.get_initial_state(ex_context)
tokens = []

for n in range(10):
  # Run one step.
  next_token, done, state = decoder.get_next_token(
      ex_context, next_token, done, state, temperature=1.0)
  # Add the token to the output.
  tokens.append(next_token)

# Stack all the tokens together.
tokens = tf.concat(tokens, axis=-1) # (batch, t)

# Convert the tokens back to a a string
result = decoder.tokens_to_text(tokens)
result[:3].numpy()

In [None]:
class TextGen(tf.keras.Model):
  @classmethod
  def add_method(cls, fun):
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self, units, context_text_processor, target_text_processor):
    super().__init__()
    #Build the encoder and the decoder
    self.encoder = Encoder(context_text_processor, units)
    self.decoder = Decoder(context_text_processor, units)

  def call(self, inputs):
    # print("here1")
    context, x = inputs
    # print("here2")

    context = self.encoder(context)
    # print("here3")

    logits = self.decoder(context, x)
    # print("here4")

    try:
      del logits._keras_mask
    except AttributeError:
      pass
    # print("here5", logits)
    return logits
  
  def gen(self,
                texts, *,
                max_length=50,
                temperature=0.0):
    # Process the input texts
    context = self.encoder.convert_input(texts)
    batch_size = tf.shape(texts)[0]

    # Setup the loop inputs
    tokens = []
    attention_weights = []
    next_token, done, state = self.decoder.get_initial_state(context)

    for _ in range(max_length):
      # Generate the next token
      next_token, done, state = self.decoder.get_next_token(context, next_token, done,  state, temperature)

      # Collect the generated tokens
      tokens.append(next_token)
      # attention_weights.append(self.decoder.last_attention_weights)

      # if tf.executing_eagerly() and tf.reduce_all(done):
      #   break

    # Stack the lists of tokens and attention weights.
    tokens = tf.concat(tokens, axis=-1)   # t*[(batch 1)] -> (batch, t)
    # self.last_attention_weights = tf.concat(attention_weights, axis=1)  # t*[(batch 1 s)] -> (batch, t s)

    result = self.decoder.tokens_to_text(tokens)
    return result

    

In [None]:
model = TextGen(UNITS, context_text_processor, context_text_processor)

In [None]:
def masked_loss(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    # print(y_true.shape, y_pred.shape, "masked_loss")
    loss = loss_fn(y_true, y_pred)
    

    # Mask off the losses on padding.
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    # Return the total.
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

In [None]:
#custom accuracy
def masked_acc(y_true, y_pred):
  # Calculate the loss for each item in the batch
  y_pred = tf.argmax(y_pred, axis=-1)
  y_pred = tf.cast(y_pred, y_true.dtype)

  match = tf.cast(y_true == y_pred, dtype=tf.float32)
  mask = tf.cast(y_true != 0, tf.float32)

  return tf.reduce_sum(match) / tf.reduce_sum(mask)


In [None]:
model.compile(optimizer='adam', loss=masked_loss, metrics=[masked_acc, masked_loss])

In [None]:
vocab_size = 1.0 * context_text_processor.vocabulary_size()

{"expected_loss": tf.math.log(vocab_size).numpy(),
 "expected_acc": 1/vocab_size}

In [None]:
# model.evaluate(val_ds, steps=20, return_dict=True)

In [None]:
history = model.fit(
    train_ds.repeat(), 
    epochs=10,
    steps_per_epoch = 20,
    validation_data=val_ds,
    validation_steps = 20,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5, monitor='masked_acc')])

In [None]:
result = model.gen(['This was one of my favorite'], max_length=100)
result = result[0].numpy().decode()
print(result)

### SAVE 

In [None]:
model_path = f'{DATA_LIMIT}_model.tf'

In [None]:
#save the vectorization layer
pickle.dump(context_text_processor.get_config(), open('text_processor_config.pkl', 'wb'))
weights = context_text_processor.get_weights()
text_processor_weights = pickle.dump(weights, open("text_processor_weights.pkl", "wb"))
#save the model
model.save(model_path)

In [None]:
loaded_model = None
loaded_model = tf.keras.models.load_model(model_path, custom_objects={'tf_lower_and_split_punct': tf_lower_and_split_punct}, compile=False)

In [None]:
result = loaded_model.predict(['This was one of my favorite'])
result = result[0].numpy().decode()
print(result)

In [None]:
# import pickle

# weights = context_text_processor.get_weights()

# text_processor_weights = pickle.dump(weights, open("text_processor_weights.pkl", "wb"))


loaded_config = pickle.load(open("text_processor_config.pkl", 'rb'))
loaded_weights = pickle.load(open("text_processor_weights.pkl", 'rb'))
text_processor = tf.keras.layers.TextVectorization.from_config(loaded_config)
text_processor.set_weights(loaded_weights)

new_model = TextGen(UNITS, text_processor, text_processor)

In [None]:
new_model.load_weights(model_path)

In [None]:
result = new_model.gen(['Hello world'])
result = result[0].numpy().decode()
print(result)

In [65]:
def create_model(path_to_model, path_to_vectorizer_config, path_to_vectorizer_weights):
    loaded_config = pickle.load(open(path_to_vectorizer_config, 'rb'))
    loaded_weights = pickle.load(open(path_to_vectorizer_weights, 'rb'))

    text_processor = tf.keras.layers.TextVectorization.from_config(loaded_config)
    text_processor.set_weights(loaded_weights)

    new_model = TextGen(UNITS, text_processor, text_processor)

    return new_model




In [66]:
textGenModel = create_model(path_to_model=model_path, path_to_vectorizer_config='text_processor_config.pkl', path_to_vectorizer_weights='text_processor_weights.pkl')

In [67]:
textGenModel.gen(['Hello World'])

tf.Tensor(1, shape=(), dtype=int32)


<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'doubly last busy quirk had subtle doing really wondering , noses piled solution his most of see everybody im of see everybody im of see everybody whiskey the health about know depression thing alcoholic charles victims not poet poet very very poet very poet very poet very poet very poet'],
      dtype=object)>