# Imports

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

tf.__version__

'2.3.0'

In [2]:
######## GPU CONFIGS FOR RTX 2070 ###############
## Please ignore if not training on GPU       ##
## this is important for running CuDNN on GPU ##

# Start from a clean Keras state so the notebook is easy to re-run
tf.keras.backend.clear_session()

# Check which GPUs TensorFlow can see (queried once; the list is reused below)
#tf.debugging.set_log_device_placement(True)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  first_gpu = gpus[0]
  try:
    # Enable memory growth on the first GPU, then restrict TensorFlow to it
    tf.config.experimental.set_memory_growth(first_gpu, True)
    tf.config.experimental.set_visible_devices(first_gpu, 'GPU')
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)
###############################################

1 Physical GPUs, 1 Logical GPU


# Setup Tokenization

In [3]:
# Special tokens that surround the plain characters in the vocabulary
EOS = '<EOS>'  # end of sentence
UNK = "<UNK>"  # stand-in for characters that are not in the vocabulary
PAD = "<PAD>"  # padding token; kept at index 0 (intended as the mask index for TF)

# Raw alphabet; repeated characters are harmless because it is de-duplicated
# below. The backslash is written as an escaped "\\": a bare "\|" is an
# invalid escape sequence and triggers a DeprecationWarning.
_alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 -,;.!?:’’’/\\|_@#$%ˆ&*˜‘+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Final layout: [PAD] + sorted unique characters + [UNK, EOS].
# These positions become the integer ids fed to the model, so the order
# must stay stable.
chars = [PAD] + sorted(set(_alphabet)) + [UNK, EOS]

In [4]:
# Lookup tables between vocabulary entries and their integer ids:
#   idx2char: index -> character (NumPy array, indexable by id)
#   char2idx: character -> index
idx2char = np.array(chars)
char2idx = dict(zip(chars, range(len(chars))))

In [5]:
def char_idx(c):
    """Return the vocabulary index of character ``c``.

    Characters that are not part of the vocabulary map to the index of the
    ``UNK`` token.
    """
    # One O(1) dict lookup with a default, rather than scanning the `chars` list
    return char2idx.get(c, char2idx[UNK])

# Load the Model

In [6]:
# Model hyperparameters.
# NOTE(review): presumably these must match the values used at training time
# for the checkpoint weights to load -- confirm.

# One vocabulary entry per element of `chars`
vocab_size = len(chars)

# Size of each character embedding vector
embedding_dim = 256

# Number of units in the recurrent layer
rnn_units = 1024

# Text is generated one sequence at a time
BATCH_SIZE = 1

In [7]:
def build_gen_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level generation model.

  Layers, in order: Embedding -> stateful GRU returning the full sequence ->
  Dense with one output per vocabulary entry. This variant has no padding
  mask and no dropout layer.

  Args:
    vocab_size: number of vocabulary entries (embedding input size and Dense
      output size).
    embedding_dim: size of each embedding vector.
    rnn_units: number of GRU units.
    batch_size: fixed batch size used in the model's batch input shape.

  Returns:
    The assembled ``tf.keras.Sequential`` model.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  gru = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  logits = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, gru, logits])


gen_model = build_gen_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [10]:
# Directory where the training checkpoints were saved
checkpoint_dir = './training_checkpoints/2020-Oct-01-14-29-55'

# tf.train.latest_checkpoint returns None when the directory contains no
# checkpoint; fail with a clear message instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError("No checkpoint found in %r" % checkpoint_dir)

# Load the latest checkpoint into the generation model
gen_model.load_weights(latest_checkpoint)

gen_model.build(tf.TensorShape([1, None]))

In [11]:
def generate_text(model, start_string, temperature=0.7, num_generate=75,
                  stop_at_eos=False):
  """Sample text from ``model`` one character at a time.

  Args:
    model: stateful character-level model, called with a batch of one index
      sequence and returning per-step logits over the vocabulary.
    start_string: non-empty prompt used to prime the model. Characters that
      are not in the vocabulary are fed to the model as the UNK token.
    temperature: positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text. Experiment to find the best setting.
    num_generate: maximum number of characters to generate.
    stop_at_eos: when True, stop as soon as the EOS token is sampled (the
      token itself is not added to the output). The default, False, always
      generates ``num_generate`` characters.

  Returns:
    ``start_string`` followed by the generated characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not
      positive.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing); characters outside
  # the vocabulary become UNK instead of raising KeyError
  unk_id = char2idx[UNK]
  input_eval = [char2idx.get(s, unk_id) for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters, joined at the end
  text_generated = []

  # The recurrent layer is stateful: clear its state so this call does not
  # continue from the hidden state left behind by a previous call.
  model.reset_states()

  # Here batch size == 1
  for _ in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # Sample the next character id from the categorical distribution given
      # by the temperature-scaled logits; only the last time step is used.
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      next_char = idx2char[predicted_id]
      if stop_at_eos and next_char == EOS:
          # end of a sentence reached, stop here
          break

      # The sampled character is the next input; the previous hidden state
      # is kept inside the stateful model.
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(next_char)

  return (start_string + ''.join(text_generated))

In [12]:
print(generate_text(gen_model, start_string=u"Obama"))

Obama has been a real business model<EOS><PAD>ill consoles in Chin<EOS><PAD>illing New Corvette 


In [16]:
print(generate_text(gen_model, start_string=u"Apple"))

Apple offers early deal to buy Twitc<EOS><PAD>PHOTOS: Andi Dorfman and Josh Murray Confi


## 0.6993 model

In [67]:
print(generate_text(gen_model, start_string=u"Google"))

Google plans to release the Xbox One vs. Samsung Galaxy Gea<EOS><PAD>ote on Mother's Day 


In [102]:
print(generate_text(gen_model, start_string=u"S&P"))

S&P closes above 2000 for first tim<EOS><PAD>opes and Company Reports Safe Bloomber<EOS><PAD>a


In [70]:
print(generate_text(gen_model, start_string=u"Market"))

Markets likely to be out of the world in decline in Asian market<EOS><PAD>et review: Spri


In [107]:
print(generate_text(gen_model, start_string=u"Beyonce"))

Beyonce & Jay Z Get Married<EOS><PAD>ock convertible in power outage from Boeing 78<EOS><PAD>ells 


## 0.7031 model

In [72]:
print(generate_text(gen_model, start_string=u"Apple"))

Apple iPad Mini 2 Release Date <UNK> Specs, Price and Specs Compariso<EOS><PAD>oul watched '


In [73]:
print(generate_text(gen_model, start_string=u"Google"))

Google Glass in the Big Sal<EOS><PAD>ouldebrayes a mont<EOS><PAD>ackike transforms bone of heart 


In [79]:
print(generate_text(gen_model, start_string=u"S&P"))

S&P 500 stock market slide<EOS><PAD>acking US court ruling in talks to start probe int


In [80]:
print(generate_text(gen_model, start_string=u"Market"))

Markets on edge ahead of Yellen testimon<EOS><PAD>illionaire hackers are still sued over 


In [82]:
print(generate_text(gen_model, start_string=u"Beyonce"))

Beyonce & Jay Z Are The Most Short Shorts (PHOTOS<EOS><PAD>ick of the day for more than 30


## Different temperature settings with the same model

In [108]:
print(generate_text(gen_model, start_string=u"S&P", temperature=0.1))

S&P 500 Closes Above 1900 For First Tim<EOS><PAD>och In Latest News From The Amazon Fi


In [109]:
print(generate_text(gen_model, start_string=u"S&P", temperature=0.3))

S&P Close to $5.7 Billion Deal to Buy Beats Electronic<EOS><PAD>ammed by Michael Jacks


In [121]:
print(generate_text(gen_model, start_string=u"S&P", temperature=0.5))

S&P 500 closes above 2000 for the first time in Iranian cour<EOS><PAD>arry Yahoo's Mac


In [119]:
print(generate_text(gen_model, start_string=u"S&P", temperature=0.7))

S&P 500 earnings dip as multiple shipments trigger no brakes, will be exclusiv


In [112]:
print(generate_text(gen_model, start_string=u"S&P", temperature=0.9))

S&P, Ack Factors at Risk of what you see This Ma<EOS><PAD>osem, here are the biggest l


In [113]:
print(generate_text(gen_model, start_string=u"Kim", temperature=0.9))

Kim Kardashian's en<EOS><PAD>ammelification Donations May Affect What's Need<EOS><PAD>adi and 


In [114]:
print(generate_text(gen_model, start_string=u"Kim", temperature=0.7))

Kim Kardashian Received To Release Guardians Of The Day: Go Power<EOS><PAD>amadian & S


In [115]:
print(generate_text(gen_model, start_string=u"Kim", temperature=0.5))

Kim Kardashian Wedding Dress Dress In The Works From Fia<EOS><PAD>ost Marketing of Gen


In [116]:
print(generate_text(gen_model, start_string=u"Kim", temperature=0.3))

Kim Kardashian Shares Her Best And Worst Of His First Look At The Met Gala<EOS><PAD>am


In [117]:
print(generate_text(gen_model, start_string=u"Kim", temperature=0.1))

Kim Kardashian and Kanye West wedding photos release<EOS><PAD>arriage Depictions of Be


## in progress model

In [9]:
# Second model instance, loaded from a different training run
gen_model2 = build_gen_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

# Run to load: 0.6993 loss (alternative run: '2020-Jun-02-01-02-14' -> .7031 loss)
checkpoint_dir = './training_checkpoints/2020-Jun-02-22-38-17'

# tf.train.latest_checkpoint returns None when the directory contains no
# checkpoint; fail with a clear message instead of handing None to load_weights.
gen_model2_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if gen_model2_checkpoint is None:
    raise FileNotFoundError("No checkpoint found in %r" % checkpoint_dir)

gen_model2.load_weights(gen_model2_checkpoint)

gen_model2.build(tf.TensorShape([1, None]))

In [17]:
print(generate_text(gen_model2, start_string=u"S&P", 
                    temperature=1, num_generate=75))

S&P more ahead of your emai<EOS><PAD>or Republicans Delay Drunk In Park Time Hi<EOS><PAD>H loo


In [162]:
print(generate_text(gen_model2, start_string=u"S", temperature=0.7))

Star Wars Episode VII cast reunite<EOS><PAD>or and underground on The Amazon-Hachett


In [163]:
print(generate_text(gen_model2, start_string=u"NBA", temperature=0.4))

NBA planning a whole new level of passenger on programming languag<EOS><PAD>orted: Rob


In [167]:
print(generate_text(gen_model2, start_string=u"Sta", temperature=0.7))

Start Apps for Business War and Apple (AAPL<EOS><PAD>ortal to meet deadly virus in Lib


# Greedy Search with Bigrams

In [173]:
!wget http://norvig.com/tsv/ngrams-all.tsv.zip

--2020-06-06 22:04:34--  http://norvig.com/tsv/ngrams-all.tsv.zip
Resolving norvig.com... 158.106.138.13
Connecting to norvig.com|158.106.138.13|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10950626 (10M) [application/zip]
Saving to: ‘ngrams-all.tsv.zip’


2020-06-06 22:04:41 (1.42 MB/s) - ‘ngrams-all.tsv.zip’ saved [10950626/10950626]



In [174]:
# gzip file masquerading as a ZIP file
!mv ngrams-all.tsv.zip ngrams-all.tsv.z
!gunzip ngrams-all.tsv.z

In [175]:
# First 27 rows are characters
# next 669 are bigrams
!head -n 697 ngrams-all.tsv | tail -n 669 > bigrams.tsv

In [190]:
# File format: col1 = bigram, col2 = overall frequency; other columns are ignored
from collections import Counter
import csv

counts = Counter()  # bigram -> frequency divided by `totals`
bitree = {}         # first letter -> {second letter -> same normalised frequency}
# Normalising constant. NOTE(review): presumably the total bigram count of
# the corpus -- confirm against the data source.
totals = 2819662855499

# newline='' is how the csv module expects its input files to be opened
with open("bigrams.tsv", 'r', newline='') as fl:
    for row in csv.reader(fl, delimiter='\t'):
        # Skip blank lines and any row that is not "<two letters>\t<count>",
        # which would otherwise raise IndexError
        if len(row) < 2 or len(row[0]) != 2:
            continue
        first, second = row[0]
        freq = int(row[1]) / totals
        counts[row[0]] = freq
        # Create the per-letter sub-dict on first use
        bitree.setdefault(first, {})[second] = freq

In [191]:
counts.most_common(10)

[('TH', 0.035562033867788265),
 ('HE', 0.030747412428375958),
 ('IN', 0.024327452898924896),
 ('ER', 0.020482648127014873),
 ('AN', 0.019851510793865493),
 ('RE', 0.018543231910521065),
 ('ON', 0.01758046422760261),
 ('AT', 0.014867323009999081),
 ('EN', 0.014542484564787907),
 ('ND', 0.013522814458699572)]

In [192]:
print(bitree['T'])

{'H': 0.035562033867788265, 'I': 0.013425788170089764, 'E': 0.012048696340679248, 'O': 0.010412665302782832, 'A': 0.005298860706648515, 'R': 0.004258201782026581, 'S': 0.0033748821262235386, 'U': 0.0025490671946763704, 'Y': 0.0022727710093077376, 'T': 0.0017068330270812787, 'L': 0.0009844918163128189, 'W': 0.0008237223232097947, 'M': 0.0002647908857415045, 'C': 0.00026136282448192903, 'N': 0.0001001065175042166, 'F': 5.661194659804483e-05, 'P': 4.294463459127656e-05, 'Z': 3.8489544871772874e-05, 'B': 2.544494525651471e-05, 'G': 1.969131766647826e-05, 'D': 1.2958822338897517e-05, 'V': 1.1634636011898067e-05, 'K': 4.639558582150014e-06, 'X': 1.1806014302411626e-06, 'J': 1.1241265932976448e-06, 'Q': 8.928017741874646e-07}


In [None]:
!pip install anytree

In [356]:
from anytree import Node, RenderTree

# Build 5-letter completions from a starting bigram
start = "WI"
# Root of the completion tree; each node carries a `prob` attribute
compl = Node(start[0], prob=1)
cnt = 0

def recurse(letter, prob, level, parent, max_level=2, top_k=3):
    """Attach ``letter`` under ``parent`` and expand its most frequent successors.

    Args:
        letter: character stored in the new node.
        prob: value from ``bitree`` for the step leading to ``letter``; the
            node stores ``prob * parent.prob``.
        level: current expansion depth (the initial call passes 0).
        parent: anytree ``Node`` the new node is attached to.
        max_level: a node created at a ``level`` greater than this is a leaf.
        top_k: number of highest-valued successors expanded per node.

    Returns:
        The ``Node`` created for ``letter``.
    """
    nd = Node(letter, parent=parent, prob=prob * parent.prob)
    if level > max_level:
        return nd

    # A letter with no entry in `bitree` has no successors and stays a leaf
    successors = Counter(bitree.get(letter, {})).most_common(top_k)
    for next_letter, next_prob in successors:
        recurse(next_letter, next_prob, level + 1, nd, max_level, top_k)
    return nd


recurse(start[1], 1, 0, compl)
    

Node('/W/I', prob=1)

In [357]:
# Print the tree one node per line: branch prefix, letter, then `prob` to 8 decimals
for prefix, _fill, node in RenderTree(compl):
    print("{}{} ({:2.8f})".format(prefix, node.name, node.prob))

W (1.00000000)
└── I (1.00000000)
    ├── N (0.02432745)
    │   ├── D (0.00032898)
    │   │   ├── E (0.00000252)
    │   │   ├── I (0.00000162)
    │   │   └── O (0.00000062)
    │   ├── T (0.00025331)
    │   │   ├── H (0.00000901)
    │   │   ├── I (0.00000340)
    │   │   └── E (0.00000305)
    │   └── G (0.00023184)
    │       ├── E (0.00000089)
    │       ├── H (0.00000053)
    │       └── R (0.00000046)
    ├── S (0.01128430)
    │   ├── T (0.00011888)
    │   │   ├── H (0.00000423)
    │   │   ├── I (0.00000160)
    │   │   └── E (0.00000143)
    │   ├── E (0.00010518)
    │   │   ├── R (0.00000215)
    │   │   ├── N (0.00000153)
    │   │   └── S (0.00000141)
    │   └── I (0.00006207)
    │       ├── N (0.00000151)
    │       ├── S (0.00000070)
    │       └── T (0.00000070)
    └── T (0.01123274)
        ├── H (0.00039946)
        │   ├── E (0.00001228)
        │   ├── A (0.00000370)
        │   └── I (0.00000305)
        ├── I (0.00015081)
        │   ├── N (0.00000367)