<a href="https://colab.research.google.com/github/OnlyourMiracle/MachineLearning/blob/master/Course/PythonMachineLearning/PythonMachineLerning_U16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget -P /content/drive/MyDrive/MLIA/Data https://github.com/OnlyourMiracle/Python-Machine-Learning-Second-Edition/blob/master/Chapter16/movie_data.csv.gz
!wget -P /content/drive/MyDrive/MLIA/Data https://github.com/OnlyourMiracle/Python-Machine-Learning-Second-Edition/blob/master/Chapter16/pg2265.txt

--2022-11-16 12:42:34--  https://github.com/OnlyourMiracle/Python-Machine-Learning-Second-Edition/blob/master/Chapter16/pg2265.txt
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘/content/drive/MyDrive/MLIA/Data/pg2265.txt’

pg2265.txt              [ <=>                ]   1.47M  --.-KB/s    in 0.05s   

2022-11-16 12:42:34 (27.4 MB/s) - ‘/content/drive/MyDrive/MLIA/Data/pg2265.txt’ saved [1539648]



In [None]:
# Use the TensorFlow 1.x style API (placeholders, tf.Session, tf.train.Saver)
# through the compat.v1 module; the models below are written against it.
import tensorflow.compat.v1 as tf
# Switch off TF2 behaviors so the graph/session code below runs as written.
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
!pip install pyprind
import pyprind
import pandas as pd
from string import punctuation
import re
import numpy as np

# Load the movie-review dataset (columns: 'review', 'sentiment') and preview it.
movie_csv_path = '/content/drive/MyDrive/MLIA/Data/movie_data.csv'
df = pd.read_csv(movie_csv_path)
print(df.head(3))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3
                                              review  sentiment
0  I am surprised that there is confusion over th...          1
1  Had I known to what I was submitting myself, I...          0
2  i didn't enjoy this movie at all.for one,i jus...          0


In [None]:
#preprocessing the data: separate words and count each word's occurrence

from collections import Counter

counts = Counter()
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurences')

# Collect the cleaned reviews and write the column back once after the loop:
# assigning row by row with df.loc[i, ...] is very slow on a frame this size
# and only addresses the right row when the index is the default 0..n-1.
cleaned_reviews = []
for review in df['review']:
  # Surround every punctuation character with spaces so it becomes its own token,
  # then lower-case the whole review.
  text = ''.join([c if c not in punctuation else ' '+c+' ' for c in review]).lower()
  cleaned_reviews.append(text)
  pbar.update()
  counts.update(text.split())

df['review'] = cleaned_reviews

Counting words occurences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:45


In [None]:
#Create a mapping: Map each unique word to an integer

# Vocabulary ordered from the most to the least frequent token.
word_counts = [word for word, _ in counts.most_common()]
print(word_counts[:5])
# Ids start at 1, so 0 is never assigned to a word (the padding cell below fills with zeros).
word_to_int = dict(zip(word_counts, range(1, len(word_counts) + 1)))

mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')

for review in df['review']:
  tokens = review.split()
  mapped_reviews.append([word_to_int[token] for token in tokens])
  pbar.update()

Map reviews to ints


['the', '.', ',', 'and', 'a']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02


In [None]:
#define fixed-length sequences: use the last 200 elements of each sequence. if sequence length < 200: left-pad with zeros
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
  tail = row[-sequence_length:]
  # An empty review keeps its all-zero row: with len(row) == 0 the slice
  # `-0:` selects the whole row and assigning an empty array to it fails.
  if tail:
    sequences[i, -len(tail):] = tail

# Split by position. `df.loc[:25000]` is label-based and includes row 25000,
# which gave 25001 training labels for 25000 training rows.
n_train = 25000
sentiment_labels = df['sentiment'].values
x_train = sequences[:n_train, :]
y_train = sentiment_labels[:n_train]
x_test = sequences[n_train:, :]
y_test = sentiment_labels[n_train:]

np.random.seed(123)

#function to generate minibatches
def create_batch_generator(x, y=None, batch_size=64):
  """Yield consecutive mini-batches of exactly `batch_size` items.

  Trailing items that do not fill a complete batch are dropped.

  Args:
    x: sliceable sequence of inputs.
    y: optional sliceable sequence of targets, sliced with the same bounds as `x`.
    batch_size: number of items per batch.

  Yields:
    `(x_batch, y_batch)` when `y` is given, otherwise `x_batch` alone.
  """
  # Number of leading items that form complete batches.
  usable = (len(x) // batch_size) * batch_size
  for start in range(0, usable, batch_size):
    stop = start + batch_size
    if y is None:
      yield x[start:stop]
    else:
      yield x[start:stop], y[start:stop]

In [None]:
class SentimentRNN(object):
  """LSTM-based binary sentiment classifier written against the TF 1.x graph API.

  Constructing an instance builds the full graph (embedding -> stacked LSTM
  cells with dropout -> dense layer on the last time step -> sigmoid), plus a
  Saver and the variable initializer. `train` writes checkpoints under
  'model/' and `predict` restores the latest checkpoint from that directory.

  Args:
    n_words: number of rows of the embedding matrix (vocabulary size).
    seq_len: length of every input sequence.
    lstm_size: number of hidden units per LSTM cell.
    num_layers: number of stacked LSTM cells.
    batch_size: fixed batch size baked into the placeholders and initial state.
    learning_rate: learning rate of the Adam optimizer.
    embed_size: width of the embedding vectors.
  """
  def __init__(self, n_words, seq_len=200, lstm_size=256, num_layers=1, batch_size=64, learning_rate=0.0001, embed_size=200):
    self.n_words = n_words
    self.seq_len = seq_len
    self.lstm_size = lstm_size
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.embed_size = embed_size

    self.g = tf.Graph()
    with self.g.as_default():
      # Call the seeding function. Assigning to it (`tf.set_random_seed = 123`)
      # seeds nothing and replaces the function on the tf module with an int.
      tf.set_random_seed(123)
      self.build()
      self.saver = tf.train.Saver()
      self.init_op = tf.global_variables_initializer()

  def build(self):
    """Create placeholders, the network, the cost and the training op in the current default graph.

    Tensors are later addressed by name ('tf_x:0', 'cost:0', 'train_op', ...),
    so the `name=` arguments below are part of the contract with train/predict.
    """
    #define the placeholder
    tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len), name='tf_x')
    tf_y = tf.placeholder(tf.float32, shape=(self.batch_size,), name='tf_y')
    tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')

    #create the embedding layer
    embedding = tf.Variable(tf.random_uniform((self.n_words, self.embed_size), minval=-1, maxval=1), name='embedding')
    embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embeded_x')

    #define LSTM cell and stack them together
    cells = tf.compat.v1.nn.rnn_cell.MultiRNNCell(
        [tf.compat.v1.nn.rnn_cell.DropoutWrapper(
            tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self.lstm_size),
            output_keep_prob=tf_keepprob)
         for _ in range(self.num_layers)])

    #define the initial state:
    self.initial_state = cells.zero_state(self.batch_size, tf.float32)
    print(' << initial state>> ', self.initial_state)
    lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)

    #note:lstm_outputs shape:[batch_size, max_time, cells.output_size]
    print('\n << lstm output >> ', lstm_outputs)
    print('\n << final state >> ', self.final_state)

    #apply a fully connected layer on the RNN output of the last time step:
    logits = tf.layers.dense(inputs=lstm_outputs[:, -1], units=1, activation=None, name='logits')
    # Squeeze only the unit axis so that a batch of size 1 keeps its batch dimension.
    logits = tf.squeeze(logits, axis=1, name='logits_squeezed')
    print('\n << logits  >> ', logits)

    y_proba = tf.nn.sigmoid(logits, name='probabilities')
    predictions = {
        'probabilities': y_proba,
        'labels': tf.cast(tf.round(y_proba), tf.int32, name='labels')
    }
    print('\n << predictions  >> ', predictions)

    #define the cost function
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, logits=logits), name='cost')

    #define the optimizer
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    train_op = optimizer.minimize(cost, name='train_op')

  def train(self, x_train, y_train, num_epochs):
    """Train the network and save a checkpoint under 'model/' after every 10th epoch.

    Args:
      x_train: integer-encoded sequences, one row per example.
      y_train: labels aligned with `x_train`.
      num_epochs: number of passes over the training data. With fewer than 10
        epochs no checkpoint is written, and `predict` has nothing to restore.
    """
    # Local import: this cell does not import os at module level.
    import os
    # Make sure the checkpoint directory exists before the Saver writes into it.
    os.makedirs('model', exist_ok=True)

    with tf.Session(graph=self.g) as sess:
      sess.run(self.init_op)
      iteration = 1
      for epoch in range(num_epochs):
        state = sess.run(self.initial_state)
        # NOTE: this relies on create_batch_generator(x, y, batch_size). A later
        # cell of this notebook redefines that name with a different signature.
        for batch_x, batch_y in create_batch_generator(x_train, y_train, self.batch_size):
          feed = {'tf_x:0': batch_x, 'tf_y:0': batch_y, 'tf_keepprob:0': 0.5, self.initial_state: state}
          loss, _, state = sess.run(['cost:0', 'train_op', self.final_state], feed_dict=feed)

          if iteration % 20 == 0:
            print("Epoch: %d/%d Iteration: %d ""| Train loss: %.5f" % (epoch + 1, num_epochs, iteration, loss))

          iteration += 1
        if (epoch+1) % 10 == 0:
          self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)

  def predict(self, x_data, return_prob=False):
    """Predict with the latest checkpoint found in 'model/'.

    Only complete batches of `self.batch_size` rows are processed, so the
    result can be shorter than `x_data`.

    Args:
      x_data: integer-encoded sequences, one row per example.
      return_prob: if True return the sigmoid probabilities, otherwise the
        rounded integer class labels.

    Returns:
      1-D numpy array with the concatenated per-batch predictions.
    """
    preds = []
    with tf.Session(graph=self.g) as sess:
      self.saver.restore(sess, tf.train.latest_checkpoint('model/'))
      test_state = sess.run(self.initial_state)
      # Pick the output tensor once; the rest of the loop is the same for both modes.
      output_name = 'probabilities:0' if return_prob else 'labels:0'
      for batch_x in create_batch_generator(x_data, None, batch_size=self.batch_size):
        feed = {'tf_x:0': batch_x, 'tf_keepprob:0': 1.0, self.initial_state: test_state}
        pred, test_state = sess.run([output_name, self.final_state], feed_dict=feed)
        preds.append(pred)

    return np.concatenate(preds)


In [None]:
#build a SentimentRNN example

# Word ids start at 1, so the embedding needs one extra row for the padding id 0.
n_words = max(word_to_int.values()) + 1
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=1,
    batch_size=100,
    learning_rate=0.001)

#train
rnn.train(x_train, y_train, num_epochs=40)

#predict
preds = rnn.predict(x_test)
# predict only covers complete batches, so trim the labels to the same length.
y_true = y_test[:len(preds)]
n_correct = np.sum(preds == y_true)
print('Test Acc: %.3f' % (n_correct / len(y_true)))

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


 << initial state>>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

 << lstm output >>  Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

 << final state >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

 << logits  >>  Tensor("logits_squeezed:0", shape=(100,), dtype=float32)
\m << predictions  >>  {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
Epoch: 1/40 Iteration: 20 | Train loss: 0.68579
Epoch: 1/40 Iteration: 40 | Train loss: 0.65783
Epoch: 1/40 Iteration: 60 | Train loss: 0.65085
Epoch: 1/40 Iteration: 80 | Train loss: 0.65397
Epoch: 1/40 Iteration: 10

In [None]:
!sudo curl -L http://www.gutenberg.org/cache/epub/2265/pg2265.txt -o /content/drive/MyDrive/MLIA/Data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  180k    0  1139    0     0   9112      0  0:00:20 --:--:--  0:00:20  9112
curl: (23) Failed writing body (0 != 1139)


In [None]:
import numpy as np

#Reading and processing text
with open('/content/drive/MyDrive/MLIA/Data/pg2265.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# Drop the first 15858 characters (presumably the Project Gutenberg preamble --
# TODO confirm the offset against the file that was actually downloaded).
text = text[15858:]
# Sort the character set so the char <-> id mapping is deterministic. Iterating
# a bare set of strings can give a different order in every interpreter run,
# which would silently mismatch a checkpoint trained in another session.
chars = sorted(set(text))
char2int = {ch:i for i, ch in enumerate(chars)}
int2char = dict(enumerate(chars))
text_ints = np.array([char2int[ch] for ch in text], dtype=np.int32)

In [None]:
def reshape_data(sequence, batch_size, num_steps):
  """Turn a 1-D id sequence into input/target matrices with `batch_size` rows.

  The targets are the inputs shifted by one position. Elements at the end that
  do not fill a whole block of `batch_size * num_steps` are discarded.

  Args:
    sequence: 1-D sequence of integer ids.
    batch_size: number of rows of the returned matrices.
    num_steps: number of time steps per mini-batch.

  Returns:
    Tuple `(x, y)` of arrays with `batch_size` rows each.
  """
  block_len = batch_size * num_steps
  n_blocks = int(len(sequence) / block_len)
  # y needs one element beyond the end of x; give up a block if it is missing.
  if len(sequence) < n_blocks * block_len + 1:
    n_blocks -= 1
  usable = n_blocks * block_len

  def to_rows(flat):
    # Cut the flat sequence into batch_size equal pieces and stack them as rows.
    return np.stack(np.split(flat, batch_size))

  return to_rows(sequence[:usable]), to_rows(sequence[1:usable + 1])

#Testing
train_x, train_y = reshape_data(text_ints, batch_size=64, num_steps=10)
print(train_x.shape)
# The first target row should be the first input row shifted by one position.
first_x_row, first_y_row = train_x[0], train_y[0]
print(first_x_row[:10])
print(first_y_row[:10])
print(''.join(int2char[i] for i in first_x_row[:50]))

(64, 23800)
[87 12 69 92 34 16 64 25 43 84]
[12 69 92 34 16 64 25 43 84 12]
 &quot;key&quot;: &quot;click.take_survey&quot;}, 


In [None]:
np.random.seed(123)

def create_batch_generator(data_x, data_y, num_steps):
  """Yield consecutive windows of `num_steps` columns from two 2-D arrays.

  NOTE: this shares its name with the sentiment mini-batch generator defined
  earlier in the notebook (signature `(x, y=None, batch_size=64)`) and
  replaces it once this cell has run.

  Args:
    data_x: 2-D input array.
    data_y: 2-D target array, sliced with the same column windows as `data_x`.
    num_steps: number of columns per window; leftover columns are dropped.

  Yields:
    Tuples `(x_window, y_window)`.
  """
  _, total_steps = data_x.shape
  n_windows = int(total_steps/num_steps)
  for lo in range(0, n_windows * num_steps, num_steps):
    window = slice(lo, lo + num_steps)
    yield data_x[:, window], data_y[:, window]

# Show the first few windows of row 0, with newlines rendered as '*'.
bgen = create_batch_generator(train_x[:, :100], train_y[:, :100], 15)
for batch_x, batch_y in bgen:
  print(batch_x.shape, batch_y.shape, end=' ')
  x_text = ''.join(int2char[i] for i in batch_x[0, :]).replace('\n', '*')
  y_text = ''.join(int2char[i] for i in batch_y[0, :]).replace('\n', '*')
  print(x_text, ' ', y_text)
  

(64, 15) (64, 15)  &quot;key&quot   &quot;key&quot;
(64, 15) (64, 15) ;: &quot;click.   : &quot;click.t
(64, 15) (64, 15) take_survey&quo   ake_survey&quot
(64, 15) (64, 15) t;}, {&quot;exp   ;}, {&quot;expe
(64, 15) (64, 15) erimentIds&quot   rimentIds&quot;
(64, 15) (64, 15) ;: [], &quot;id   : [], &quot;id&


In [None]:
#build the character-level RNN model
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import os

class CharRNN(object):
  """Character-level LSTM language model written against the TF 1.x graph API.

  Constructing an instance builds the graph, a Saver and the variable
  initializer. With `sampling=False` the placeholders have shape
  (batch_size, num_steps) for training; with `sampling=True` they have shape
  (1, 1) so text can be generated one character at a time.

  Args:
    num_classes: number of distinct characters (one-hot depth and output units).
    batch_size: number of sequences per training batch.
    num_steps: number of time steps per training batch.
    lstm_size: number of hidden units per LSTM cell.
    num_layers: number of stacked LSTM cells.
    learning_rate: learning rate of the Adam optimizer.
    keep_prob: dropout keep probability fed during training.
    grad_clip: global-norm threshold used for gradient clipping.
    sampling: build the single-character graph used by `sample`.
  """
  def __init__(self, num_classes, batch_size=64, num_steps=100, lstm_size=128, num_layers=1, learning_rate=0.001, keep_prob=0.5, grad_clip=5,
               sampling=False):
    self.num_classes = num_classes
    self.batch_size = batch_size
    self.lstm_size = lstm_size
    self.num_layers = num_layers
    self.learning_rate = learning_rate
    self.keep_prob = keep_prob
    self.grad_clip = grad_clip
    self.num_steps = num_steps

    self.g = tf.Graph()
    with self.g.as_default():
      tf.set_random_seed(123)

      self.build(sampling=sampling)
      self.saver = tf.train.Saver()
      self.init_op = tf.global_variables_initializer()

  def build(self, sampling):
    """Create placeholders, the network, the cost and the training op in the current default graph.

    Tensors are later addressed by name ('tf_x:0', 'cost:0', 'train_op',
    'probabilities:0', ...), so the `name=` arguments are relied on by
    `train` and `sample`.

    Args:
      sampling: if truthy, build for one character at a time (batch 1, 1 step).
    """
    if sampling:
      batch_size, num_steps = 1, 1
    else:
      batch_size = self.batch_size
      num_steps = self.num_steps

    tf_x = tf.placeholder(tf.int32, shape=[batch_size, num_steps], name='tf_x')
    tf_y = tf.placeholder(tf.int32, shape=[batch_size, num_steps], name='tf_y')
    tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')

    #one-hot encoding
    x_onehot = tf.one_hot(tf_x, depth=self.num_classes)
    y_onehot = tf.one_hot(tf_y, depth=self.num_classes)

    #build the multi-layer RNN cells
    cells = tf.compat.v1.nn.rnn_cell.MultiRNNCell(
        [tf.compat.v1.nn.rnn_cell.DropoutWrapper(
            tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self.lstm_size),
            output_keep_prob=tf_keepprob)
        for _ in range(self.num_layers)])

    #define the initial state
    self.initial_state = cells.zero_state(batch_size, tf.float32)

    #run each sequence step through the RNN
    lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
        cells, x_onehot, initial_state=self.initial_state
    )

    print('<< lstm_outputs >>', lstm_outputs)

    # Flatten batch and time so the dense layer scores every time step.
    seq_output_reshaped = tf.reshape(
        lstm_outputs, shape=[-1, self.lstm_size], name='seq_output_reshaped'
    )

    logits = tf.layers.dense(
        inputs=seq_output_reshaped, units=self.num_classes, activation=None, name='logits'
    )

    proba = tf.nn.softmax(
        logits, name='probabilities'
    )
    print(proba)

    y_reshaped = tf.reshape(y_onehot, shape=[-1, self.num_classes], name='y_reshaped')
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits,
            labels=y_reshaped
        ), name='cost'
    )

    #gradient clipping to avoid 'exploding gradients'
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(cost, tvars),
        self.grad_clip
    )
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        name='train_op'
    )

  def train(self, train_x, train_y, num_epochs, ckpt_dir='./model/'):
    """Train the model and save a checkpoint into `ckpt_dir` after every epoch.

    Args:
      train_x: 2-D array of input ids (rows are sequences).
      train_y: 2-D array of target ids with the same layout as `train_x`.
      num_epochs: number of passes over the data.
      ckpt_dir: directory for 'language_modeling.ckpt'; created if missing.
    """
    #create the checkpoint directory if it does not exist (including parents)
    os.makedirs(ckpt_dir, exist_ok=True)

    with tf.Session(graph=self.g) as sess:
      sess.run(self.init_op)

      n_batches = int(train_x.shape[1]/self.num_steps)
      for epoch in range(num_epochs):
        #Train network: start every epoch from the zero state
        new_state = sess.run(self.initial_state)

        #Minibatch generator
        bgen = create_batch_generator(
            train_x, train_y, self.num_steps
        )
        for b, (batch_x, batch_y) in enumerate(bgen, 1):
          iteration = epoch*n_batches + b

          # The LSTM state is carried over from one window to the next.
          feed = {'tf_x:0': batch_x,
                  'tf_y:0': batch_y,
                  'tf_keepprob:0': self.keep_prob,
                  self.initial_state: new_state}
          batch_cost, _, new_state = sess.run(['cost:0', 'train_op', self.final_state], feed_dict=feed)
          if iteration % 10 == 0:
            print('Epoch %d/%d Iteration %d'
                  '| Training loss: %.4f' % (
                  epoch + 1, num_epochs,
                  iteration, batch_cost))

        #save the trained model
        self.saver.save(sess, os.path.join(ckpt_dir, 'language_modeling.ckpt'))

  def sample(self, output_length, ckpt_dir, starter_seq='The '):
    """Generate text with the latest checkpoint in `ckpt_dir`.

    The instance must have been built with `sampling=True`. Uses the
    notebook-level `char2int` / `int2char` mappings and `get_top_char`.

    Args:
      output_length: number of characters generated after the first sampled one.
      ckpt_dir: directory holding the checkpoint to restore.
      starter_seq: non-empty seed text used to warm up the LSTM state.

    Returns:
      `starter_seq` followed by the generated characters, as one string.

    Raises:
      ValueError: if `starter_seq` is empty (there would be no prediction to start from).
    """
    if not starter_seq:
      raise ValueError('starter_seq must contain at least one character')

    observed_seq = [ch for ch in starter_seq]
    with tf.Session(graph=self.g) as sess:
      self.saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))

      #1. run the model over the starter sequence to warm up the state
      new_state = sess.run(self.initial_state)
      x = np.zeros((1, 1), dtype=np.int32)
      for ch in starter_seq:
        x[0, 0] = char2int[ch]
        # Tensor names need the ':0' output suffix; a bare 'tf_keepprob' names the op.
        feed = {'tf_x:0': x,
                'tf_keepprob:0': 1.0,
                self.initial_state: new_state}
        proba, new_state = sess.run(['probabilities:0', self.final_state], feed_dict=feed)

      # Sample the first new character from the prediction after the whole starter.
      ch_id = get_top_char(proba, self.num_classes)
      observed_seq.append(int2char[ch_id])

      #2. feed every sampled character back in (once, after the warm-up loop)
      for _ in range(output_length):
        x[0, 0] = ch_id
        feed = {'tf_x:0': x, 'tf_keepprob:0': 1.0, self.initial_state: new_state}
        proba, new_state = sess.run(['probabilities:0', self.final_state], feed_dict=feed)
        ch_id = get_top_char(proba, self.num_classes)
        observed_seq.append(int2char[ch_id])
    return ''.join(observed_seq)
  
def get_top_char(probas, char_size, top_n=5):
  """Randomly pick a character id among the `top_n` most probable ones.

  Args:
    probas: array of class probabilities; singleton axes are squeezed away, so
      a (1, char_size) model output is accepted.
    char_size: number of classes; must equal the number of probabilities.
    top_n: how many of the highest-probability classes stay eligible.

  Returns:
    The sampled class id, drawn with `np.random.choice` from the renormalised
    top-`top_n` probabilities.
  """
  # np.squeeze (the original `np.squence` does not exist) drops the batch axis;
  # np.array(...) copies so zeroing below does not modify the caller's array.
  p = np.array(np.squeeze(probas), dtype=float)
  # Zero everything except the top_n largest entries, then renormalise.
  p[np.argsort(p)[:-top_n]] = 0.0
  p = p / np.sum(p)
  ch_id = np.random.choice(char_size, 1, p=p)[0]
  return ch_id

# Training configuration for the character-level model.
batch_size = 64
num_steps = 100
train_x, train_y = reshape_data(text_ints, batch_size=batch_size, num_steps=num_steps)

# CharRNN is left on its default of 100 time steps, which equals num_steps above.
rnn = CharRNN(batch_size=batch_size, num_classes=len(chars))
rnn.train(train_x, train_y, ckpt_dir='./model-100/', num_epochs=100)


Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



<< lstm_outputs >> Tensor("rnn/transpose_1:0", shape=(64, 100, 128), dtype=float32)
Tensor("probabilities:0", shape=(6400, 96), dtype=float32)
Epoch 1/100 Iteration 10| Training loss: 4.0818
Epoch 1/100 Iteration 20| Training loss: 3.5514
Epoch 1/100 Iteration 30| Training loss: 3.4178
Epoch 1/100 Iteration 40| Training loss: 3.2891
Epoch 1/100 Iteration 50| Training loss: 3.1327
Epoch 1/100 Iteration 60| Training loss: 3.0266
Epoch 1/100 Iteration 70| Training loss: 3.0153
Epoch 1/100 Iteration 80| Training loss: 2.8987
Epoch 1/100 Iteration 90| Training loss: 2.8205
Epoch 1/100 Iteration 100| Training loss: 2.7154
Epoch 1/100 Iteration 110| Training loss: 2.6061
Epoch 1/100 Iteration 120| Training loss: 2.5500
Epoch 1/100 Iteration 130| Training loss: 2.5002
Epoch 1/100 Iteration 140| Training loss: 2.3152
Epoch 1/100 Iteration 150| Training loss: 2.3309
Epoch 1/100 Iteration 160| Training loss: 2.2595
Epoch 1/100 Iteration 170| Training loss: 2.2029
Epoch 1/100 Iteration 180| Traini

In [None]:
#get result
# Discard the training-time model before building the single-character sampling graph.
del rnn

np.random.seed(123)
rnn = CharRNN(num_classes=len(chars), sampling=True)

generated_text = rnn.sample(output_length=500, ckpt_dir='./model-100/')
print(generated_text)
