In [1]:
#import tensorflow for text generation
import Twitter_API as api
import json
import numpy as np
import os
import time
import sys
import nltk
import re
import csv
import keras
import pickle

#stop words is used to remove words such as the, a, this etc.
from nltk.corpus import stopwords

import pandas as pd 
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
import random
import sys
import io

In [2]:
#Read the data into a dataframe
df = pd.read_csv("Data/popularTweets.csv")

#Renaming the columns to something more fitting and clean
df = df.rename(columns={'Tweet_Text': 'tweet'})

newDf = df[['tweet']]
#Drop empty rows
newDf = newDf.dropna()

#store the english stop words
#stopWords = stopwords.words('english')

#the tweet will be used for the input of our NLP model
tweets = newDf['tweet']

#remove https links
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def cleanTweet(tweet):
    
    #convert tweet to lowercase
    tweet = tweet.lower()
    
    #removes http links
    tweet = pattern.sub(' ', tweet)
    
    #removes any character not alphabetic or numeric and keep #
    #tweet = re.sub(r"[^A-Za-z0-9]",' ', tweet)
    
    #return the tweet
    return tweet

tweets = tweets.apply(cleanTweet)

newDf['tweet'] = tweets

print(newDf['tweet'][0])

newDf.head()

today we express our deepest gratitude to all those who have served in our armed forces. #thankavet  


Unnamed: 0,tweet
0,today we express our deepest gratitude to all ...
1,busy day planned in new york. will soon be mak...
2,love the fact that the small groups of protest...
3,just had a very open and successful presidenti...
4,a fantastic day in d.c. met with president oba...


In [3]:
chars = sorted(list(set(''.join(tweets))))
print('total chars:', len(chars))

for c in chars[-19:]:
    tweets = tweets.str.replace(c,'')
    
chars = sorted(list(set(''.join(tweets))))
print('total chars:', len(chars))

newDf['tweet'] = tweets

print("data shape (rows,cols): ", newDf.shape)

newDf.head()

total chars: 80
total chars: 61
data shape (rows,cols):  (7375, 1)


  tweets = tweets.str.replace(c,'')


Unnamed: 0,tweet
0,today we express our deepest gratitude to all ...
1,busy day planned in new york. will soon be mak...
2,love the fact that the small groups of protest...
3,just had a very open and successful presidenti...
4,a fantastic day in d.c. met with president oba...


In [4]:
tweets = newDf.tweet

n_messages = len(tweets)
n_chars = len(' '.join(map(str, tweets)))

print(f'Count of tweets: {n_messages}')
print(f'Length of combined tweets: {n_chars} characters')

Count of tweets: 7375
Length of combined tweets: 783054 characters


In [5]:
sample_size = int(len(tweets) * 0.2)

tweets = tweets[:sample_size]
tweets = ' '.join(map(str, tweets)).lower()

#Look at first 250 characters in tweets
tweets[:160] 

'today we express our deepest gratitude to all those who have served in our armed forces. #thankavet   busy day planned in new york. will soon be making some ver'

In [6]:
unique = sorted(set(tweets))
print(f'{len(unique)} unique characters')

58 unique characters


In [7]:
#split into individual characters
chars = tf.strings.unicode_split(tweets, input_encoding='UTF-8')
chars

<tf.Tensor: shape=(149177,), dtype=string, numpy=array([b't', b'o', b'd', ..., b'd', b'!', b'!'], dtype=object)>

In [8]:
char_Ids = preprocessing.StringLookup(vocabulary=list(unique), mask_token=None)

#convert tokens to character IDs
ids = char_Ids(chars)
ids

<tf.Tensor: shape=(149177,), dtype=int64, numpy=array([51, 46, 35, ..., 35,  3,  3], dtype=int64)>

In [9]:
#try to get human words from representation
chars_Vocab_Ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=char_Ids.get_vocabulary(), invert=True, mask_token=None)

chars = chars_Vocab_Ids(ids)
chars

<tf.Tensor: shape=(149177,), dtype=string, numpy=array([b't', b'o', b'd', ..., b'd', b'!', b'!'], dtype=object)>

In [10]:
#convert characters back to strings
tf.strings.reduce_join(chars, axis=-1).numpy()

b'today we express our deepest gratitude to all those who have served in our armed forces. #thankavet   busy day planned in new york. will soon be making some very important decisions on the people who will be running our government! love the fact that the small groups of protesters last night have passion for our great country. we will all come together and be proud! just had a very open and successful presidential election. now professional protesters, incited by the media, are protesting. very unfair! a fantastic day in d.c. met with president obama for first time. really good meeting, great chemistry. melania liked mrs. o a lot! happy 241st birthday to the u.s. marine corps! thank you for your service!!   such a beautiful and important evening! the forgotten man and woman will never be forgotten again. we will all come together as never before watching the returns at 9:45pm.\n#electionnight #maga__   rt @ivankatrump: such a surreal moment to vote for my father for president of the 

In [11]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_Vocab_Ids(ids), axis=-1)

In [12]:
#make a dataset of the IDs
ids_dataset = tf.data.Dataset.from_tensor_slices(ids)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=char_Ids.get_vocabulary(), invert=True, mask_token=None)

#for each id in the dataset ID print out the character, only for the first 10 IDs
#for ids in ids_dataset.take(10):
    #print(chars_from_ids(ids).numpy().decode('utf-8'))

In [13]:
#create a sequence of 100
#this is done so when there is a sequence of certain words it has a target out put of certain words
#such as the word heart if it has the seuquence length of 4, the input would be hear and the output would be eart
seq_length = 100

examples_per_epoch = len(tweets)//(seq_length+1)

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
      print(chars_from_ids(seq))

tf.Tensor(
[b't' b'o' b'd' b'a' b'y' b' ' b'w' b'e' b' ' b'e' b'x' b'p' b'r' b'e'
 b's' b's' b' ' b'o' b'u' b'r' b' ' b'd' b'e' b'e' b'p' b'e' b's' b't'
 b' ' b'g' b'r' b'a' b't' b'i' b't' b'u' b'd' b'e' b' ' b't' b'o' b' '
 b'a' b'l' b'l' b' ' b't' b'h' b'o' b's' b'e' b' ' b'w' b'h' b'o' b' '
 b'h' b'a' b'v' b'e' b' ' b's' b'e' b'r' b'v' b'e' b'd' b' ' b'i' b'n'
 b' ' b'o' b'u' b'r' b' ' b'a' b'r' b'm' b'e' b'd' b' ' b'f' b'o' b'r'
 b'c' b'e' b's' b'.' b' ' b'#' b't' b'h' b'a' b'n' b'k' b'a' b'v' b'e'
 b't' b' ' b' '], shape=(101,), dtype=string)


In [14]:
#for each seq in sequences print out the text
#so in this case we print out first 100 characters which are a sequence and print out that sequence of text
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'today we express our deepest gratitude to all those who have served in our armed forces. #thankavet  '
b' busy day planned in new york. will soon be making some very important decisions on the people who wi'
b'll be running our government! love the fact that the small groups of protesters last night have passi'
b'on for our great country. we will all come together and be proud! just had a very open and successful'
b' presidential election. now professional protesters, incited by the media, are protesting. very unfai'


In [15]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

#when given a sequence and the first letter is removed output the proper response
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'today we express our deepest gratitude to all those who have served in our armed forces. #thankavet '
Target: b'oday we express our deepest gratitude to all those who have served in our armed forces. #thankavet  '


In [16]:
#Batch size
BATCH_SIZE = 64

#Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

#split data into sequences and shuffle it and pack it into batches
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [17]:
#Building the model
#Length of the vocabulary in chars
vocab_size = len(unique)

#The embedding dimension
embedding_dim = 256

#Number of RNN units
rnn_units = 1024

In [18]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
              return x, states
        else:
              return x

In [19]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(chars_Vocab_Ids.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [20]:
#Try the model
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 59) # (batch_size, sequence_length, vocab_size)


In [21]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  15104     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  60475     
                                                                 
Total params: 4,013,883
Trainable params: 4,013,883
Non-trainable params: 0
_________________________________________________________________


In [22]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

sampled_indices

array([ 8,  1, 30, 28, 14, 39, 52, 39, 34, 43, 37, 47, 29, 22, 26, 28, 41,
        2, 18, 21, 10, 21, 44, 20, 29,  5, 20, 58, 29, 10, 47, 30, 13, 18,
       25, 56,  1, 47, 26, 11, 15, 54, 32, 58, 14, 28, 23, 18,  1, 21,  3,
       33, 42, 28, 37, 58, 52, 45, 21, 30, 22, 33, 58,  6,  7, 36, 10,  2,
       24,  0, 48, 20, 36, 11,  5, 41, 42, 42, 57,  6, 14, 58, 35, 15, 35,
       10, 29,  0, 16, 37,  7,  0, 47, 53, 54, 24,  1, 45,  9,  5],
      dtype=int64)

In [23]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b' here:_   so terrible that crooked didnt report she got the debate questions from donna brazile, if '

Next Char Predictions:
 b'&\n@=.huhclfp?6:=j 25)5m4?#4{?)p@-29y\np:+/wa{.=72\n5!bk=f{un5@6b{$%e) 8[UNK]q4e+#jkkz$.{d/d)?[UNK]0f%[UNK]pvw8\nn(#'


In [24]:
#Training the model
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 59)  # (batch_size, sequence_length, vocab_size)
Mean loss:         4.0778017


In [25]:
tf.exp(mean_loss).numpy()

59.015594

In [26]:
model.compile(optimizer='adam', loss=loss)

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [27]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states

In [29]:
one_step_model = OneStep(model, chars_from_ids, char_Ids)

In [30]:
start = time.time()
states = None
next_char = tf.constant(['when'])
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

whends now - i rane, hamp wonderfur spane! @hillaryclintons and more p.plecking by far (ouring of a mishase_ on @ani-tolica needs te aspmention thans thank you ale. my witcon, she is going to @sen the #vadeaga trump is unged by watch! funny, her 8?m. #sonctionsty in neat sefilly tstroy great! .@ringuq:     #vpyne speeco bear spending. the choige is my time on not. so been way bin gut the median voter.!!"lyoo   thank you doon on any way out of the rnc. #debate i viig in the midul enforcement monna, nobory abow. i was begon ampusing vireiniels. everying the fact!_   thank you fixwy. they are the doll fight, book:   jon! the govermm at the unfits trump. .@hillaryfbid cals bigges is i i am runith say the great for on 11/8, only reforms to get out and vote - thank you horribie in notional puth of you and se. at 7pm. #vedeyto make america great wad to one grout a forert under and will anifup that the mode strongly book! your kaine of years. wow, endidnes by goit ball
gets. hillary clinton to

In [31]:
start = time.time()
states = None
next_char = tf.constant(['beach'])
result = [next_char]

for n in range(160):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

tf.Tensor([b'beach #bagt and of the mild, wo had america worke!   thank you america safe a movement offor do man.?-  #makeamericagreatagain\n  clinton #imikingrizhts me, af 7t tou'], shape=(1,), dtype=string) 

________________________________________________________________________________

Run time: 0.3291184902191162


In [32]:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')





INFO:tensorflow:Assets written to: one_step\assets


INFO:tensorflow:Assets written to: one_step\assets


In [33]:
states = None
next_char = tf.constant(['I love'])
result = [next_char]

for n in range(140):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))

I lover my compentine from puca sofe, in never should take dounn enjors virginia of .@hay ha. the only on truly trump country is fight for on-ith 


In [34]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/my_model')

A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file saved_model already exists.
Error occurred while processing: saved_model.


INFO:tensorflow:Assets written to: saved_model/my_model\assets


INFO:tensorflow:Assets written to: saved_model/my_model\assets
