In [42]:
#import tensorflow for text generation
import Twitter_API as api
import json
import numpy as np
import os
import time
import sys
import nltk
import re
import csv
import keras
import pickle

#stop words is used to remove words such as the, a, this etc.
from nltk.corpus import stopwords

import pandas as pd 
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
import random
import sys
import io

In [2]:
#Read the data into a dataframe
df = pd.read_csv("Data/Twitter.csv")

#Renaming the columns to something more fitting and clean
df = df.rename(columns={'clean_text': 'tweet', 'category': 'sentiment'})
#df = df.rename(columns={'content': 'tweet', 'author': 'user','number_of_likes': 'likes', 'number_of_shares': 'shares'})

#remove all non english tweets
#df = df.drop(df.index[df['language']!='en'])

#Drop empty rows
df = df.dropna()

#store the english stop words
stopWords = stopwords.words('english')

#the tweet will be used for the input of our NLP model
tweets = df['tweet']

#remove https links
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def cleanTweet(tweet):
    
    #convert tweet to lowercase
    tweet = tweet.lower()
    
    #removes any character not alphabetic or numeric
    tweet = pattern.sub('', tweet)
    
    #removes any character not alphabetic or numeric
    tweet = re.sub(r"[^A-Za-z0-9]",' ', tweet)
    
    #return the tweet
    return tweet

tweets = tweets.apply(cleanTweet)

df['tweet'] = tweets

df.head()

Unnamed: 0,tweet,sentiment
0,when modi promised minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,assualt john,-1.0
3,what did just say vote for modi welcome bjp t...,1.0
4,asking his supporters prefix chowkidar their n...,1.0


In [3]:
tweets = df.tweet

n_messages = len(tweets)
n_chars = len(' '.join(map(str, tweets)))

print(f'Count of tweets: {n_messages}')
print(f'Length of combined tweets: {n_chars} characters')

Count of tweets: 162974
Length of combined tweets: 20400257 characters


In [4]:
sample_size = int(len(tweets) * 0.2)

tweets = tweets[:sample_size]
tweets = ' '.join(map(str, tweets)).lower()

#Look at first 250 characters in tweets
tweets[:250] 

'when modi promised  minimum government maximum governance  expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples talk all the nonsense and continue all '

In [5]:
unique = sorted(set(tweets))
print(f'{len(unique)} unique characters')

37 unique characters


In [6]:
#split into individual characters
chars = tf.strings.unicode_split(tweets, input_encoding='UTF-8')
chars

<tf.Tensor: shape=(4171059,), dtype=string, numpy=array([b'w', b'h', b'e', ..., b'd', b'i', b'a'], dtype=object)>

In [7]:
char_Ids = preprocessing.StringLookup(vocabulary=list(unique), mask_token=None)

#convert tokens to character IDs
ids = char_Ids(chars)
ids

<tf.Tensor: shape=(4171059,), dtype=int64, numpy=array([34, 19, 16, ..., 15, 20, 12], dtype=int64)>

In [8]:
#try to get human words from representation
chars_Vocab_Ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=char_Ids.get_vocabulary(), invert=True, mask_token=None)

chars = chars_Vocab_Ids(ids)
chars

<tf.Tensor: shape=(4171059,), dtype=string, numpy=array([b'w', b'h', b'e', ..., b'd', b'i', b'a'], dtype=object)>

In [9]:
#convert characters back to strings
tf.strings.reduce_join(chars, axis=-1).numpy()



In [10]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_Vocab_Ids(ids), axis=-1)

In [11]:
#make a dataset of the IDs
ids_dataset = tf.data.Dataset.from_tensor_slices(ids)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=char_Ids.get_vocabulary(), invert=True, mask_token=None)

#for each id in the dataset ID print out the character, only for the first 10 IDs
#for ids in ids_dataset.take(10):
    #print(chars_from_ids(ids).numpy().decode('utf-8'))

In [12]:
#create a sequence of 100
#this is done so when there is a sequence of certain words it has a target out put of certain words
#such as the word heart if it has the seuquence length of 4, the input would be hear and the output would be eart
seq_length = 100

examples_per_epoch = len(tweets)//(seq_length+1)

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
      print(chars_from_ids(seq))

tf.Tensor(
[b'w' b'h' b'e' b'n' b' ' b'm' b'o' b'd' b'i' b' ' b'p' b'r' b'o' b'm'
 b'i' b's' b'e' b'd' b' ' b' ' b'm' b'i' b'n' b'i' b'm' b'u' b'm' b' '
 b'g' b'o' b'v' b'e' b'r' b'n' b'm' b'e' b'n' b't' b' ' b'm' b'a' b'x'
 b'i' b'm' b'u' b'm' b' ' b'g' b'o' b'v' b'e' b'r' b'n' b'a' b'n' b'c'
 b'e' b' ' b' ' b'e' b'x' b'p' b'e' b'c' b't' b'e' b'd' b' ' b'h' b'i'
 b'm' b' ' b'b' b'e' b'g' b'i' b'n' b' ' b't' b'h' b'e' b' ' b'd' b'i'
 b'f' b'f' b'i' b'c' b'u' b'l' b't' b' ' b'j' b'o' b'b' b' ' b'r' b'e'
 b'f' b'o' b'r'], shape=(101,), dtype=string)


In [13]:
#for each seq in sequences print out the text
#so in this case we print out first 100 characters which are a sequence and print out that sequence of text
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'when modi promised  minimum government maximum governance  expected him begin the difficult job refor'
b'ming the state why does take years get justice state should and not business and should exit psus and'
b' temples talk all the nonsense and continue all the drama will vote for modi  assualt john what did j'
b'ust say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just'
b' relax asking his supporters prefix chowkidar their names modi did great service now there confusion '


In [14]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

#when given a sequence and the first letter is removed output the proper response
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'when modi promised  minimum government maximum governance  expected him begin the difficult job refo'
Target: b'hen modi promised  minimum government maximum governance  expected him begin the difficult job refor'


In [15]:
#Batch size
BATCH_SIZE = 64

#Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

#split data into sequences and shuffle it and pack it into batches
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [16]:
#Building the model
#Length of the vocabulary in chars
vocab_size = len(unique)

#The embedding dimension
embedding_dim = 256

#Number of RNN units
rnn_units = 1024

In [17]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
              return x, states
        else:
              return x

In [18]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(chars_Vocab_Ids.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [19]:
#Try the model
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 38) # (batch_size, sequence_length, vocab_size)


In [20]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  9728      
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  38950     
                                                                 
Total params: 3,986,982
Trainable params: 3,986,982
Non-trainable params: 0
_________________________________________________________________


In [21]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

sampled_indices

array([25,  9, 17, 11, 20, 26, 10, 29, 29,  4, 11, 35,  3, 35, 19, 10, 35,
        3,  9, 26, 31, 33, 29, 32, 25,  5, 31, 29, 37, 34, 27, 10, 16, 21,
       32, 34, 34, 16,  0, 37, 19,  0, 32, 37,  4,  0,  5, 32, 35, 36, 15,
       34, 31, 32, 37, 14, 20, 20, 26, 31, 35, 18, 19,  5,  9, 28, 37, 18,
       31, 17, 23, 30, 35, 34, 12, 19, 26, 19,  6, 28,  1,  5,  7, 27,  6,
       24, 26,  4, 27, 22,  7,  8,  7, 31,  8,  9, 27,  6, 31, 11],
      dtype=int64)

In [22]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'n modis mother name was dragged congress  modi  schemes benefit the richest india rahul gandhi  sche'

Next Char Predictions:
 b'n7f9io8rr29x1xh8x17otvrun3trzwp8ejuwwe[UNK]zh[UNK]uz2[UNK]3uxydwtuzciiotxgh37qzgtflsxwahoh4q 35p4mo2pk565t67p4t9'


In [23]:
#Training the model
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 38)  # (batch_size, sequence_length, vocab_size)
Mean loss:         3.6374078


In [24]:
tf.exp(mean_loss).numpy()

37.99322

In [25]:
model.compile(optimizer='adam', loss=loss)

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [26]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states

In [28]:
one_step_model = OneStep(model, chars_from_ids, char_Ids)

In [35]:
start = time.time()
states = None
next_char = tf.constant(['when'])
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

when took against release you such high time yash make many lakhs joment for enough call for everyone are corored are being quoted but let examour you are misleading people india not modi brand that never seen how the rich hindus bjp got ready people what the charted 200 crore houses taxes years whats human fields future pappu this the tejaswini ananthkumar all waters and 72k thing cong will get voting  centre the movie ppll dont take abt yet remind the film strike and divided india tomorrow priyanka performance how beats patriotic violation the upcoming lok subhas  chilaympal jobsoi 2019  for mla  rahul gandhi the aurangzem headed middle class hard protection your tweets without threatening their votes for your anti national modi supporters help our country let not taking dick out them nehrujn for the northinking the nation hahahaha bhut black antimodi anti india for televisions doing the link black money will bong compaging for bjp you fail cases troll modi more funnier parts rajasth

In [30]:
start = time.time()
states = None
next_char = tf.constant(['beach', 'beach', 'beach', 'beach', 'beach'])
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

tf.Tensor(
[b'beach that congs speeches liar and adani then ignoring muslim work for india   with your vote for congress bharatiya janata party prime minister narendra modi unit actually stading modi who told but shr diniye hain you have your strength against paid refree due fooling people dcom india violent the economy more dialogue many airionary third front concern people like you pleases dgs because and believe that modi govt has anotherly eliming singlechance with information modibjp senior version inciting kept his democraticy even those who have faith door black money stable then diving tshirts crore rupeesrist pmkisans modi does not know check all the same country where some point air india have enecuted without terrorist pak are marketing every indian main   cong rupee have been differentiath election forget the ones the taj mah modis painting  chowkidar india used visit aayog  modi  exclusive official process put tweet says jate gandhi  hey jaya has already said wreng musclin

In [31]:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')





INFO:tensorflow:Assets written to: one_step\assets


INFO:tensorflow:Assets written to: one_step\assets


In [44]:
states = None
next_char = tf.constant(['I love'])
result = [next_char]

for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))

I love country need proterb but stick sigh modi kaarjp won 2019 new traitors chowkidar adag are advertisem


In [43]:
with open("generate.pickle", "wb") as f:
            pickle.dump(model, f)



INFO:tensorflow:Assets written to: ram://03caeb71-4904-4c12-843f-d0f3acab625a/assets


INFO:tensorflow:Assets written to: ram://03caeb71-4904-4c12-843f-d0f3acab625a/assets


NotFoundError: 