In [34]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from  tensorflow.keras.preprocessing.sequence import pad_sequences


from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

# How to tokenize words

In [2]:

sentences = ['I love my dog', 'I love my cat', 'You love your cat!', 'I love you.', 'Do you think my dog is amazing?']

tokenizer = Tokenizer(num_words = 100, oov_token='<00V>')  ## num_words parameter: maximum number of words to keep from a corpus (most frequent ones?)
                                                           ## oov_token handles with 'out of vacabulary' tokens
tokenizer.fit_on_texts(sentences)     ## goes through the corpus and looks for the most frequent words?
word_index = tokenizer.word_index
print(word_index)

{'<00V>': 1, 'love': 2, 'i': 3, 'my': 4, 'you': 5, 'dog': 6, 'cat': 7, 'your': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}


# Turning sentences into data

In [3]:
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[3, 2, 4, 6], [3, 2, 4, 7], [5, 2, 8, 7], [3, 2, 5], [9, 5, 10, 4, 6, 11, 12]]


In [4]:
test_data = ['I really love my dog.', 'my dog loves my hammock']

test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[3, 1, 2, 4, 6], [4, 6, 1, 4, 1]]


To handle sequences with different length one can use a RaggedTensor. Maybe look that up!
Can also be handled with padding; simpler solution! Will for now do this here:

(All sequences will have the same length as the longest, by padding it with 0's in the beginning. If you want them in the end set `pedding = 'post'` in `pad_sequences()`.

Or put the maximum length of the sequences with `maxlen=5` etc.; with truncating you can specify whether words should be chopped of at the end or the beginning: `truncating='post'` or `'pre'`)

In [5]:
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  3  2  4  6]
 [ 0  0  0  3  2  4  7]
 [ 0  0  0  5  2  8  7]
 [ 0  0  0  0  3  2  5]
 [ 9  5 10  4  6 11 12]]


# Recognize sentiment in text

In [6]:
import json
import numpy as np

In [7]:
with open('Sarcasm_Headlines_Dataset_v2.json', 'r') as f:
    datastore = [json.loads(line) for line in f]

In [8]:
sentences = []
labels = []
urls = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [9]:
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
#padding_type='post'
#oov_tok = "<OOV>"
#training_size = 20000

In [10]:
training_sentences = sentences[0:22400]
training_labels = labels[0:22400]

testing_sentences = sentences[22400:]
testing_labels = labels[22400:]

In [11]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<00V>')  #num_words=vocab_size
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding = 'post', truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, 
                               padding = 'post', truncating=trunc_type)

print(padded[0])
print(padded.shape)

[0 0 0 3 2 4 6]
(5, 7)


In [12]:
# Need this block to get it to work with TensorFlow 2.x
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
#model.summary()

In [15]:
num_epochs=30

history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
700/700 - 3s - loss: 0.6385 - accuracy: 0.6491 - val_loss: 0.4908 - val_accuracy: 0.8177 - 3s/epoch - 4ms/step
Epoch 2/30
700/700 - 2s - loss: 0.3884 - accuracy: 0.8431 - val_loss: 0.3695 - val_accuracy: 0.8411 - 2s/epoch - 3ms/step
Epoch 3/30
700/700 - 2s - loss: 0.2963 - accuracy: 0.8832 - val_loss: 0.3497 - val_accuracy: 0.8492 - 2s/epoch - 3ms/step
Epoch 4/30
700/700 - 2s - loss: 0.2521 - accuracy: 0.9008 - val_loss: 0.3364 - val_accuracy: 0.8569 - 2s/epoch - 3ms/step
Epoch 5/30
700/700 - 2s - loss: 0.2177 - accuracy: 0.9159 - val_loss: 0.3481 - val_accuracy: 0.8492 - 2s/epoch - 3ms/step
Epoch 6/30
700/700 - 2s - loss: 0.1941 - accuracy: 0.9283 - val_loss: 0.3466 - val_accuracy: 0.8564 - 2s/epoch - 3ms/step
Epoch 7/30
700/700 - 2s - loss: 0.1739 - accuracy: 0.9352 - val_loss: 0.3589 - val_accuracy: 0.8558 - 2s/epoch - 3ms/step
Epoch 8/30
700/700 - 2s - loss: 0.1567 - accuracy: 0.9431 - val_loss: 0.3818 - val_accuracy: 0.8471 - 2s/epoch - 3ms/step
Epoch 9/30
700/700 - 2s 

In [16]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating=trunc_type)
print(model.predict(padded))

[[0.20531791]
 [0.00509081]]


# Try first AI creating poetry

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [18]:
data = pd.read_csv('mary_oliver.csv')
poem = data.iloc[0,1]

In [19]:
corpus = poem.lower().split('\n')
corpus

['',
 'there is a thing in me that dreamed of trees,',
 'a quiet house, some green and modest acres',
 'a little way from every troubling town,',
 'a little way from factories, schools, laments.',
 'i would have time, i thought, and time to spare,',
 'with only streams and birds for company,',
 'to build out of my life a few wild stanzas.',
 'and then it came to me, that so was death,',
 'a little way away from everywhere.',
 'there is a thing in me still dreams of trees.',
 'but let it go. homesick for moderation,',
 "half the world's artists shrink or fall away.",
 'if any find solution, let him tell it.',
 'meanwhile i bend my heart toward lamentation',
 'where, as the times implore our true involvement,',
 'the blades of every crisis point the way.',
 'i would it were not so, but so it is.',
 'who ever made music of a mild day? ']

In [20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [21]:
input_sequences = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]     ## This creates for each line a series of sequences with
                                               ## only parts of the words: word 1 and 2 for the first
                                               ## sequence, word 1, 2 and 3 for the next and so on
        input_sequences.append(n_gram_sequence)

In [22]:
#Pad the input_sequences:

max_sequence_len = max([len(x) for x in input_sequences])   ## Find the maximum length of the sequences 
padding_style='pre'

input_sequences = np.array(pad_sequences(input_sequences, 
                                         maxlen=max_sequence_len, 
                                         padding=padding_style))



In [23]:
## Split into labels and X

xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words) ## so we can one-hot-encode the labels

In [28]:
labels_for_1hot = labels.reshape(-1,1)  ## This was suggested from an error message, because one-hot-encoder is expecting a 2D array

In [29]:
## One-Hot-Encode:

#Fit the encoder and get the columns from cat_data:
encoder = OneHotEncoder().fit(labels_for_1hot)

#Transform the categorical data with the encoder and put it into a DataFrame
encoded = encoder.transform(labels_for_1hot).toarray()
#encoded_df = pd.DataFrame(encoded,columns=cols)


display(encoded)

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [31]:
encoded[2]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [35]:
model = Sequential()
model.add(Embedding(total_words, 240, input_length = max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=20, verbose=1)

Epoch 1/20


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [1]:
seed_text = "what is this"
next_words = 1
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

NameError: name 'tokenizer' is not defined