## How to preprocess text data in TensorFlow

In [37]:
import tensorflow as tf
import json
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Toy corpus for the tokenizer demo: four short sentences that share most of
# their words, with the last one noticeably longer than the others.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my cat',
    'Do you think my dog is amazing',
]

In [20]:
# Fit a word-level tokenizer on the toy corpus.
#   num_words=100     -> upper bound on the vocabulary used when encoding
#   oov_token="<OOV>" -> placeholder token for words not seen during fitting
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=100,
)
tokenizer.fit_on_texts(sentences)

# word -> integer index mapping learned from `sentences`.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [29]:
# Encode every sentence as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

# Force every row to exactly 5 entries: shorter rows get trailing zeros,
# longer rows lose the tokens past position 5 (both handled at the end, 'post').
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

# Show the vocabulary, the raw sequences and the padded matrix, one per line.
print(word_index, sequences, padded, sep='\n')

{'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 3, 2, 5], [4, 3, 2, 6], [7, 3, 2, 6], [8, 7, 9, 2, 5, 10, 11]]
[[4 3 2 5 0]
 [4 3 2 6 0]
 [7 3 2 6 0]
 [8 7 9 2 5]]


In [33]:
# Held-out sentences for the fitted tokenizer; several of these words
# ('really', 'loves', 'manatee', 'blaa') do not occur in the training corpus.
test_data = [
    'i really love my dog',
    'My dog loves my manatee blaa',
]

In [47]:
# Encode the held-out sentences with the already-fitted tokenizer; words it
# never saw come out as the OOV index (1 in the printed result).
test_seq = tokenizer.texts_to_sequences(test_data)

# Pad (or cut) at the end so that each row has exactly 10 entries.
padded_seq = pad_sequences(
    test_seq,
    maxlen=10,
    padding='post',
    truncating='post',
)
print(padded_seq)

[[4 1 3 2 5 0 0 0 0 0]
 [2 5 1 2 1 1 0 0 0 0]]


## Real example: tokenizing the news-headlines sarcasm dataset

In [59]:
# Load the sarcasm dataset. The file is read line by line and every line is
# parsed as its own JSON document, so `datastore` ends up as a list with one
# parsed record per line.
datastore = []
# `with` closes the file handle even if a line fails to parse (a bare
# `open(...)` in the `for` header never closes it). The encoding is pinned to
# UTF-8 so the result does not depend on the platform's default locale.
with open('News-Headlines-Dataset-For-Sarcasm-Detection/Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as dataset_file:
    for line in dataset_file:
        # Skip blank lines (e.g. a trailing newline at EOF); json.loads('')
        # would raise instead of yielding a record.
        if line.strip():
            datastore.append(json.loads(line))

In [71]:
# Fresh, independent containers for the headline text, the sarcasm label and
# the article URL of each record.
sentences, labels, urls = [], [], []

In [72]:
# Split the records into three parallel lists (same order as `datastore`).
# The lists are rebuilt from scratch rather than appended to, so re-running
# this cell is idempotent and cannot duplicate the dataset.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [86]:
# Second tokenizer, fitted on the real headlines. The encoding vocabulary is
# capped via num_words=1000; words outside it are replaced by the '<OOV>' token.
tokenizer2 = Tokenizer(
    num_words=1000,
    oov_token='<OOV>',
)
tokenizer2.fit_on_texts(sentences)

# NOTE: rebinds `word_index`, which previously held the toy-corpus vocabulary.
word_index = tokenizer2.word_index

In [87]:
# Encode all headlines as index sequences.
sequences = tokenizer2.texts_to_sequences(sentences)

# No maxlen is given, so rows are zero-padded at the end up to the length of
# the longest headline (152 columns in the printed shape).
padded = pad_sequences(sequences, padding='post')

# Inspect the first encoded headline and the overall matrix shape.
print(padded[0], padded.shape, sep='\n')

[  1 355   1   1   1   3 661   1   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
(28619, 152)
