In [2]:
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

# Tiny two-sentence corpus used to demonstrate vocabulary building.
sentences = [
    'I love my dog',
    'I love my cat',
]

# Fit a tokenizer on the corpus. num_words=100 is the cap on how many of the
# most frequent words are used when encoding (see the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index is the fitted word -> integer-id mapping; print it for inspection.
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [3]:
# Four-sentence corpus; the last sentence is longer and ends with punctuation.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# Build the vocabulary from the corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Keep both views of the fitted tokenizer: the word -> id mapping and the
# corpus itself encoded as one list of ids per sentence.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [4]:
# Tokenizer with an out-of-vocabulary (OOV) placeholder token.

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# oov_token names the placeholder entry that stands in for words not seen
# during fit_on_texts.
# NOTE(review): the literal is spelled with digit zeros ('<00V>'); presumably
# '<OOV>' was intended. Harmless, since any string works -- confirm.
tokenizer = Tokenizer(num_words=100, oov_token='<00V>')
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

print('___________________')

# Sentences containing words absent from the training corpus
# ('really', 'loves', 'manatee'); encode them with the fitted tokenizer.
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
___________________
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [8]:
# padding 

from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is amazing?',
]

# Fit the vocabulary (with an OOV placeholder) and encode the corpus.
tokenizer = Tokenizer(num_words=100, oov_token="<00V>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# Bring every sequence to a fixed length of 20: zeros are appended after the
# tokens (padding='post'), and anything longer than maxlen would be cut from
# the end (truncating='post').
padded = pad_sequences(
    sequences,
    padding='post',
    maxlen=20,
    truncating='post',
)

print(word_index)
print(sequences)
print(padded)

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 5  3  2  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  3  2  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  3  2  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 8  6  9  2  4 10 11  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [15]:
# Sarcasm detection 
import json 


# Load the sarcasm headlines dataset. The file is read line by line and each
# non-blank line is parsed as one JSON object.
# The `with` block closes the file handle as soon as reading finishes, and the
# encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default code page.
with open('./Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as dataset_file:
    # Skip whitespace-only lines (e.g. a trailing blank line) instead of
    # letting json.loads raise on them.
    data = [json.loads(line) for line in dataset_file if line.strip()]

# Split the records into three parallel lists, one entry per record, in file
# order: the headline text, its sarcasm label, and the source article link.
sentences = [item['headline'] for item in data]
labels = [item['is_sarcastic'] for item in data]
urls = [item['article_link'] for item in data]


In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 

# Build a vocabulary over all headlines; no num_words is passed here, and
# unseen words map to the OOV placeholder.
tokenizer = Tokenizer(oov_token="<00V>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every headline, then pad with trailing zeros (padding='post').
# No maxlen is given, so the padded width is determined by pad_sequences.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Inspect the first padded headline and the overall array shape.
print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [2]:
import tensorflow as tf
# Print the version string of the imported TensorFlow package.
print(tf.__version__)

2.3.1


In [3]:
!pip install -q tensorflow-datasets

In [4]:
import tensorflow_datasets as tfds 

# Fetch the 'imdb_reviews' dataset through TensorFlow Datasets.
# with_info=True makes load() return a second value, bound to `info` here.
# as_supervised=True is presumably requested to get (input, label) pairs --
# confirm against the tfds.load documentation.
imdb, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

ImportError: cannot import name 'OrderedDict' from 'typing' (c:\users\shouv\anaconda3\envs\tensorflow2\lib\typing.py)