In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

In [2]:
# Load the Sentiment140 dataset through TensorFlow Datasets.
#   with_info=True     -> the call returns a second value, bound to `metadata`.
#   as_supervised=True -> examples come back as 2-tuples, which the notebook
#                         later unpacks as (tweet, label).
dataset, metadata = tfds.load(
    name='sentiment140',
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset 77.59 MiB (download: 77.59 MiB, generated: 305.13 MiB, total: 382.73 MiB) to /root/tensorflow_datasets/sentiment140/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Shuffling sentiment140-train.tfrecord...:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test examples...:   0%|          | 0/498 [00:00<?, ? examples/s]

Shuffling sentiment140-test.tfrecord...:   0%|          | 0/498 [00:00<?, ? examples/s]

[1mDataset sentiment140 downloaded and prepared to /root/tensorflow_datasets/sentiment140/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
# Pull the two splits out of the dict returned by tfds.load.
train_ds = dataset['train']
test_ds = dataset['test']

In [4]:
# --- Text-preprocessing / model hyper-parameters -------------------------
vocab_size = 4000       # passed to Tokenizer(num_words=...) and the Embedding layer
embedding_dim = 16      # embedding width; also reused as the LSTM unit count
max_length = 80         # every token sequence is padded/truncated to this length
trunc_type = 'post'     # `truncating` argument for pad_sequences
pad_type = 'post'       # `padding` argument for pad_sequences
oov_tok = "<OOV>"       # token substituted for out-of-vocabulary words

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
def _collect_examples(ds):
    """Materialise a (text, label) dataset into two parallel Python lists.

    Args:
        ds: a dataset yielding ``(tweet, label)`` pairs of eager tensors,
            where ``tweet`` is a scalar string tensor.

    Returns:
        ``(texts, labels)``: ``texts`` is a list of ``str`` and ``labels`` is
        a list of the values returned by ``label.numpy()``.
    """
    texts, labels = [], []
    # take(-1) selects every element of the dataset.
    for tweet, label in ds.take(-1):
        # tweet.numpy() is a `bytes` object. Calling str() on it would store
        # the literal "b'...'" repr (with a b' prefix and escaped non-ASCII
        # bytes), polluting the tokenizer vocabulary, so decode it instead.
        # errors='replace' keeps one malformed byte from aborting the pass.
        texts.append(tweet.numpy().decode('utf-8', errors='replace'))
        labels.append(label.numpy())
    return texts, labels


train_tweets, train_labels = _collect_examples(train_ds)
test_tweets, test_labels = _collect_examples(test_ds)

In [6]:
# Build the vocabulary from the training tweets only; the tokenizer is
# configured with `vocab_size` as num_words and `oov_tok` as the
# out-of-vocabulary placeholder.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(train_tweets)

# Word -> index mapping learned by the fit above.
word_index = tokenizer.word_index

In [14]:
import json

# Persist the fitted tokenizer next to the notebook.
# NOTE(review): to_json() is documented as returning a JSON *string*, so
# serialising it again stores a JSON-encoded string (double encoding).
# A loader presumably has to json.load() the file first and hand the
# resulting string to tokenizer_from_json -- confirm against the consumer.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as out_file:
    json.dump(tokenizer_json, out_file, ensure_ascii=False)

In [8]:
# Turn raw tweets into integer sequences and force every row to exactly
# `max_length` tokens.
train_sequences = tokenizer.texts_to_sequences(train_tweets)
train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             truncating=trunc_type, padding=pad_type)

# The test split must be padded/truncated the same way as the training split.
# Without the explicit arguments pad_sequences falls back to its defaults,
# so validation inputs would be laid out differently from what the model was
# trained on.
test_sequences = tokenizer.texts_to_sequences(test_tweets)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            truncating=trunc_type, padding=pad_type)

In [9]:
# Binarise the labels: False where the raw label is 0, True for anything else.
# NOTE(review): if the test split contains a third (neutral) label value, this
# folds it into the positive class -- confirm against the dataset card.
train_labels_final = np.not_equal(np.array(train_labels), 0)
test_labels_final = np.not_equal(np.array(test_labels), 0)

In [None]:
# Training hyper-parameters for this cell.
num_epochs = 10
learning_rate = 0.0003

# Embedding -> two stacked bidirectional LSTMs -> small dense head.
# The first LSTM returns the full sequence so the second one receives
# per-timestep input; the final layer is a single sigmoid unit to match the
# binary cross-entropy loss.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(embedding_dim, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(embedding_dim)
    ),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)
model.summary()

# Fit on the padded training tweets; the test split is used as validation data.
history = model.fit(
    train_padded,
    train_labels_final,
    epochs=num_epochs,
    validation_data=(test_padded, test_labels_final),
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 80, 16)            64000     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 80, 32)            4224      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 32)                6272      
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 74,769
Trainable params: 74,769
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

In [13]:
# Write the trained model to disk (the .h5 suffix selects the HDF5 format).
model.save(filepath='model1.h5')