<a href="https://colab.research.google.com/github/Shreyanka99/DBMS-Project-Hotel-Management/blob/main/Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus: sentences that differ mostly in letter case and punctuation.
sentences = [
    "I love my dog",
    "I love my dog!!",
    "I love my cat",
    "I think I'm an amazing mom to my dog.",
    "I think i'm an Amazing mom to my cat",
]

# num_words caps the vocabulary used when encoding; words the tokenizer has
# not seen are encoded with the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
# Build the word -> integer index vocabulary from the corpus
# (this only fits the tokenizer; no model is trained here).
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print("Word Indexes:", word_index, sep="\n")

# Encode every sentence as the list of its word indexes.
sequences = tokenizer.texts_to_sequences(sentences)
print("\nSequences", sequences, sep="\n")

# Zero-pad at the end up to 7 entries per row. Sentences longer than 7 tokens
# are cut from the front, which is why the last two rows lose "i think".
padded = pad_sequences(sequences, padding="post", maxlen=7)
print("\nPadded Sequences", padded, sep="\n")

Word Indexes:
{'<OOV>': 1, 'i': 2, 'my': 3, 'love': 4, 'dog': 5, 'cat': 6, 'think': 7, "i'm": 8, 'an': 9, 'amazing': 10, 'mom': 11, 'to': 12}

Sequences
[[2, 4, 3, 5], [2, 4, 3, 5], [2, 4, 3, 6], [2, 7, 8, 9, 10, 11, 12, 3, 5], [2, 7, 8, 9, 10, 11, 12, 3, 6]]

Padded Sequences
[[ 2  4  3  5  0  0  0]
 [ 2  4  3  5  0  0  0]
 [ 2  4  3  6  0  0  0]
 [ 8  9 10 11 12  3  5]
 [ 8  9 10 11 12  3  6]]


In [None]:
# Sentences containing words the tokenizer did not see while fitting.
# A list (rather than a set literal) keeps the sentences in a defined order,
# so test_sequences[i] always corresponds to test_data[i].
test_data = [
    "My Dog is very friendly",
    "He loves my mom.",
]
print(word_index)

# Words missing from the fitted vocabulary are encoded as the "<OOV>" index (1).
test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

{'<OOV>': 1, 'i': 2, 'my': 3, 'love': 4, 'dog': 5, 'cat': 6, 'think': 7, "i'm": 8, 'an': 9, 'amazing': 10, 'mom': 11, 'to': 12}
[[3, 5, 1, 1, 1], [1, 1, 3, 11]]


In [None]:
# PART 2: Sarcasm detection on news headlines

In [71]:
# Data format: a JSON list of records, where "is_sarcastic" is 0 (not sarcastic) or 1 (sarcastic). Sample records:

# [
# {"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0},
# {"article_link": "https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365", "headline": "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "is_sarcastic": 0},
# {"article_link": "https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697", "headline": "mom starting to fear son's web series closest thing she will have to grandchild", "is_sarcastic": 1},
# {"article_link": "https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302", "headline": "boehner just wants wife to listen, not come up with alternative debt-reduction ideas", "is_sarcastic": 1},
# {"article_link": "https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15e64fdcb", "headline": "j.k. rowling wishes snape happy birthday in the most magical way", "is_sarcastic": 0},
# {"article_link": "https://www.huffingtonpost.com/entry/advancing-the-worlds-women_b_6810038.html", "headline": "advancing the world's women", "is_sarcastic": 0},
# {"article_link": "https://www.huffingtonpost.com/entry/how-meat-is-grown-in-a-lab_us_561d1189e4b0c5a1ce607e86", "headline": "the fascinating case for eating lab-grown meat", "is_sarcastic": 0}

In [55]:
import json 
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [56]:
!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

--2023-06-06 15:05:47--  https://storage.googleapis.com/learning-datasets/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.111.128, 142.251.163.128, 142.251.167.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.111.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2023-06-06 15:05:47 (204 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [57]:
# Hyperparameters shared by the tokenizer, padding and model cells below.
vocab_size = 10_000      # tokenizer num_words and Embedding input size
training_size = 20_000   # number of leading examples used for training
embedding_dim = 16       # width of each learned word vector
max_len = 100            # padded/truncated sequence length
num_epochs = 30          # epoch count for training

In [58]:
# The first `training_size` examples form the training split; everything after
# that index is held out for testing. Sentences and labels are cut at the same
# index so they stay aligned.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [59]:
# Fit the vocabulary on the training split only; the testing sentences are
# encoded with that same vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Pad and truncate at the end so both splits use the same max_len layout.
pad_options = dict(maxlen=max_len, padding="post", truncating="post")

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_pads = pad_sequences(training_sequences, **pad_options)
testing_pads = pad_sequences(testing_sequences, **pad_options)


In [60]:
# Binary classifier: embed each token, average the embeddings over the
# sequence, then pass the result through a small dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation="relu"))
# Single sigmoid unit, paired with binary cross-entropy against the 0/1 labels.
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [61]:
# Print each layer's output shape and parameter count, plus the totals.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_2   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 24)                408       
                                                                 
 dense_5 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [62]:
# Required for TensorFlow 2.x: convert the padded inputs and the label lists
# to NumPy arrays before they are handed to the model.
import numpy as np

training_pads, training_labels = np.array(training_pads), np.array(training_labels)
testing_pads, testing_labels = np.array(testing_pads), np.array(testing_labels)

In [None]:
# Train using the epoch count from the hyperparameter cell (num_epochs) rather
# than a duplicated literal, validating against the held-out split.
history = model.fit(
    training_pads,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_pads, testing_labels),
    verbose=2,
)

In [70]:
# Unseen sentences to score with the trained model.
new_sentences = [
    "WOW, oh really?",
    "Im not impressed!",
    "That's great",
    "It is a beautiful day",
]

# Encode and pad exactly as the training data was: same fitted tokenizer and
# the shared max_len constant instead of a separate hard-coded length.
new_sequences = tokenizer.texts_to_sequences(new_sentences)
new_padded = pad_sequences(new_sequences, maxlen=max_len, padding="post", truncating="post")

# One sigmoid output per sentence (the model was trained on the is_sarcastic labels).
model.predict(new_padded)



array([[1.04717137e-02],
       [9.99778748e-01],
       [5.53088129e-01],
       [1.12444286e-04]], dtype=float32)

  new_sequences = np.array(new_sequences)
