In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
!wget "https://raw.githubusercontent.com/robitussin/CCDEPLRL_EXERCISES/main/datasets/reviews.json"

--2024-05-23 08:59:56--  https://raw.githubusercontent.com/robitussin/CCDEPLRL_EXERCISES/main/datasets/reviews.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 181920 (178K) [text/plain]
Saving to: ‘reviews.json.5’


2024-05-23 08:59:56 (8.25 MB/s) - ‘reviews.json.5’ saved [181920/181920]



In [3]:
import numpy as np
import pandas as pd

dataset = pd.read_json('reviews.json')

In [4]:
mask = (dataset['rating'] > 0) & (dataset['rating'] < 4)
column_name = 'rating'
dataset.loc[mask, column_name] = 0

mask = (dataset['rating'] > 3) & (dataset['rating'] < 6)
column_name = 'rating'
dataset.loc[mask, column_name] = 1

In [5]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


In [6]:
#Tokenize Data
sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()


#Training Split
training_size = int(len(sentences) * 0.8)
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

vocab_size = 3100
embedding_dim = 16
max_length = 100
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words = vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

In [7]:
#Sequence Data
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

In [8]:
#Padding Data
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type)

In [9]:
#Training
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           49600     
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 8)                 136       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 49745 (194.32 KB)
Trainable params: 49745 (194.32 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
history = model.fit(training_padded, training_labels_final, epochs=25, validation_data=(testing_padded, testing_labels_final))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


**Get files for visualing the network**

In [11]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(3100, 16)


In [12]:
import io

# Create the reverse word index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Write out the embedding vectors and metadata
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [13]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Fake Reviews**

In [16]:
fake_reviews = ['Ang ganda worht it thank you', 'Hindi ko gusto ang phone', 'Ang panget ng spaghetti', 'Ang ganda ng Shorts', 'Super Worth it ang panget', 'wag bumili dito', 'Ang ganda ng produkto worth it!', 'Thank you ang panget ng product', 'Panget yung damit', 'Sir okay armygreen shorts nice', 'Halatang ginupit ang mga brick wall nakaka dismaya wag kayo omorder jan', 'Ang ganda super worth it ang product']

print(fake_reviews)

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)
classes = model.predict(fakes_padded)

for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')

['Ang ganda worht it thank you', 'Hindi ko gusto ang phone', 'Ang panget ng spaghetti', 'Ang ganda ng Shorts', 'Super Worth it ang panget', 'wag bumili dito', 'Ang ganda ng produkto worth it!', 'Thank you ang panget ng product', 'Panget yung damit', 'Sir okay armygreen shorts nice', 'Halatang ginupit ang mga brick wall nakaka dismaya wag kayo omorder jan', 'Ang ganda super worth it ang product']
Ang ganda worht it thank you
[0.76329434]


Hindi ko gusto ang phone
[0.43379864]


Ang panget ng spaghetti
[0.44276875]


Ang ganda ng Shorts
[0.63875586]


Super Worth it ang panget
[0.5861864]


wag bumili dito
[0.4984521]


Ang ganda ng produkto worth it!
[0.6646787]


Thank you ang panget ng product
[0.71928966]


Panget yung damit
[0.43196833]


Sir okay armygreen shorts nice
[0.6784393]


Halatang ginupit ang mga brick wall nakaka dismaya wag kayo omorder jan
[0.21415734]


Ang ganda super worth it ang product
[0.7506798]


