In [14]:
# importing libraries
import json
import os
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [15]:
# Load the reviews file. It is parsed as JSON Lines: every non-empty line is
# decoded as one JSON object.
# The encoding is pinned so the read does not depend on the platform's default
# codec (assumes the file is UTF-8, the JSON default -- TODO confirm), and
# blank lines are skipped because json.loads('') raises.
with open('games_reviews.json', 'r', encoding='utf-8') as handle:
    data = [json.loads(line) for line in handle if line.strip()]

# Review texts, in file order.
reviews = [item['reviewText'] for item in data]

# Binary sentiment labels, aligned index-for-index with `reviews`:
# more than 3 stars -> positive (1); 3 stars or fewer -> negative (0).
labels = [1 if item["overall"] > 3 else 0 for item in data]

In [4]:
# Train on the first 80% of the reviews and hold out the remaining 20%.
# The split point is derived from the loaded data instead of being hard-coded,
# so it stays correct if the dataset changes. Integer arithmetic keeps the
# result exact (for the 231780 reviews this cell was written for it yields
# 185424, the value that used to be typed in by hand).
# NOTE(review): the split is positional, with no shuffling -- confirm that the
# file order is not correlated with the labels.
training_size = len(reviews) * 8 // 10

training_reviews = reviews[:training_size]
testing_reviews = reviews[training_size:]

training_labels = labels[:training_size]
testing_labels = labels[training_size:]


In [16]:

# Settings shared by every tokenising / padding step in this notebook.
vocab_size = 15000       # handed to the Tokenizer as num_words
max_length = 100         # maxlen used for every pad_sequences call
padding_type = 'post'    # `padding` argument for pad_sequences
trunc_type = 'post'      # `truncating` argument for pad_sequences
oov_tok = "<OOV>"        # token the Tokenizer uses for out-of-vocabulary words

# The tokenizer is fitted on the training reviews only; the testing reviews
# are encoded below with that same fitted tokenizer.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_reviews)
word_index = tokenizer.word_index

# Keyword arguments common to both pad_sequences calls.
_pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Training set: texts -> integer sequences -> padded matrix.
# (The misspelled name `traning_sequences` is kept in case other cells use it.)
traning_sequences = tokenizer.texts_to_sequences(training_reviews)
training_padded = pad_sequences(traning_sequences, **_pad_options)

# Testing set: same pipeline, same tokenizer.
testing_sequences = tokenizer.texts_to_sequences(testing_reviews)
testing_padded = pad_sequences(testing_sequences, **_pad_options)

In [17]:
# Convert features and labels to NumPy arrays before handing them to Keras
# (the original note says this is needed to work with TF version 2).
# numpy is already imported as `np` in the notebook's import cell, so the
# duplicate import that used to sit here is dropped. np.asarray is used so
# that inputs which are already ndarrays are passed through without a copy.
training_padded = np.asarray(training_padded)
training_labels = np.asarray(training_labels)
testing_padded = np.asarray(testing_padded)
testing_labels = np.asarray(testing_labels)

In [18]:

embedding_dim = 16

# Sentiment classifier: embedding -> average over the sequence -> dense head
# ending in a single sigmoid unit.
model = tf.keras.Sequential([
    # The input shape is declared explicitly rather than through the Embedding
    # layer's `input_length` argument, which newer Keras releases deprecate.
    tf.keras.Input(shape=(max_length,)),
    # Embedding layer so the network learns an embedding_dim-sized vector
    # for each of the vocab_size token ids.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Global average pooling collapses the sequence axis; it is similar to
    # adding up the word vectors in this case.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# Number of passes over the training data.
num_epochs = 10

# Fit on the training split; the testing split is supplied as validation data
# so its loss/accuracy are reported after every epoch. The returned object is
# kept in `history`.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Hand-written reviews for a quick sanity check of the trained model:
#   1. very negative
#   2. very positive
#   3. slightly negative
new_reviews = [
    "This game is just filled with bugs. Completely garbage",
    "This might be GOTY. Most fun I had with my family.",
    "This game is alright. Nothing special and very grindy",
]

# Encode with the already-fitted tokenizer, then pad/truncate with the same
# settings used for the training data so the dimensions match.
new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded = np.array(
    pad_sequences(
        new_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)

# One sigmoid output per review.
predictions = model.predict(new_padded)
print(predictions)

[[0.0939511 ]
 [0.92526984]
 [0.22808737]]
