In [1]:
import numpy as np
import tensorflow.keras as keras



In [2]:
from preprocessing import *

POS_PATH = '../data/txt_sentoken/pos'
NEG_PATH = '../data/txt_sentoken/neg'

def preprocess_text(text, vocab):
  tokens = text.split()
  translator = str.maketrans('', '', string.punctuation)
  tokens = [s.translate(translator) for s in tokens]
  tokens = [s for s in tokens if s in vocab]
  return tokens

def process_data(dir, vocab):
  result = list()
  for filename in os.listdir(dir):
    path = dir + '/' + filename
    data = load_text(path)
    tokens = preprocess_text(data, vocab)
    result.append(tokens)
  return result

In [3]:
vocab = load_text('vocab.txt')
vocab = vocab.split()

positive_tokens = process_data(POS_PATH, vocab)
negative_tokens = process_data(NEG_PATH, vocab)

training_data = positive_tokens + negative_tokens
len(training_data)

2000

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_data)

encoded_texts = tokenizer.texts_to_sequences(training_data)
max_length = max([len(s) for s in training_data])
print(max_length)
encoded_texts = pad_sequences(encoded_texts, maxlen = max_length, padding = 'post')
labels = np.array([0 for _ in range(1000)] + [1 for _ in range(1000)])
labels.shape

971


(2000,)

In [5]:
from sklearn.model_selection import train_test_split

def split_data(features, labels):
  """
  split_data(features, labels) splits the input data (features, labels) into training
  set, validation set, and testing set split as 60-10-30 i.e. 60% of the input data
  makes up the training set, 10% makes up the validation set, and 30% makes up the
  testing set.
  """
  
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.30, random_state = 42)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.15, random_state = 42)
  return X_train, X_test, X_val, y_train, y_test, y_val

In [6]:
X_train, X_test, X_validation, y_train, y_test, y_validation = split_data(encoded_texts, labels)
X_train.shape

(1190, 971)

In [7]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

5160

In [20]:
model = keras.models.Sequential(
  [keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, input_length = max_length),
  keras.layers.Conv1D(filters = 32, kernel_size = 8, activation = 'relu'),
  keras.layers.MaxPooling1D(pool_size = 2),
  keras.layers.Flatten(),
  keras.layers.Dense(10, activation = 'relu'),
  keras.layers.Dense(1, activation = 'sigmoid')]
)

model.build(input_shape = (32, max_length))
model.summary()




In [21]:
optimizer = keras.optimizers.Adam(learning_rate = 0.0001)

model.compile(loss = 'binary_crossentropy', 
              optimizer = optimizer, 
              metrics = ['accuracy'])

model.fit(X_train, y_train, 
          validation_data = (X_validation, y_validation), 
          batch_size = 32, 
          epochs = 20)

Epoch 1/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.5063 - loss: 0.6939 - val_accuracy: 0.5476 - val_loss: 0.6901
Epoch 2/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.5292 - loss: 0.6848 - val_accuracy: 0.5238 - val_loss: 0.6872
Epoch 3/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.5062 - loss: 0.6821 - val_accuracy: 0.5190 - val_loss: 0.6858
Epoch 4/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.5336 - loss: 0.6727 - val_accuracy: 0.5190 - val_loss: 0.6846
Epoch 5/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.6037 - loss: 0.6587 - val_accuracy: 0.5571 - val_loss: 0.6846
Epoch 6/20
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.7528 - loss: 0.6478 - val_accuracy: 0.5524 - val_loss: 0.6855
Epoch 7/20
[1m38/38[0m [32m━━━━

<keras.src.callbacks.history.History at 0x30d841f10>

In [24]:
_, acc = model.evaluate(X_test, y_test)
print("Model accuracy: " + str(round(100 * acc, 3)) + "%.")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8273 - loss: 0.4461
Model accuracy: 81.167%.


In [27]:
from joblib import dump
import json
import io

tokenizer_json = tokenizer.to_json()
with io.open('../app/tokenizer.json', 'w', encoding = 'utf-8') as f:
  f.write(json.dumps(tokenizer_json, ensure_ascii = False))

dump(model, '../app/model.joblib')

['../app/model.joblib']