In [38]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pickle
from keras.callbacks import EarlyStopping
from sklearn.utils import shuffle


In [57]:
# This is the raw text
file = open('../dataset/news_pickles/train_test_val_Human', 'rb')
X_human_train, X_human_test, X_human_val = pickle.load(file)


## Data processing

In [39]:
file = open('../dataset/news_pickles/train_test_val_Human_posTags', 'rb')
X_human_train_posTags, X_human_test_posTags, X_human_val_posTags = pickle.load(file)
file.close()

Y_human_train = np.zeros(len(X_human_train_posTags))
Y_human_test = np.zeros(len(X_human_test_posTags))
Y_human_val = np.zeros(len(X_human_val_posTags))


file = open('../dataset/news_pickles/train_test_val_GPT_posTags', 'rb')
X_gpt_train_posTags, X_gpt_test_posTags, X_gpt_val_posTags = pickle.load(file)
file.close()

Y_gpt_train = np.ones(len(X_gpt_train_posTags))
Y_gpt_test = np.ones(len(X_gpt_test_posTags))
Y_gpt_val = np.ones(len(X_gpt_val_posTags))

In [40]:
len(X_human_train_posTags), len(Y_human_train), len(X_human_test_posTags), len(Y_human_test), len(X_human_val_posTags), len(Y_human_val)

(435, 435, 95, 95, 93, 93)

In [41]:
X_train = np.hstack((X_human_train_posTags, X_gpt_train_posTags))
Y_train = np.hstack((Y_human_train, Y_gpt_train))

X_test = np.hstack((X_human_test_posTags, X_gpt_test_posTags))
Y_test = np.hstack((Y_human_test, Y_gpt_test))

X_val = np.hstack((X_human_val_posTags, X_gpt_val_posTags))
Y_val = np.hstack((Y_human_val, Y_gpt_val))


full_dataset = np.hstack((X_train, X_test, X_val))

In [42]:
X_train = [' '.join(data) for data in X_train]
X_test = [' '.join(data) for data in X_test]
X_val = [' '.join(data) for data in X_val]

full_dataset = [' '.join(data) for data in full_dataset]


In [59]:
vocab_size = 55
oov_tok = ''
embedding_dim = 150
max_length = 200

padding_type='post'
trunc_type='post'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

tokenizer.fit_on_texts(full_dataset)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = pad_sequences(val_sequences, padding='post', maxlen=max_length)

# Assuming 'tokenizer' is your trained Keras Tokenizer. SAVE THIS TO USE IN PREDICTION
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
train_padded, Y_train = shuffle(train_padded, Y_train)
test_padded, Y_test = shuffle(test_padded, Y_test)
val_padded, Y_val = shuffle(val_padded, Y_val)

In [45]:
train_padded.shape, val_padded.shape

((870, 200), (186, 200))

## LSTM Model

In [46]:
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])


model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 150)          8250      
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               110080    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 122491 (478.48 KB)
Trainable params: 122491 (478.48 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [47]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

callback = EarlyStopping(monitor='accuracy', patience=5)

In [48]:
num_epochs = 27
history = model.fit(train_padded, Y_train, 
                    epochs=num_epochs, 
                    verbose=1, 
                    shuffle=True,
                    validation_data=(val_padded, Y_val),
                    callbacks=[callback])

Epoch 1/27
Epoch 2/27
Epoch 3/27
Epoch 4/27
Epoch 5/27
Epoch 6/27
Epoch 7/27
Epoch 8/27
Epoch 9/27
Epoch 10/27
Epoch 11/27
Epoch 12/27
Epoch 13/27
Epoch 14/27
Epoch 15/27
Epoch 16/27
Epoch 17/27
Epoch 18/27
Epoch 19/27
Epoch 20/27
Epoch 21/27
Epoch 22/27
Epoch 23/27
Epoch 24/27
Epoch 25/27
Epoch 26/27
Epoch 27/27


In [52]:
model.save('../classifiers/trained_models/LSTM_high_acc9437_9140_9157.h5')

In [53]:
score = model.evaluate(test_padded, Y_test, verbose=2)
print('Evaluation loss:', score[0])
print('Evaluation accuracy:', score[1])

6/6 - 0s - loss: 0.3309 - accuracy: 0.9053 - 324ms/epoch - 54ms/step
Evaluation loss: 0.33089977502822876
Evaluation accuracy: 0.9052631855010986
