In [None]:
import os
from importlib import reload
from gensim import models
from sklearn import metrics
import spacy
import time
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Activation, Convolution2D, MaxPooling2D, Flatten, Input, Dropout, Concatenate
from sklearn import metrics

In [None]:
# Hardware switch; none of the cells visible in this notebook read it.
USING_GPU = False

# Project root. Every other path in the notebook is derived from it.
BEHIND_THE_WORDS_DIR = "./"

# Data folder under the project root; handed to the protobuf dataloader.
DATA_DIR = os.path.join(BEHIND_THE_WORDS_DIR, "data")

In [None]:
# Project helper that builds the word-vector lookup object used by process_text.
from utils.load_word2vec import load_word2vec

# Path to the gzipped Google News word2vec file inside the project tree.
w2v_model_path = os.path.join(BEHIND_THE_WORDS_DIR, "data/gensim/word2vec-google-news-300.gz")
# The second argument is a localhost URL; what load_word2vec does with it is not
# visible here — presumably a vector-serving endpoint. TODO confirm.
word2vec = load_word2vec(w2v_model_path, "http://127.0.0.1:7070")

In [None]:
# spaCy pipeline used by process_text to split text into tokens.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Default number of token rows every processed text is truncated/padded to.
MAX_TOKEN_LENGTH = 384
# Width of one padding row; 300 matches the "word2vec-google-news-300" vectors
# loaded earlier in this notebook.
EMBEDDING_DIM = 300

def process_text(text, max_token_length=None):
  """Convert raw text into a fixed-length list of word-embedding rows.

  The text is tokenized with the module-level spaCy pipeline ``nlp``, each
  token is lower-cased, and the tokens are looked up through the module-level
  ``word2vec`` object. The result is cut to ``max_token_length`` rows and,
  when shorter, padded at the end with all-zero rows of ``EMBEDDING_DIM``
  entries.

  Args:
    text: The string to embed.
    max_token_length: Number of rows in the returned list. ``None`` (the
      default) means ``MAX_TOKEN_LENGTH``.

  Returns:
    A list with exactly ``max_token_length`` rows (each row a list of numbers).

  Raises:
    ValueError: If ``max_token_length`` is negative.
  """
  # Identity check: `== None` can be hijacked by objects overriding __eq__.
  if max_token_length is None:
    max_token_length = MAX_TOKEN_LENGTH
  if max_token_length < 0:
    # A negative bound would silently turn the slice below into "drop the last n".
    raise ValueError("max_token_length must be non-negative")

  doc = nlp(text)
  words = [token.lower_ for token in doc]
  # Honor the caller-supplied length; the global is only the default.
  embeddings = word2vec.get_vec(words).tolist()[:max_token_length]
  # Build every padding row separately so the rows do not alias one list object.
  padding = [[0] * EMBEDDING_DIM for _ in range(max_token_length - len(embeddings))]

  return embeddings + padding

In [None]:
# Protobuf schema module (the _pb2 suffix suggests protoc-generated code); its
# EssayEmbedding message class is handed to the dataloader further down.
import protobufs.word_embedding_pb2 as EssayEmbedding

# Project module that provides the ProtobufDataloader class.
import loader.ProtobufDataloader as ProtobufDataloader

# Re-execute the loader module so edits made to it on disk are picked up
# without restarting the kernel.
reload(ProtobufDataloader)

def WordEmbeddingParser(protobuf):
    """Flatten a decoded embedding message into plain nested lists.

    Iterates over ``protobuf.embedding`` and copies each entry's ``value``
    sequence into a new Python list.

    Returns:
        A list with one inner list of values per entry in
        ``protobuf.embedding``, in the same order.
    """
    return [list(row.value) for row in protobuf.embedding]

# Dataloader rooted at DATA_DIR, configured with the EssayEmbedding message
# class and the WordEmbeddingParser function. How the loader combines the two
# is not visible here — presumably it decodes each record as an EssayEmbedding
# and passes it through the parser. TODO confirm.
word_embedding_protobuf_dataloader = ProtobufDataloader.ProtobufDataloader(DATA_DIR, protobuf=EssayEmbedding.EssayEmbedding, parser=WordEmbeddingParser)


In [None]:
# Fetch the two datasets through the same loader, the same folder list and the
# same take=18000 argument. Each call is unpacked into three values that are
# named train / test / valid, in that order.
# NOTE(review): the order of the returned splits is taken from the variable
# names only — confirm against ProtobufDataloader.get.
# "essayforum-384" is labelled 0 ("real") and "own-384" is labelled 1 ("fake")
# when the generators are built later in the notebook.
real_train_protobuf, real_test_protobuf, real_valid_protobuf = word_embedding_protobuf_dataloader.get("essayforum-384", folders=["processed-v1", "cnn", "word-embedding"], take=18000)
fake_train_protobuf, fake_test_protobuf, fake_valid_protobuf = word_embedding_protobuf_dataloader.get("own-384", folders=["processed-v1", "cnn", "word-embedding"], take=18000)

In [None]:
import loader.ProtobufGenerator as ProtobufGenerator

# Reload the module imported on the line above so on-disk edits to the
# generator take effect without a kernel restart. (The original reloaded
# ProtobufDataloader here, which left ProtobufGenerator stale.)
reload(ProtobufGenerator)

# Number of samples per batch for all three generators.
BATCH_SIZE = 32

def make_binary_generator(real_samples, fake_samples, batch_size=BATCH_SIZE):
    """Build a ProtobufGenerator over one real/fake pair of sample collections.

    Args:
        real_samples: Samples that receive label 0.
        fake_samples: Samples that receive label 1.
        batch_size: Forwarded to ProtobufGenerator as ``batch_size``.

    Returns:
        A ProtobufGenerator constructed from ``[real_samples, fake_samples]``
        and two matching label lists (all 0s, all 1s), one label per sample.
    """
    return ProtobufGenerator.ProtobufGenerator(
        [real_samples, fake_samples],
        [[0] * len(real_samples), [1] * len(fake_samples)],
        batch_size=batch_size,
    )

train_generator_protobuf = make_binary_generator(real_train_protobuf, fake_train_protobuf)
test_generator_protobuf = make_binary_generator(real_test_protobuf, fake_test_protobuf)
valid_generator_protobuf = make_binary_generator(real_valid_protobuf, fake_valid_protobuf)

In [None]:
# Shape of the first element of the test generator's first batch. The model
# cell uses shape[1:] as the network input shape and shape[2] as the
# convolution kernel width.
# NOTE(review): assumes a batch is indexable as (inputs, labels) and that the
# inputs are 4-D (batch, rows, cols, channels) — confirm against ProtobufGenerator.
shape = test_generator_protobuf[0][0].shape

In [None]:

def _window_branch(source, window_rows):
  """Build one convolution branch for a given kernel height.

  The kernel is ``window_rows`` tall and ``shape[2]`` wide. It is followed by
  a ReLU, a max-pool whose height equals the convolution output height (so
  the row axis collapses to a single position), and 25% dropout.
  """
  features = Convolution2D(
    filters=32,
    kernel_size=(window_rows, shape[2]),
    padding="valid",
    data_format='channels_last',
  )(source)
  activated = Activation('relu')(features)
  pooled = MaxPooling2D(
    pool_size=(features.shape[1], 1),
    strides=1,
    padding='valid',
    data_format='channels_last',
  )(activated)
  return Dropout(0.25)(pooled)


# Network input: one sample, i.e. the batch shape without its leading axis.
inputs = Input(shape=shape[1:])

# Kernel heights, one parallel branch per entry.
rows_counts = [3, 5, 7, 9, 11, 13, 15]

# Branches are created in list order and joined along axis 1.
convs = Concatenate(axis=1)([_window_branch(inputs, rows) for rows in rows_counts])

flatten = Flatten()(convs)

# Dense head: (units, dropout rate) per hidden layer, then a single sigmoid unit.
x = flatten
for units, drop_rate in ((128, 0.25), (16, 0.1)):
  x = Dense(units, activation="relu")(x)
  x = Dropout(drop_rate)(x)
outputs = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Train for 25 epochs on the training generator, evaluating on the validation
# generator during training; the returned object is kept for the plotting cell,
# which reads history.history.
history = model.fit(x=train_generator_protobuf, validation_data=valid_generator_protobuf, epochs=25)

In [None]:
# `plt` was used here without ever being imported in the notebook, so the cell
# failed with NameError on a fresh kernel.
import matplotlib.pyplot as plt


def plot_history_metric(history, metric, title, ylabel):
    """Plot one training metric and its validation counterpart per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys ``metric`` and ``'val_' + metric``.
        metric: Key of the training curve, e.g. ``'accuracy'`` or ``'loss'``.
        title: Figure title.
        ylabel: Label for the y axis.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


# Plot training & validation accuracy values
plot_history_metric(history, 'accuracy', 'Model accuracy', 'Accuracy')

# Plot training & validation loss values
plot_history_metric(history, 'loss', 'Model loss', 'Loss')

In [None]:
# Evaluate the trained model on the test generator with progress output on.
# Given the compile() call, the result holds the loss and accuracy; it is
# stored but not displayed or used by the visible cells.
score = model.evaluate(x=test_generator_protobuf, verbose=True)

In [None]:
result = model.predict(test_generator_protobuf)

# Turn each sample's single output value into a hard label with a 0.5 cut-off.
y_preds = [1 if row[0] >= 0.5 else 0 for row in result.tolist()]

# NOTE(review): this treats `labels` as aligned one-to-one with the prediction
# order — confirm the generator neither shuffles nor drops a partial batch.
y_test = test_generator_protobuf.labels

print("Model:", "CNN")
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_preds))
print("Classification Report:\n", metrics.classification_report(y_test, y_preds))
accuracy_pct = metrics.accuracy_score(y_test, y_preds) * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Project helper; its return value is used as the save path below.
# Presumably it creates any missing parent directories and returns the path
# it was given — TODO confirm.
from utils.dir import make_dir

# The Unix timestamp (whole seconds) in the file name keeps successive runs
# from overwriting each other's saved model.
MODEL_PATH = make_dir(f'./models/cnn/model-cnn-{int(time.time())}.keras')
model.save(MODEL_PATH)
print(f"[CNN] Saved at {MODEL_PATH}")