In [103]:
import numpy as np
from keras.layers import Embedding, Dense, LSTM, Activation, Dropout, BatchNormalization, GRU, Input
from keras.models import Model, Sequential
from keras import regularizers
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from preprocessing2 import *

In [16]:
def split_data(df, ratio, state=1):
    """
    Split the corpus into train and test partitions.

    :param df: object exposing ``text`` and ``author`` columns (each with ``.values``)
    :param ratio: TRAIN fraction -- forwarded to ``train_test_split`` as
        ``train_size`` (e.g. 0.8 keeps 80% of the rows for training)
    :param state: random_state, fixed by default so the split is repeatable
    :return: tuple ``(X_train, X_test, y_train, y_test)``
    """
    X = df.text.values
    y = df.author.values
    # `ratio` is deliberately passed as train_size, not test_size.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=ratio, random_state=state
    )
    return X_train, X_test, y_train, y_test

In [17]:
def encode_authors(labels):
    """
    Turn author codes into the categorical targets Keras expects.

    :param labels: np.array or list of author codes ("EAP", "HPL" or "MWS");
        any other code raises KeyError
    :return: result of ``to_categorical`` over the class ids, with 3 classes
    """
    authors_vocab = {"EAP": 0, "HPL": 1, "MWS": 2}
    # Look every label up in the fixed code -> class-id table.
    class_ids = list(map(authors_vocab.__getitem__, labels))
    return to_categorical(class_ids, num_classes=3)

In [18]:
def encode_texts(text, embedding_vocab, embedding_size):
    """
    Encode every sentence with ``sentence_to_emb`` and stack the results.

    :param text: iterable of sentences
    :param embedding_vocab: passed through unchanged to ``sentence_to_emb``
    :param embedding_size: passed through unchanged to ``sentence_to_emb``
    :return: np.array built from the per-sentence encodings, in input order
    """
    encoded = []
    for sentence in text:
        encoded.append(sentence_to_emb(sentence, embedding_vocab, embedding_size))
    return np.array(encoded)

In [19]:
# Load the training corpus, then normalise the text column in two passes.
train_df = create_df("train.csv")
train_df["text"] = train_df["text"].apply(clean_text)
# Write the lemmatized text back with item assignment. Attribute assignment
# to a name that is not an existing column (e.g. `train_df.txt = ...`) only
# sets a Python attribute on the frame, so the lemmatized text would never
# reach the split / vocabulary steps that read the "text" column.
# NOTE(review): assumes create_df returns a pandas DataFrame -- confirm.
train_df["text"] = train_df["text"].apply(lemmatize_text)

In [61]:
# Cap handed to get_vocabulary as `length` (presumably the maximum number of
# distinct tokens kept -- confirm in preprocessing2).
vocab_size = 50000
vocab = get_vocabulary(train_df, length=vocab_size)
# presumably a token -> integer-id lookup; verify against embedding_mapping
emb_vocab = embedding_mapping(vocab)
# Number of entries in the mapping; used as the Embedding layer's input_dim.
emb_vocab_size = len(emb_vocab)

In [63]:
# NOTE(review): embed_size is handed to encode_texts here and is also the
# Embedding output dimension in the model cell -- confirm both uses are intended.
embed_size = 64
# 0.8 is forwarded by split_data to train_test_split as train_size.
X_train, X_test, y_train, y_test = split_data(train_df, 0.8)
# Replace the raw sentences of both partitions with their encoded form.
X_train, X_test = (
    encode_texts(part, emb_vocab, embed_size) for part in (X_train, X_test)
)

In [64]:
# Convert the author codes of both partitions to categorical targets.
y_train, y_test = encode_authors(y_train), encode_authors(y_test)

In [112]:
# Training hyperparameters consumed by the model-definition and fit cells.
num_epochs = 3  # passed to model.fit as `epochs`
lstm_size = 50  # first argument of the LSTM layer (number of units)
batch_size = 64  # passed to model.fit as `batch_size`

In [113]:
# Variable-length sequences of integer token ids.
x = Input(shape=(None,), dtype='int32')
# NOTE(review): mask_zero=True makes id 0 a padding marker that later layers
# skip; presumably embedding_mapping never assigns 0 to a real token and
# emb_vocab_size exceeds the largest id -- confirm.
e = Embedding(emb_vocab_size, embed_size, mask_zero=True)(x)
# return_sequences=False: keep only the final LSTM output per sentence.
r = LSTM(lstm_size, return_sequences=False)(e)
dr = Dropout(0.3)(r)
# Each sentence has exactly one author, so the three outputs must form a
# probability distribution: softmax (not independent sigmoids) is the
# activation that matches categorical_crossentropy on one-hot targets.
p = Dense(3, activation='softmax')(dr)

model = Model(x, p)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

In [114]:
# Print the layer-by-layer output shapes and parameter counts as a sanity check.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_28 (Embedding)     (None, None, 64)          1598848   
_________________________________________________________________
lstm_28 (LSTM)               (None, 50)                23000     
_________________________________________________________________
dropout_20 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 3)                 153       
Total params: 1,622,001
Trainable params: 1,622,001
Non-trainable params: 0
_________________________________________________________________


In [115]:
# Train on the encoded training partition; the returned History object is
# the cell's last expression and is therefore displayed.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x18a54bf4e10>

In [116]:
# The model was compiled with metrics=["accuracy"], so evaluate returns
# [loss, accuracy]; index 1 is the held-out accuracy.
scores = model.evaluate(x=X_test, y=y_test)
print("Accuracy:", scores[1])

