# Spooky Author Identification

In [27]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential

In [7]:
# Dataset
train = pd.read_csv("Data/train.csv", index_col=['id'])
test = pd.read_csv("Data/test.csv", index_col=['id'])

print(train.shape, test.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1)
{'author'}


In [8]:
train.head(5)

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [9]:
test.head(5)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
id02310,"Still, as I urged our leaving Ireland with suc..."
id24541,"If a fire wanted fanning, it could readily be ..."
id00134,And when they had broken down the frail door t...
id27757,While I was thinking how I should possibly man...
id04081,I am not sure to what limit his knowledge may ...


# One-hot Encoding на авторите

In [33]:
authors = ['EAP', 'HPL', 'MWS']

train_authors = pd.get_dummies(train.author)
train_authors[:5]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id26305,1,0,0
id17569,0,1,0
id11008,1,0,0
id27763,0,0,1
id12958,0,1,0


# Tokenization на думите

In [17]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(train.text))
train_tokenized = pad_sequences(tokenizer.texts_to_sequences(train.text), 50)
test_tokenized = pad_sequences(tokenizer.texts_to_sequences(test.text), 50)

In [23]:
print('Train shape:',train_tokenized.shape)
print('Test shape:',test_tokenized.shape)
train_tokenized[0]

Train shape: (19579, 50)
Test shape: (8392, 50)


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,   26, 2945,
        143, 1372,   22,   36,  294,    2, 7451,    1, 2440,    2,   10,
       4556,   16,    6,   79,  179,   48, 4245,    3,  295,    4,    1,
        249, 1943,    6,  326,   74,  134,  123,  891,    2,    1,  313,
         39, 1438, 4928,   98,    1,  430], dtype=int32)

# Моделът v.1

In [53]:
model = Sequential([
    Embedding(20000, 50, input_length=50),
    Bidirectional(LSTM(50, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(3, activation="softmax")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 50)            1000000   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 50, 100)           40400     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 100)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_9 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 3)                 153       
Total params: 1,045,603
Trainable params: 1,045,603
Non-trainable params: 0
_________________________________________________________________


In [54]:
model.fit(train_tokenized, train_authors, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f70bf921d68>

In [55]:
predicted  = model.predict(test_tokenized, verbose=1)
submission = pd.DataFrame(data=predicted,columns=authors, index=test.index )
submission[:5]



Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.052269,0.036064,0.911667
id24541,0.960251,0.034445,0.005303
id00134,0.040144,0.948991,0.010865
id27757,0.955789,0.037663,0.006548
id04081,0.863312,0.028282,0.108406


In [63]:
submission.to_csv('lstm.csv')

# Резултат: 0.41492

# Втори опит:

In [57]:
model2 = Sequential([
    Embedding(20000, 100, input_length=50),
    Bidirectional(LSTM(100, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(100, activation="relu"),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(3, activation="softmax")
])

model2.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 50, 200)           160800    
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 200)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_11 (Dropout)         (None, 50)                0         
__________

In [58]:
model2.fit(train_tokenized, train_authors, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f70bf921ef0>

In [66]:
predicted2  = model2.predict(test_tokenized, verbose=1)
submission2 = pd.DataFrame(data=predicted2,columns=authors, index=test.index )
submission2[:5]



Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.00647,0.003821,0.989709
id24541,0.939634,0.043457,0.01691
id00134,0.087537,0.893623,0.018841
id27757,0.678134,0.305132,0.016734
id04081,0.837158,0.06541,0.097432


In [67]:
submission2.to_csv('lstm2.csv')

# Резултат: 0.43396

# Комбинация от двете

In [72]:
lstm1 = pd.read_csv('lstm.csv')
lstm2 = pd.read_csv('lstm2.csv')

combined = (lstm1.drop('id', axis=1) + lstm2.drop('id', axis=1)) / 2
combined.index = lstm1['id']
combined[:3]

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.02937,0.019942,0.950688
id24541,0.949943,0.038951,0.011106
id00134,0.06384,0.921307,0.014853


In [73]:
combined.to_csv('combined.csv')

# Резултат: 0.39558