In [1]:
import pandas as pd
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

In [2]:
train = pd.read_csv('data/train.csv')
test  = pd.read_csv('data/test.csv')

In [3]:
# shuffle data frame
train = train.sample(frac=1, random_state=0)

In [4]:
# tokenise and listify phrases
train_tokens = train.text.str.replace(r"\W", " ").str.split()
test_tokens = test.text.str.replace(r"\W", " ").str.split()

In [5]:
train_tokens[:3]

16527    [There, seemed, to, be, a, void, and, nothing,...
6398     [This, event, caused, many, of, those, who, we...
10604    [I, hastily, gave, my, consent, to, this, arra...
Name: text, dtype: object

In [6]:
train.author[:3]

16527    HPL
6398     MWS
10604    MWS
Name: author, dtype: object

In [7]:
len(train_tokens)

19579

##  Train a doc2vec model using labeled sentences

In [8]:
documents = []

for i in range(0,len(train_tokens)):
    # print(train.author[0])
    # print(train_tokens[0])
    documents.append( LabeledSentence(words=list(train_tokens[i]), tags=[train.author[i]]) )

In [9]:
documents[:5]

[LabeledSentence(words=['This', 'process', 'however', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', 'as', 'I', 'might', 'make', 'its', 'circuit', 'and', 'return', 'to', 'the', 'point', 'whence', 'I', 'set', 'out', 'without', 'being', 'aware', 'of', 'the', 'fact', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall'], tags=['EAP']),
 LabeledSentence(words=['It', 'never', 'once', 'occurred', 'to', 'me', 'that', 'the', 'fumbling', 'might', 'be', 'a', 'mere', 'mistake'], tags=['HPL']),
 LabeledSentence(words=['In', 'his', 'left', 'hand', 'was', 'a', 'gold', 'snuff', 'box', 'from', 'which', 'as', 'he', 'capered', 'down', 'the', 'hill', 'cutting', 'all', 'manner', 'of', 'fantastic', 'steps', 'he', 'took', 'snuff', 'incessantly', 'with', 'an', 'air', 'of', 'the', 'greatest', 'possible', 'self', 'satisfaction'], tags=['EAP']),
 LabeledSentence(words=['How', 'lovely', 'is', 'spring', 'As', 'we', 'looked', 'from', 'Windsor', 'Terrace', 'on', 

In [10]:
# build the doc2vec model using the labeled sentences
# TODO tune the hyperparameters
model = Doc2Vec(size=50, window=8, min_count=10, workers=1, seed=0, iter =10)
model.build_vocab(documents)
model.train(documents)

4880140

## Sanity checks and superficial evaluation

In [11]:
# get an example sentence and the corresponding author
s = train_tokens[3]
train.author[3]

'MWS'

In [12]:
# the example sentence
" ".join(s)

'How lovely is spring As we looked from Windsor Terrace on the sixteen fertile counties spread beneath speckled by happy cottages and wealthier towns all looked as in former years heart cheering and fair'

In [13]:
# create a test vector
infer_vector = model.infer_vector(s)
# get the top 3 most similar document labels, here: the authors
similar_documents = model.docvecs.most_similar([infer_vector], topn = 3)

In [14]:
similar_documents

[('MWS', 0.506929874420166),
 ('EAP', 0.09175601601600647),
 ('HPL', -0.014733417890965939)]

In [15]:
# a quick and dirty evaluation of the model
# NB: this is not correct any more as we've now trained the model on all sentences
wrong   = 0
correct = 0

In [16]:
for sent, author in zip(train_tokens[:50], train.author[:50]):
    infer_vector = model.infer_vector(sent)
    similar_documents = model.docvecs.most_similar([infer_vector], topn = 1)

    # compare label vs prediction
    if author == similar_documents[0][0]:
        correct += 1
    elif author != similar_documents:
        wrong += 1

In [17]:
correct/50

0.82

##  Predicting the most similar author type

In [18]:
predictions = []

In [19]:
for test_sent in test_tokens:
    infer_vector = model.infer_vector(test_sent)
    similar_documents = model.docvecs.most_similar([infer_vector], topn = 3)
    predictions.append(similar_documents)    

## Prepare the submission to Kaggle

In [20]:
predictions[:2]

[[('MWS', 0.23805780708789825),
  ('EAP', 0.07191196829080582),
  ('HPL', -0.044287968426942825)],
 [('EAP', 0.2845698595046997),
  ('MWS', 0.11994245648384094),
  ('HPL', 0.060638200491666794)]]

In [21]:
# sort the predictions by author for the submission
predictions_sorted = []
for prediction in predictions:
    predictions_sorted.append(sorted(prediction))

In [22]:
# getting the predictions out of the tuples 
clean_predictions = []

for prediction in predictions_sorted:
    predictions_only = []
    
    for tupel in prediction:
        predictions_only.append(tupel[1])

    clean_predictions.append(predictions_only)

In [23]:
# predictions sorted by author name
clean_predictions[:10]

[[0.07191196829080582, -0.044287968426942825, 0.23805780708789825],
 [0.2845698595046997, 0.060638200491666794, 0.11994245648384094],
 [0.26888197660446167, 0.3387783467769623, 0.1766926646232605],
 [0.31672346591949463, 0.3342685401439667, -0.07852844893932343],
 [-0.006688646972179413, -0.039688438177108765, 0.004600008949637413],
 [0.3323460519313812, 0.26446446776390076, 0.29063814878463745],
 [-0.0763876661658287, -0.053300946950912476, -0.1335180252790451],
 [-0.2676931619644165, 0.10478182882070541, 0.002795552834868431],
 [0.39022552967071533, 0.2022542953491211, -0.142977774143219],
 [0.21931785345077515, -0.08599946647882462, 0.18940061330795288]]

In [24]:
# convert list of predictions to array
predictions_array = np.array(clean_predictions)

In [25]:
# for Kaggle: replace max of each row with 1, rest with 0
preds_array_ones = (predictions_array == predictions_array.max(axis=1)[:,None]).astype(int)

In [26]:
# construct clean dataframe for Kaggle submission
df_preds = pd.DataFrame(preds_array_ones, columns = ['EAP', 'HPL', 'MWS'], index=test.id)

In [27]:
df_preds.head(5)

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0,0,1
id24541,1,0,0
id00134,0,1,0
id27757,0,1,0
id04081,0,0,1


In [28]:
# to file
df_preds.to_csv('submissions/doc2vec.csv')