In [171]:
import pandas as pd
import numpy as np

In [119]:
# --- Notebook configuration -------------------------------------------------
# Column names in the input CSV.
TEXT_COLUMN = "text"      # raw text sample
Y_COLUMN = "author"       # label to predict

# Path of the labelled training CSV (relative to the notebook's working dir).
TRAIN_DATA_FILE = 'train_data_spooky_author.csv'

# Number of leading rows kept per author when building the training subset.
SMALLER_SAMPLE_SIZE = 5_000

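Read in the DataFrame, then keep only the first `SMALLER_SAMPLE_SIZE` rows of each author as the training set (e.g. 500 per author reduces the dataset to 1,500 rows) and the following 1,000 rows of each author as the validation set.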

In [129]:
# Load the full labelled corpus and group its rows by author label.
all_texts = pd.read_csv(TRAIN_DATA_FILE)
gb = all_texts.groupby(Y_COLUMN)

# Number of rows per author held out for validation; they are taken
# immediately after that author's training rows.
VALIDATION_SAMPLE_SIZE = 1000

# Collect the per-author slices first and concatenate once at the end.
# Repeatedly concatenating onto an (initially empty) DataFrame re-copies all
# previously accumulated rows on every iteration.
train_parts, validation_parts = [], []
for author, df in gb:
    train_parts.append(df.head(SMALLER_SAMPLE_SIZE))
    validation_parts.append(
        df.iloc[SMALLER_SAMPLE_SIZE:SMALLER_SAMPLE_SIZE + VALIDATION_SAMPLE_SIZE]
    )

# The groups arrive author-by-author; sorting on the index interleaves the
# authors again in the row order of the CSV. sort_index() returns a new frame,
# so no in-place mutation is needed.
train_df = pd.concat(train_parts).sort_index()
validation_df = pd.concat(validation_parts).sort_index()

# Guard against leakage: no row may appear in both subsets.
assert train_df.index.intersection(validation_df.index).empty

id        True
text      True
author    True
dtype: bool

Use spaCy to parse each text sample in the training DataFrame into a spaCy `Doc` object, and store the parsed documents in a new `doc` column of that (smaller) sample DataFrame.

In [131]:
import spacy

# Load the English pipeline without its dependency parser and NER components;
# only tokens and part-of-speech tags are read from the parsed documents.
# The model is referenced by package name: the 'en' shortcut link used
# previously is not available in spaCy v3+, where loading it fails.
# NOTE(review): assumes 'en' resolved to the default small English model
# (en_core_web_sm) in the original environment -- confirm.
NLP = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLP.pipe streams the texts through the pipeline; materialise the parsed
# documents so they can be stored as a DataFrame column.
doc = list(NLP.pipe(train_df[TEXT_COLUMN].values))
train_df['doc'] = doc

Using the `doc` column that we just added to the DataFrame, get the part-of-speech tag of every token in each text sample.
Add the tags to the DataFrame as a single space-separated string per sample, so that they can be fed to a text vectorizer.

In [132]:
def _pos_sequence(parsed):
    """Return the coarse POS tags of every token in `parsed`, space-separated."""
    tags = [token.pos_ for token in parsed]
    return " ".join(tags)


# One tag string per row, aligned with train_df's index.
poslist = train_df['doc'].apply(_pos_sequence)
train_df['parts_of_speech'] = poslist
train_df.head()

Unnamed: 0,id,text,author,doc,parts_of_speech
0,id26305,"This process, however, afforded me no means of...",EAP,"(This, process, ,, however, ,, afforded, me, n...",DET NOUN PUNCT ADV PUNCT VERB PRON DET NOUN AD...
1,id17569,It never once occurred to me that the fumbling...,HPL,"(It, never, once, occurred, to, me, that, the,...",PRON ADV ADV VERB ADP PRON ADP DET NOUN VERB V...
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"(In, his, left, hand, was, a, gold, snuff, box...",ADP ADJ ADJ NOUN VERB DET ADJ NOUN NOUN PUNCT ...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"(How, lovely, is, spring, As, we, looked, from...",ADV ADJ VERB NOUN ADP PRON VERB ADP PROPN PROP...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"(Finding, nothing, else, ,, not, even, gold, ,...",VERB NOUN ADV PUNCT ADV ADV NOUN PUNCT DET PRO...


In [191]:
from sklearn.feature_extraction.text import CountVectorizer

# Use one vectorizer per feature space. Fitting a single CountVectorizer twice
# overwrites its vocabulary, so the POS n-gram vocabulary would be lost and the
# POS features could not be reproduced for the validation data.

# Unigram + bigram counts over the part-of-speech tag strings.
pos_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vector = pos_vectorizer.fit_transform(train_df['parts_of_speech']).toarray()

# Unigram + bigram counts over the raw text.
# NOTE(review): .toarray() converts the sparse count matrix to a dense array,
# which can get very large for a word-level n-gram vocabulary; the sparse
# matrix could be kept if every downstream consumer accepts it -- confirm.
word_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_word_vector = word_vectorizer.fit_transform(train_df[TEXT_COLUMN]).toarray()

# Backward-compatible alias: previously `vectorizer` ended up fitted on the
# raw text, so it keeps pointing at the word-level vectorizer.
vectorizer = word_vectorizer


In [135]:
# Display the shape of the POS n-gram count matrix:
# (number of text samples, number of distinct POS unigrams + bigrams).
bigram_vector.shape

(1500, 175)

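Create my evaluation routine: it runs a 5-fold stratified split over the training data, fits the pipeline on each training fold, validates it on the held-out fold, and reports the accuracy and log loss per fold along with their means.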

In [192]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
# Feature matrix (word uni-/bigram counts) and target labels for the
# training subset; rows of X are positionally aligned with y.
X = bigram_word_vector
y = train_df[Y_COLUMN]

# 5-fold stratified split. random_state is intentionally not passed: without
# shuffle=True it has no effect (the folds are already deterministic), and
# scikit-learn >= 0.24 raises ValueError for that combination.
rskf = StratifiedKFold(n_splits=5)
losses, accuracy = [], []
nlp_pipeline = LogisticRegression()

for train_index, test_index in rskf.split(X, y):
    # split() yields positional indices, hence plain indexing on the array
    # and .iloc on the Series.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    assert not y_train.isnull().any()
    # fit() retrains the estimator from scratch on every fold.
    nlp_pipeline.fit(X_train, y_train)
    predictions = nlp_pipeline.predict(X_test)

    accuracy.append(metrics.accuracy_score(y_test, predictions))
    # Pass the model's class order explicitly so the probability columns are
    # matched to the right labels even if a fold were missing a class.
    losses.append(
        metrics.log_loss(
            y_test,
            nlp_pipeline.predict_proba(X_test),
            labels=nlp_pipeline.classes_,
        )
    )

# Per-fold scores are shown sorted ascending, followed by their mean.
# np.mean replaces pd.np.mean: the pd.np alias was removed in pandas 2.0.
print(f'kfolds log losses: {str([str(round(x, 3)) for x in sorted(losses)])}')
print(f'mean log loss: {round(np.mean(losses), 3)}')
print(f'kfolds accuracy: {str([str(round(x, 3)) for x in sorted(accuracy)])}')
print(f'mean accuracy: {round(np.mean(accuracy), 3)}')

kfolds log losses: ['0.766', '0.776', '0.809', '0.858', '0.916']
mean log loss: 0.825
kfolds accuracy: ['0.587', '0.637', '0.637', '0.653', '0.69']
mean accuracy: 0.641
