In [1]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# (Re)load the autoreload extension and reload imported modules before each execution
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for 'retina' (high-resolution) figures
%config InlineBackend.figure_format = 'retina'

In [2]:
import numpy as np
import pandas as pd
import pickle
import re
import spacy
from sklearn.model_selection import train_test_split
from collections import Counter
import itertools
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# Raise pandas display limits so long text cells and long frames are not cut off
# when they are shown as cell output
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300

## Load and clean the Gutenberg data

In [4]:
# Read the raw works table from CSV, decoding the file as latin1
works = pd.read_csv('data/works.csv', encoding='latin1')
# Display (rows, columns) as the cell output
works.shape

(1661600, 4)

In [5]:
# Keep only the rows whose author is Jane Austen and renumber them from zero
works = works.loc[works['author'] == 'Austen, Jane'].reset_index(drop=True)
works.shape

(167990, 4)

In [6]:
# Replace NAs with paragraph breaks: missing 'text' values become the '[PARA]'
# marker that split_by_paragraph uses as its paragraph delimiter
works['text'] = works['text'].fillna('[PARA]')

In [7]:
# Split by paragraph
def split_by_paragraph(strings):
    """Merge text fragments and re-split them on the '[PARA]' marker.

    Args:
        strings: Iterable of str fragments; they are joined with single spaces.

    Returns:
        list of str: The paragraphs found between '[PARA]' markers, each with
        surrounding whitespace stripped; empty paragraphs are dropped.
    """
    merged = ' '.join(strings)

    paragraphs = []
    for chunk in merged.split('[PARA]'):
        chunk = chunk.strip()
        # Consecutive markers (or markers at the edges) leave empty chunks; skip them
        if chunk:
            paragraphs.append(chunk)
    return paragraphs

# For each (gutenberg_id, title, author) group, join its text rows and re-split them
# into paragraphs; the lambda returns a Series so every paragraph gets its own entry
works = works.groupby(['gutenberg_id', 'title', 'author'])['text'].apply(lambda strings: pd.Series(split_by_paragraph(strings)))
# Wrap the result in a DataFrame and move the index levels back into ordinary columns
works = pd.DataFrame(works).reset_index()

In [8]:
# Check paragraph length: summary statistics of the character count per paragraph
works['text'].str.len().describe()

count    23467.000000
mean       386.229301
std        488.584749
min          1.000000
25%         95.000000
50%        239.000000
75%        508.000000
max      14695.000000
Name: text, dtype: float64

In [9]:
# Remove paragraphs shorter than 50 characters.
# .copy() gives an independent frame so later column assignments do not act on a slice.
works = works[works['text'].str.len() >= 50].copy()
works.shape

(20246, 5)

In [10]:
# Clean the text data
def clean_text(text):
    """Normalize one paragraph of text before tokenization.

    The text is lowercased, every right single quotation mark (’) is replaced
    by a plain ASCII apostrophe, each run of whitespace is collapsed into a
    single space, and leading/trailing whitespace is removed.

    Args:
        text (str): Raw paragraph text.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()
    # Literal one-character substitution, so a plain str.replace is enough
    text = text.replace('’', '\'')
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run clean_text on every paragraph, overwriting the 'text' column
works['text'] = works['text'].apply(clean_text)

## Tokenize

In [11]:
# Load the small English model with the listed pipeline components disabled;
# only the token texts are used in the tokenization step below
spacy_nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'textcat'])

In [12]:
# Tokenize every paragraph in batches and keep only the token strings.
# n_threads is intentionally not passed: spaCy deprecated it in v2.1 and
# Language.pipe no longer accepts it in v3.
txts = works['text'].tolist()
toks = [[tok.text for tok in doc] for doc in spacy_nlp.pipe(txts, batch_size=10000)]
works['toks'] = toks

In [13]:
# Cache the tokenized frame on disk; the next section reloads it from this path
works.to_pickle('data/works.pkl')

## Split into training and validation sets

In [28]:
# Reload the tokenized frame written by works.to_pickle above
works = pd.read_pickle('data/works.pkl')

In [29]:
# Hold out 20% of the tokenized paragraphs for validation; random_state=0 makes
# the split reproducible
toks_trn, toks_val = train_test_split(works['toks'].tolist(), test_size=0.2, random_state=0)
# Display the number of training and validation sequences
len(toks_trn), len(toks_val)

(16196, 4050)

In [30]:
# Sentinel tokens marking the beginning and the end of every sequence
BOS = '_bos_'
EOS = '_eos_'

# Wrap each tokenized paragraph as BOS, <tokens...>, EOS
toks_trn = [[BOS, *seq, EOS] for seq in toks_trn]
toks_val = [[BOS, *seq, EOS] for seq in toks_val]

## Map tokens to indices

In [31]:
# Reserved vocabulary entries: UNK is the fallback for tokens missing from the
# vocabulary, PAD is the fill value used when sequences are padded
UNK = '_unk_'
PAD = '_pad_'

def create_mapper(toks, max_vocab=100000, min_freq=5, UNK=UNK, PAD=PAD, BOS=BOS, EOS=EOS):
    """Build token <-> index lookup tables from a flat collection of tokens.

    Args:
        toks: Iterable of hashable tokens; their frequencies are counted.
        max_vocab: Only the ``max_vocab`` most frequent tokens are considered.
        min_freq: Considered tokens that occur fewer than ``min_freq`` times
            are dropped.
        UNK: Unknown-token symbol, always stored at index 1.
        PAD: Padding symbol, always stored at index 0.
        BOS, EOS: Accepted for interface compatibility but not used here; they
            receive an index only if they occur often enough in ``toks``.

    Returns:
        tuple: ``(stoi, itos)`` where ``itos`` is the list index -> token
        (PAD, UNK, then tokens by descending frequency) and ``stoi`` is the
        inverse dict token -> index.
    """
    toks_freq = Counter(toks)

    # Reserved tokens always occupy the first indices
    specials = [PAD, UNK]

    # Skip reserved tokens that also occur in the data: listing them twice would
    # make stoi point at the later duplicate and leave stoi/itos out of sync
    itos = specials + [
        s for s, c in toks_freq.most_common(max_vocab)
        if c >= min_freq and s not in specials
    ]

    # Create string to index mapper
    stoi = {s: i for i, s in enumerate(itos)}

    return stoi, itos

In [32]:
# Flatten the per-paragraph training sequences into one long list of tokens
toks_trn_all = list(itertools.chain.from_iterable(toks_trn))
len(toks_trn_all)

1575711

In [33]:
# Create mappers using the training data only, so validation tokens that never
# appear in training are not added to the vocabulary
stoi, itos = create_mapper(toks_trn_all)
# Display both sizes
len(stoi), len(itos)

(7902, 7902)

In [34]:
# First ten vocabulary entries: the reserved tokens followed by the most frequent tokens
itos[:10]

['_pad_', '_unk_', ',', '.', 'the', 'to', 'and', 'of', '"', 'a']

In [35]:
# Last ten vocabulary entries (the least frequent tokens that were kept)
itos[-10:]

['banners',
 'outer',
 'complains',
 'pupil',
 'donkey',
 'tragic',
 'authors',
 'birthday',
 'bickerton',
 '1811']

In [36]:
# Convert every token to its vocabulary index, falling back to the UNK index
# for tokens that are not in the vocabulary
ixs_trn = [[stoi.get(tok, stoi[UNK]) for tok in seq] for seq in toks_trn]
ixs_val = [[stoi.get(tok, stoi[UNK]) for tok in seq] for seq in toks_val]

## Truncate and pad sequences

In [37]:
# Check sequence lengths in the training data: quantiles from 0 to 1 in steps of 0.1
pd.Series([len(ixs) for ixs in ixs_trn]).quantile(np.arange(0, 1.1, .1))

0.0       7.0
0.1      23.0
0.2      31.0
0.3      41.0
0.4      53.0
0.5      67.0
0.6      83.0
0.7     107.0
0.8     141.0
0.9     205.0
1.0    3006.0
dtype: float64

In [38]:
# Pick 200 as the fixed sequence length used for truncation and padding below
max_len = 200

In [39]:
# Bring every sequence to exactly max_len indices: shorter ones are filled at the
# end with the PAD index, longer ones are cut at the end (truncating='post'), so
# a truncated sequence loses its tail
ixs_trn = pad_sequences(ixs_trn, max_len, padding='post', truncating='post', value=stoi[PAD])
ixs_val = pad_sequences(ixs_val, max_len, padding='post', truncating='post', value=stoi[PAD])

# Display the shapes of the resulting arrays
ixs_trn.shape, ixs_val.shape

((16196, 200), (4050, 200))

In [40]:
# Sanity check: map the first two validation rows back to tokens through itos
# and join them into readable strings
[' '.join(toks) for toks in np.array(itos)[ixs_val[:2]]]

['_bos_ " perhaps , " said darcy , " i should have judged better , had i sought an introduction ; but i am ill - qualified to recommend myself to strangers . " _eos_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ 

In [41]:
# Save everything needed downstream in a single dictionary
data = {
    # Original data (token sequences including BOS/EOS)
    'toks_trn': toks_trn,
    'toks_val': toks_val,
    
    # Processed data (padded/truncated index arrays)
    'ixs_trn': ixs_trn,
    'ixs_val': ixs_val,
    
    # Meta data (vocabulary mappers, special tokens, sequence length)
    'stoi': stoi,
    'itos': itos,
    'UNK': UNK,
    'PAD': PAD,
    'BOS': BOS,
    'EOS': EOS,
    'max_len': max_len
}

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() passed to pickle.dump is never closed explicitly
with open('data/works_proc.pkl', 'wb') as f:
    pickle.dump(data, f)