In [25]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm import tqdm_notebook

In [26]:
args = Namespace(
    raw_dataset_txt="data/book/frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="./data/book/frankenstein_with_splits.csv",
    seed=1337
)

In [27]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)


In [28]:
print(len(sentences), 'sentences')
print('Sampels', sentences[100])

3430 sentences
Sampels This letter will reach England by a merchantman now on
its homeward voyage from Archangel; more fortunate than I, who may not
see my native land, perhaps, for many years.


In [29]:
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split())
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [30]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [31]:
MASK_TOKEN = '<MASK>'

In [32]:
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(cleaned_sentences)])

data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token, in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        context.append(token)
    data.append([' '.join(token for token in context), target_token])

cbow_data = pd.DataFrame(data, columns=['context', 'target'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(cleaned_sentences)])


  0%|          | 0/3430 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for window in tqdm_notebook(windows):


  0%|          | 0/90808 [00:00<?, ?it/s]

In [33]:
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [34]:
cbow_data.head()

Unnamed: 0,context,target,split
0,"gutenberg s frankenstein , by",project,train
1,"project s frankenstein , by mary",gutenberg,train
2,"project gutenberg frankenstein , by mary wolls...",s,train
3,"project gutenberg s , by mary wollstonecraft g...",frankenstein,train
4,project gutenberg s frankenstein by mary wolls...,",",train


In [35]:
cbow_data.to_csv(args.output_munged_csv, index='false')