In [1]:
import string
import requests
import re

In [2]:
# Download the Project Gutenberg plain-text file (ebook 1342).
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# makes an HTTP error fail here rather than letting an error page be parsed
# as if it were the book.
r = requests.get('https://www.gutenberg.org/files/1342/1342-0.txt', timeout=30)
r.raise_for_status()

# Regex for the "Chapter 1" heading; the trailing [^0-9] rejects "Chapter 10" etc.
start = r'(Chapter 1)[^0-9]'
# Regex for the Gutenberg end-of-book marker. A raw string keeps the `\*`
# escapes for the regex engine without relying on Python's handling of
# unrecognised escape sequences in ordinary string literals.
end = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK'

Rules for processing
- [ ] Replace typographic (curly) quotes and apostrophes with their plain ASCII equivalents, e.g. `’` → `'`
- [ ] Find foreign words and hardcode replacement - can do this by looking for special characters
- [ ] Add '—' i.e. long dash for punctuation
- [ ] Go through hyphens between words - looks like they are all hyphenated words
- [ ] Replace `-` with `<HYPHEN>`, `'` with `<APOSTROPHE>`, then surround the remaining punctuation with spaces, then replace the tags with the punctuation they stand for
- [ ] This should also be done for `Mr.`, `Mrs.`

In [17]:
def preprocess(txt, chars, replace_words, new_punc):
    """Separate punctuation from words so the text can be split on whitespace.

    Processing steps, in order:

    1. Leading/trailing whitespace is stripped from every line.
    2. Each key of ``replace_words`` is replaced by its value (plain
       ``str.replace``, applied in dict order).
    3. An apostrophe or hyphen with a letter directly on both sides is
       treated as part of a word: ``didn't`` becomes ``didn 't`` and
       ``half-an-hour`` becomes ``half - an - hour``.
    4. The tag ``<DOT>`` (which ``replace_words`` may introduce, e.g.
       ``'Mr.' -> 'Mr<DOT>'``) becomes ``'. '`` so the dot stays attached to
       the abbreviation.
    5. Every other punctuation character (``string.punctuation`` plus
       ``new_punc``) is surrounded by spaces.
    6. Runs of whitespace inside a line are collapsed to a single space;
       newlines, including blank lines, are preserved.

    Parameters
    ----------
    txt : str
        Text to process.
    chars : str
        Extra characters that count as letters in step 3, in addition to
        ``a-zA-Z``. The string is inserted verbatim into a regex character
        class, so it must be valid there.
    replace_words : dict[str, str]
        Literal substitutions applied in step 2.
    new_punc : str
        Extra punctuation characters to pad in step 5 (e.g. an em dash).

    Returns
    -------
    str
        The processed text.
    """
    # Tags temporarily stand in for intra-word punctuation so that step 5
    # does not split it like ordinary punctuation.
    punc_dict = {"'": '<APOSTROPHE>', '-': '<HYPHEN>'}
    inv_punc_dict = {'<APOSTROPHE>': " '", '<HYPHEN>': ' - ', '<DOT>': '. '}

    txt = '\n'.join(line.strip() for line in txt.split('\n'))

    for word, new_word in replace_words.items():
        txt = txt.replace(word, new_word)

    # Lookarounds test the neighbouring letters without consuming them, so a
    # single pass handles multi-part words such as `half-an-hour`. The class
    # is built from the escaped dict keys only: joining them with '|' would
    # put a literal pipe into the class, and a pipe has no entry in punc_dict.
    letter = '[a-zA-Z{}]'.format(chars)
    joiners = ''.join(re.escape(p) for p in punc_dict)
    intra_word = re.compile('(?<={0})[{1}](?={0})'.format(letter, joiners))
    txt = intra_word.sub(lambda m: punc_dict[m.group(0)], txt)

    # One pass over the text: a tag is tried first at every position, so the
    # `<` and `>` that delimit a tag are never padded as punctuation, while a
    # `<` or `>` that is not part of a tag is padded like any other character.
    tag_alternatives = '|'.join(re.escape(tag) for tag in inv_punc_dict)
    punc_class = '[{}]'.format(
        ''.join(re.escape(c) for c in string.punctuation + new_punc)
    )
    token = re.compile('({})|({})'.format(tag_alternatives, punc_class))

    def expand(m):
        """Expand a tag to its final text, or pad a punctuation character."""
        tag, punc = m.groups()
        if tag is not None:
            return inv_punc_dict[tag]
        return ' {} '.format(punc)

    txt = token.sub(expand, txt)

    # Padding produces doubled spaces (e.g. "a ,  b"); split() without an
    # argument collapses them, which split(' ') would not.
    return '\n'.join(' '.join(line.split()) for line in txt.split('\n'))

In [18]:
# Decode the raw response bytes as UTF-8 directly. Re-encoding r.text with
# r.encoding only recovers the original bytes when that codec round-trips
# losslessly, and it raises TypeError when r.encoding is None.
text = r.content.decode('utf-8')

In [19]:
# Slice the book body out of the downloaded file: from the LAST match of the
# `start` pattern up to the first match of the `end` marker.
# NOTE(review): the last match is presumably used because "Chapter 1" also
# appears earlier in the file (e.g. in a contents list) - confirm.
chapter_matches = list(re.finditer(start, text))
end_match = re.search(end, text)
if not chapter_matches:
    raise ValueError('start pattern {!r} not found in the downloaded text'.format(start))
if end_match is None:
    raise ValueError('end pattern {!r} not found in the downloaded text'.format(end))

start_idx = chapter_matches[-1].start()
end_idx = end_match.start()
book = text[start_idx:end_idx].strip()

In [20]:
# Literal substitutions handed to preprocess() as `replace_words`.
# First the typographic quotes, mapped to their ASCII counterparts ...
replace_dict = dict([
    ('’', "'"),
    ('‘', "'"),
    ('”', '"'),
    ('“', '"'),
])
# ... then abbreviations, whose trailing dot is swapped for a <DOT> tag.
replace_dict.update([
    ('Mr.', 'Mr<DOT>'),
    ('Mrs.', 'Mrs<DOT>'),
    ('Esq.', 'Esq<DOT>'),
    ('etc.', 'etc<DOT>'),
])

In [21]:
# Run the preprocessing on the extracted book text: the accented letters count
# as word characters and the em dash is treated as extra punctuation.
book2 = preprocess(
    book,
    chars='êéà',
    replace_words=replace_dict,
    new_punc='—',
)

In [22]:
# Number of whitespace-separated tokens in the processed text.
len(book2.split())

144959

In [23]:
# Number of distinct whitespace-separated tokens in the processed text.
len(set(book2.split()))

6701

In [26]:
# Write the processed text to disk. The encoding is explicit because the text
# contains non-ASCII characters (accented letters, em dashes) and the platform
# default encoding is not guaranteed to be able to represent them.
with open('train_pride_and_prejudice.txt', 'w', encoding='utf-8') as f:
    f.write(book2)
    
# with open('train_pride_and_prejudice_no_empty_lines.txt', 'w') as f:
#     lines = map(str.strip, book2.split('\n'))
#     new_lines = []
#     n_empty = 0
#     for line in lines:
#         if n_empty > 1:
#             continue
#         if len(line) == 0:
#             n_empty +=1
#         else:
#             n_empty = 0
#         new_lines.append(line)
        

## Archive

In [562]:
# Archived prototype of the tagging step: apostrophes, hyphens and dots that
# sit between two words are replaced by tags.
# NOTE(review): `a` (the input string) and `z` (presumably the expected result)
# are not defined in any visible cell; this cell depends on earlier kernel state.
punc_dict = {"'": '<APOSTROPHE>', '-': '<HYPHEN>', '.':'<DOT>'}
inv_punc_dict = {'<APOSTROPHE>': " '", '<HYPHEN>':" ", '<DOT>': '. '}


def replace_punc(x):
    """Return the matched text with its middle character replaced by its tag."""
    before, punc, after = x.groups()
    return r'{}{}{}'.format(before, punc_dict[punc], after)


# The character class holds the escaped dict keys and nothing else: a '|'
# inside [...] is a literal pipe, which has no entry in punc_dict and would
# make replace_punc raise KeyError.
tag_pattern = r"({0})([{1}])({0})".format(
    '[a-zA-Zêéà]+', ''.join(map(re.escape, punc_dict))
)

bb = a
# Repeat until nothing changes: one pass cannot tag both separators of a
# three-part word because the matches would overlap.
while True:
    bb2 = re.sub(tag_pattern, replace_punc, bb)
    if bb2 == bb:
        break
    bb = bb2
bb == z
bb

'Mrs<DOT>Elton/ Mr<DOT>Weston didn<APOSTROPHE>t, it<APOSTROPHE>s, tête<HYPHEN>à<HYPHEN>tête!'

In [563]:
# Regex alternation matching a single punctuation character, except the `<`
# and `>` that delimit one of the tags <APOSTROPHE>, <HYPHEN> or <DOT>.
# Everything but the angle brackets is a plain escaped alternative.
s = '|'.join(re.escape(ch) for ch in string.punctuation if ch not in '<>')
# `<` counts as punctuation unless it opens a tag (negative lookahead) ...
s += r'|<(?!(?:APOSTROPHE|HYPHEN|DOT)>)'
# ... and `>` counts unless it closes one (one fixed-width negative lookbehind
# per tag, since a lookbehind cannot contain alternatives of different lengths).
s += '|' + ''.join(
    r'(?<!<{})'.format(tag) for tag in ('APOSTROPHE', 'HYPHEN', 'DOT')
) + '>'

In [564]:
# Pad every punctuation character matched by `s` with spaces, then expand the
# tags back into their final text via `inv_punc_dict`.
# The tag pattern is written without backslashes: `<` and `>` need no escaping
# in a regex, and `\<` / `\>` are invalid escapes in a non-raw string literal.
x = re.sub(
    '({})'.format('|'.join(map('<{}>'.format, ('APOSTROPHE', 'HYPHEN', 'DOT')))),
    lambda m: inv_punc_dict[m.group(1)],
    re.sub(f'({s})', r' \1 ', bb)
)

# Collapse the whitespace runs left behind by the padding.
' '.join(x.split())

#.replace(r'<APOSTROPHE>', ' \'').replace(r'<HYPHEN>', ' ')

"Mrs. Elton / Mr. Weston didn 't , it 's , tête à tête !"

In [557]:
# Display the punctuation characters defined by the standard-library `string` module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def process_text(txt):
    """Return ``txt`` with every ``string.punctuation`` character replaced by a space.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        A string of the same length as ``txt`` in which each punctuation
        character has been replaced by a single space.
    """
    # In Python 3 the translation-table factory is str.maketrans; the `string`
    # module no longer provides a `maketrans` function.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return txt.translate(translator)