# Synopsis

Corpus importer for Victor Hugo's *Les Misérables*.

# Configuration

In [1]:
# Line bounds of the body of the text within the source file. They are used
# in "Text to lines" to slice the list returned by readlines().
# NOTE(review): presumably 1-based line numbers (the slice starts at
# body_start - 1) — confirm against the source file.
body_start = 684
body_end = 67303
# Regex matching a heading line that starts with VOLUME, BOOK or CHAPTER
# (after optional whitespace); matching lines start a new chapter group.
chap_pat = r'^\s*(?:VOLUME|BOOK|CHAPTER).*$'
# Regex for a paragraph break: two or more consecutive newlines.
para_pat = r'\n\n+'
# sent_pat and token_pat are only referenced by commented-out code in the
# sentence and token cells, which call NLTK tokenizers instead.
sent_pat = r'([.;?!"“”]+)'
token_pat = r'([\W_]+)'
# SQLite database that the "Save" cell writes the token and vocab tables to.
db_file = 'Les_Misérables.db'
# Plain-text source file that is read in "Text to lines".
src_file_name = 'data/Les_Misérables.txt'

In [2]:
# Additional stopwords, written as one whitespace-separated block and split
# into a list. They are combined with NLTK's English stopword list in
# "Define stopwords". A few words (e.g. "much", "one") occur twice; this is
# harmless because the combined list is turned into a set.
extra_stopwords = """
us rest went least would much must long one like much say well without though yet might still upon
done every rather particular made many previous always never thy thou go first oh thee ere ye came
almost could may sometimes seem called among another also however nevertheless even way one two three
ever put
""".strip().split()

In [3]:
# Display the distinct extra stopwords (duplicates collapsed) for inspection.
set(extra_stopwords)

In [4]:
# Index level names for the token table, from the coarsest level to the finest.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
# Prefixes of OHCO: the index level names down to chapter, paragraph and
# sentence depth respectively.
CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3))

# Libraries

In [5]:
import re
import os
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('tagsets')
# nltk.download('wordnet')

# Pragmas

In [6]:
%matplotlib inline

# Process

We pause to look at the revised form of our text import function. The parsing function has been replaced with NLTK, which has improved the results of POS tagging. However, this has required some added string manipulation to produce better tokens.

## Text to lines

In [7]:
# Read the whole source text. The context manager closes the file handle as
# soon as the lines are in memory instead of leaving it open.
with open(src_file_name, 'r', encoding='utf-8') as src_file:
    lines = src_file.readlines()
# Keep only the body of the text, dropping everything outside the configured
# line bounds.
# NOTE(review): if body_start/body_end are both 1-based line numbers, the end
# bound `body_end + 1` keeps one line past body_end — confirm this is intended.
lines = lines[body_start - 1 : body_end + 1]
# One row per source line; the positional row number becomes `line_id`.
df = pd.DataFrame({'line_str': lines})
df.index.name = 'line_id'
del lines

## Fix some characters to improve tokenization

In [8]:
# Pad em dashes, hyphens and apostrophes with spaces so that the tokenizer
# later sees them as separate tokens. These are plain-text replacements, so
# regex=False is passed explicitly: the result then does not depend on the
# pandas version's default for `regex`.
df.line_str = df.line_str\
    .str.replace('—', ' — ', regex=False)\
    .str.replace('-', ' - ', regex=False)\
    .str.replace("'", " ' ", regex=False)

## Lines to Chapters

In [9]:
# Boolean mask of the lines that look like a VOLUME/BOOK/CHAPTER heading.
chap_mask = df.line_str.str.match(chap_pat)
# Give every heading line its own line_id as chapter id; all other lines stay
# NaN for now. The index is used directly (aligned on line_id) rather than
# rebuilt with a row-by-row apply.
df.loc[chap_mask, 'chap_id'] = df.index.to_series()

In [10]:
# Propagate each heading's id down to the lines that follow it. The cast to
# int raises if any line precedes the first heading, because those rows are
# still NaN after the forward fill.
df.chap_id = df.chap_id.ffill().astype('int')
# Distinct chapter ids in order of first appearance.
chap_ids = df.chap_id.unique().tolist()
# Number the chapters 0, 1, 2, ... in order of first appearance. factorize
# does this in a single pass instead of a list search for every row.
df['chap_num'] = pd.factorize(df.chap_id)[0]
# Concatenate the lines of each chapter into one string per chapter. Lines
# still carry their trailing newlines, so paragraph breaks are preserved.
chaps = df.groupby('chap_num').line_str\
    .apply(''.join)\
    .to_frame('chap_str')
del df

## Chapters to Paragraphs

In [11]:
# Split each chapter on paragraph breaks, then stack the resulting columns so
# that there is one row per (chapter, paragraph position).
paras = chaps.chap_str.str.split(para_pat, expand=True)\
    .stack()\
    .to_frame()\
    .rename(columns={0:'para_str'})
paras.index.names = PARAS
# Normalise whitespace inside each paragraph. Both patterns are regular
# expressions, so regex=True is required: where str.replace defaults to
# literal matching, the patterns would otherwise never match.
paras.para_str = paras.para_str.str.strip()
paras.para_str = paras.para_str.str.replace(r'\n', ' ', regex=True)
paras.para_str = paras.para_str.str.replace(r'\s+', ' ', regex=True)
# Drop paragraphs that are empty after stripping.
paras = paras[~paras.para_str.str.match(r'^\s*$')]
del chaps

## Paragraphs to Sentences

In [12]:
# Sentence-split every paragraph with NLTK. Returning a Series per paragraph
# makes apply() build a wide frame with one column per sentence position.
sents = paras.para_str.apply(lambda para: pd.Series(nltk.sent_tokenize(para)))
# Reshape to one row per sentence; the stacked column position becomes the
# innermost index level.
sents = sents.stack().to_frame('sent_str')
sents.index.names = SENTS
del paras

## Sentences to Tokens with POS tagging

In [13]:
# Tokenize and POS-tag every sentence with NLTK. Each sentence yields a Series
# of (token, tag) tuples, which is stacked to one row per token.
tokens = sents.sent_str\
    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
    .stack()\
    .to_frame()\
    .rename(columns={0:'pos_tuple'})
tokens.index.names = OHCO
# Unpack the (token, tag) tuples into separate columns.
tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
# Drop the tuple column by keyword; passing the axis positionally is not
# accepted by newer pandas versions.
tokens = tokens.drop(columns='pos_tuple')
del sents

## Tag punctuation and numbers

In [14]:
# 1 if the token consists solely of non-word characters and underscores
# (this includes the empty string), else 0.
tokens['punc'] = tokens['token_str'].str.match(r'^[\W_]*$').astype('int')
# 1 if the token contains at least one digit, else 0.
tokens['num'] = tokens['token_str'].str.match(r'^.*\d.*$').astype('int')

## Extract vocab with minimal normalization

In [15]:
# Word tokens are those flagged neither as punctuation nor as numbers.
WORDS = (tokens.punc == 0) & (tokens.num == 0)
# Normalise word tokens into terms: lower-case and strip quote, underscore,
# asterisk and period characters. The pattern is a character class, so
# regex=True is required for it to match where str.replace defaults to
# literal matching. Non-word rows keep a missing term_str.
tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()\
    .str.replace(r'["_*.]', '', regex=True)
# Count term occurrences (missing terms are ignored by value_counts). The
# index and value names are set explicitly so the resulting columns are
# always `term_str` and `n`, whatever names value_counts assigns by default.
vocab = tokens[tokens.punc == 0].term_str.value_counts()\
    .rename_axis('term_str')\
    .reset_index(name='n')
# Alphabetical order; the new positional index becomes the term id.
vocab = vocab.sort_values('term_str').reset_index(drop=True)
vocab.index.name = 'term_id'

## Get priors for Vocab

In [16]:
# Relative frequency of each term: its count divided by the total count.
vocab['p'] = vocab['n'].div(vocab['n'].sum())

## Add stems

In [17]:
# Porter stem of every vocabulary term.
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab['term_str'].apply(stemmer.stem)

## Define stopwords

In [18]:
# NLTK's English stopword list combined with the extra stopwords configured
# at the top of the notebook.
stopwords = set(nltk.corpus.stopwords.words('english')).union(extra_stopwords)

In [19]:
# Display the combined stopword set for inspection.
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'almost',
 'also',
 'always',
 'am',
 'among',
 'an',
 'and',
 'another',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'called',
 'came',
 'can',
 'could',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'done',
 'down',
 'during',
 'each',
 'ere',
 'even',
 'ever',
 'every',
 'few',
 'first',
 'for',
 'from',
 'further',
 'go',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'least',
 'like',
 'll',
 'long',
 'm',
 'ma',
 'made',
 'many',
 'may',
 'me',
 'might',
 'mightn',
 "mightn't",
 'more',
 'm

In [20]:
# Flag vocabulary terms that are stopwords: 1 if the term is in the stopword
# set, else 0. A direct membership test replaces the temporary lookup frame,
# which passed a set as DataFrame index — something recent pandas versions
# refuse.
vocab['stop'] = vocab.term_str.isin(stopwords).astype('int')

## Add term_ids to Tokens 

In [21]:
# Look up each token's term in the vocabulary (term string -> term id).
# Tokens without a term (missing term_str) get the sentinel id -1.
tokens['term_id'] = tokens['term_str']\
    .map(pd.Series(vocab.index, index=vocab['term_str']))\
    .fillna(-1)\
    .astype('int')

# Save

In [22]:
# Write both tables to the SQLite database, replacing any existing tables of
# the same name. `with db:` only wraps the writes in a transaction (commit on
# success, rollback on error); it does not close the connection, so that is
# done explicitly in the finally clause.
db = sqlite3.connect(db_file)
try:
    with db:
        tokens.to_sql('token', db, if_exists='replace', index=True)
        vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()

In [23]:
# END