# Synopsis

Use case: Import source text and save in F3 form.

# Configuration

In [1]:
# SQLite file that receives the token and vocab tables at the end of the notebook.
db_file = 'speeches.db'

# Splitting patterns. Only para_pat is referenced by the cells visible in this
# notebook; sent_pat and token_pat are not used by them.
para_pat = r'\n\n+'
sent_pat = r'([.;?!"“”]+)'
token_pat = r'([\W_]+)'

# Bush speeches: where to download the text, where to cache it locally, the
# line bounds used to slice out the body, and the pattern marking the first
# line of each speech.
bush_url = 'http://www.gutenberg.org/cache/epub/5049/pg5049.txt'
bush_src_file_name = 'pg5049.txt'
bush_body_start = 56
bush_body_end = 4388
bush_chap_pat = r'^\s*(?:State).*$'

# Obama speeches: same settings as above.
obama_url = 'http://www.gutenberg.org/cache/epub/50950/pg50950.txt'
obama_src_file_name = 'pg50950.txt'
obama_body_start = 51
obama_body_end = 5618
obama_chap_pat = r'^\s*(?:Address).*$'

In [2]:
# NLTK's English stopword list does not include these words, but we treat them
# as stopwords too so they can be filtered out later.
# (Repeated entries in the text below are harmless: the result is a set.)
extra_stopwords = set("""
us rest went least would much must long one like much say well without though yet might still upon
done every rather particular made many previous always never thy thou go first oh thee ere ye came
almost could may sometimes seem called among another also however nevertheless even way one two three
ever put
""".split())

In [3]:
# Index level names of the token table, from coarsest to finest.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
# Leading slices of OHCO used as index names for the coarser tables.
CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3))

# Libraries

In [4]:
import re
import os
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/rca2t/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rca2t/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/rca2t/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /Users/rca2t/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rca2t/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Pragmas

In [5]:
%matplotlib inline

# Process

We pause to look at the revised form of our text import function. The parsing function has been replaced with NLTK, which has improved the results of POS tagging. However, this has required some added string manipulation to produce better tokens.

## Download if necessary

In [6]:
def get_file(src_file_name, src_file_url):
    """Download a text file to disk unless a local copy already exists.

    Parameters
    ----------
    src_file_name : str
        Local path of the cached copy.
    src_file_url : str
        URL to fetch when the local copy is missing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    if os.path.exists(src_file_name):
        return
    import requests
    # Fetch and validate BEFORE opening the target file: opening first would
    # leave an empty file behind on failure, and the existence check above
    # would then skip the download on every later run.
    response = requests.get(src_file_url)
    response.raise_for_status()
    with open(src_file_name, 'w', encoding='utf-8') as src_file_on_disk:
        src_file_on_disk.write(response.text)

In [7]:
# Fetch both source texts (each call is a no-op when the file is already on disk).
for file_name, file_url in (
    (bush_src_file_name, bush_url),
    (obama_src_file_name, obama_url),
):
    get_file(file_name, file_url)

In [8]:
!ls -l

total 1352
-rw-r--r--@ 1 rca2t  staff    4816 Mar 30 09:58 WordEmbedding.ipynb
-rw-r--r--@ 1 rca2t  staff  266307 Mar 30 10:08 pg5049.txt
-rw-r--r--@ 1 rca2t  staff  336829 Mar 30 09:59 pg50950.txt
-rw-r--r--@ 1 rca2t  staff   69855 Mar 30 10:25 tolu.ipynb


## Text to lines

In [9]:
# Read each source file and keep only the configured body range of lines.
# Context managers make sure the file handles are closed once the lines are read.
with open(bush_src_file_name, 'r', encoding='utf-8') as bush_file:
    blines = bush_file.readlines()[bush_body_start - 1 : bush_body_end + 1]
with open(obama_src_file_name, 'r', encoding='utf-8') as obama_file:
    olines = obama_file.readlines()[obama_body_start - 1 : obama_body_end + 1]

In [10]:
# One row per source line; the positional index is named line_id.
bush_df = pd.DataFrame({'line_str': blines}).rename_axis('line_id')
obama_df = pd.DataFrame({'line_str': olines}).rename_axis('line_id')

In [11]:
bush_df.head()

Unnamed: 0_level_0,line_str
line_id,Unnamed: 1_level_1
0,State of the Union Address\n
1,George W. Bush\n
2,"February 27, 2001\n"
3,\n
4,"Mr. Speaker, Mr. Vice President, Members of Co..."


In [12]:
obama_df.head()

Unnamed: 0_level_0,line_str
line_id,Unnamed: 1_level_1
0,Address Before a Joint Session of the Congress\n
1,Barack Obama\n
2,"February 24, 2009\n"
3,\n
4,"Madam Speaker, Mr. Vice President, Members of ..."


## Fix some characters to improve tokenization

In [13]:
# Surround em dashes and hyphens with spaces in both line tables.
# NOTE: this rewrites line_str in place, so re-running the cell pads again.
for speech_df in (bush_df, obama_df):
    speech_df['line_str'] = speech_df.line_str\
        .str.replace('—', ' — ')\
        .str.replace('-', ' - ')

## Lines to Chapters

In [14]:
def get_chaps(df, chap_pat):
    """Group lines into chapters and return one joined string per chapter.

    A line whose text matches ``chap_pat`` starts a new chapter; every
    following line belongs to it until the next match.

    Side effect: ``df`` is modified in place and gains two columns,
    ``chap_id`` (index label of the chapter's first line) and ``chap_num``
    (0-based chapter number in order of appearance).

    Parameters
    ----------
    df : pandas.DataFrame
        Line table with a ``line_str`` column.
    chap_pat : str
        Regex applied with ``str.match`` to find chapter heading lines.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``chap_num`` with a single ``chap_str`` column.

    Raises
    ------
    ValueError
        If lines precede the first heading, their chapter id stays missing
        and the integer cast fails.
    """
    is_heading = df.line_str.str.match(chap_pat)
    # Take the index label directly instead of a row-wise apply, then carry
    # each heading's label forward over the lines that follow it.
    df['chap_id'] = df.index.to_series().where(is_heading).ffill().astype('int')
    # factorize numbers the ids in order of first appearance, which is what
    # the previous list.index lookup produced, without the per-row scan.
    df['chap_num'] = pd.factorize(df['chap_id'])[0]
    # Aggregate only the text column; applying over the whole grouped frame
    # is deprecated in recent pandas.
    chaps = df.groupby('chap_num')['line_str']\
        .agg(''.join)\
        .to_frame('chap_str')
    return chaps

In [15]:
bush_chaps = get_chaps(bush_df, bush_chap_pat)
obama_chaps = get_chaps(obama_df, obama_chap_pat)

## Chapters to Paragraphs

In [18]:
def get_paras(chaps, para_pat):
    """Split chapter strings into whitespace-normalised paragraphs.

    Parameters
    ----------
    chaps : pandas.DataFrame
        Chapter table with a ``chap_str`` column.
    para_pat : str
        Pattern passed to ``str.split`` to separate paragraphs.

    Returns
    -------
    pandas.DataFrame
        Indexed by the levels named in the module-level ``PARAS`` list, with
        a single ``para_str`` column. Blank paragraphs are removed.
    """
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .dropna()\
        .to_frame()\
        .rename(columns={0:'para_str'})
    # expand=True pads shorter chapters with missing values; the explicit
    # dropna() above removes that padding regardless of how stack() treats it.
    paras.index.names = PARAS
    # regex=True is required: with a literal replace, r'\n' and r'\s+' would
    # be searched for as backslash sequences and whitespace would be left
    # untouched (the default changed to literal in pandas 2.0).
    paras['para_str'] = paras.para_str.str.strip()\
        .str.replace(r'\n', ' ', regex=True)\
        .str.replace(r'\s+', ' ', regex=True)
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    return paras

In [19]:
bush_paras = get_paras(bush_chaps, para_pat)
obama_paras = get_paras(obama_chaps, para_pat)

In [22]:
bush_paras.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
0,0,State of the Union Address George W. Bush Febr...
0,1,"Mr. Speaker, Mr. Vice President, Members of Co..."
0,2,I thank you for your invitation to speak here ...
0,3,"The last time I visited the Capitol, I came to..."
0,4,America today is a Nation with great challenge...


In [21]:
obama_paras.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
0,0,Address Before a Joint Session of the Congress...
0,1,"Madam Speaker, Mr. Vice President, Members of ..."
0,2,I know that for many Americans watching right ...
0,3,But while our economy may be weakened and our ...
0,4,The weight of this crisis will not determine t...


## Paragraphs to Sentences

In [25]:
def get_sents(paras):
    """Split each paragraph into sentences with NLTK's sentence tokenizer.

    Parameters
    ----------
    paras : pandas.DataFrame
        Paragraph table with a ``para_str`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the levels named in the module-level ``SENTS`` list, with
        a single ``sent_str`` column.
    """
    sents = paras.para_str\
        .apply(lambda para: pd.Series(nltk.sent_tokenize(para)))\
        .stack()\
        .dropna()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    # Paragraphs with fewer sentences are padded with missing values in the
    # wide intermediate frame; the explicit dropna() removes that padding
    # instead of relying on stack() to do it implicitly.
    sents.index.names = SENTS
    return sents

In [26]:
bush_sents = get_sents(bush_paras)
obama_sents = get_sents(obama_paras)

In [27]:
bush_sents.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,State of the Union Address George W. Bush Febr...
0,1,0,"Mr. Speaker, Mr. Vice President, Members of Co..."
0,2,0,I thank you for your invitation to speak here ...
0,2,1,I know Congress had to formally invite me and ...
0,2,2,"So, Mr. Vice President, I appreciate you being..."


In [28]:
obama_sents.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
0,0,0,Address Before a Joint Session of the Congress...
0,1,0,"Madam Speaker, Mr. Vice President, Members of ..."
0,2,0,I know that for many Americans watching right ...
0,2,1,If you haven't been personally affected by thi...
0,2,2,You don't need to hear another list of statist...


## Sentences to Tokens with POS tagging

In [29]:
def get_tokens(sents):
    """Tokenize each sentence and tag every token with its part of speech.

    Parameters
    ----------
    sents : pandas.DataFrame
        Sentence table with a ``sent_str`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the levels named in the module-level ``OHCO`` list, with
        columns ``pos`` (the tag) and ``token_str`` (the token text).
    """
    tokens = sents.sent_str\
        .apply(lambda sent: pd.Series(nltk.pos_tag(nltk.word_tokenize(sent))))\
        .stack()\
        .dropna()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    # Shorter sentences are padded with missing values in the wide
    # intermediate frame; dropna() guarantees only (token, tag) tuples remain
    # before they are unpacked below.
    tokens.index.names = OHCO
    tokens['pos'] = tokens.pos_tuple.apply(lambda pair: pair[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda pair: pair[0])
    # Name the axis explicitly: passing it positionally (drop('pos_tuple', 1))
    # is rejected by pandas 2.0 and later.
    tokens = tokens.drop(columns='pos_tuple')
    return tokens

In [37]:
bush_tokens = get_tokens(bush_sents)
obama_tokens = get_tokens(obama_sents)

In [38]:
bush_tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,NN,State
0,0,0,1,IN,of
0,0,0,2,DT,the
0,0,0,3,NNP,Union
0,0,0,4,NNP,Address


In [39]:
obama_tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,NN,Address
0,0,0,1,IN,Before
0,0,0,2,DT,a
0,0,0,3,JJ,Joint
0,0,0,4,NN,Session


## Tag punctuation and numbers

In [40]:
def identify_punc_and_num(tokens):
    """Add 0/1 flag columns for punctuation-only and digit-bearing tokens.

    ``punc`` is 1 when ``token_str`` consists solely of non-word characters
    and underscores (including the empty string); ``num`` is 1 when it
    contains at least one digit. The frame is modified in place and the same
    object is returned.
    """
    token_text = tokens.token_str.str
    only_punctuation = token_text.match(r'^[\W_]*$')
    contains_digit = token_text.match(r'^.*\d.*$')
    tokens['punc'] = only_punctuation.astype('int')
    tokens['num'] = contains_digit.astype('int')
    return tokens

In [41]:
bush_tokens = identify_punc_and_num(bush_tokens)
obama_tokens = identify_punc_and_num(obama_tokens)

## Combine token tables

In [43]:
bush_tokens['label'] = 'bush'
obama_tokens['label'] = 'obama'

In [49]:
tokens = pd.concat([bush_tokens, obama_tokens]).reset_index().set_index(['label']+OHCO)

In [51]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str,punc,num
label,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bush,0,0,0,0,NN,State,0,0
bush,0,0,0,1,IN,of,0,0
bush,0,0,0,2,DT,the,0,0
bush,0,0,0,3,NNP,Union,0,0
bush,0,0,0,4,NNP,Address,0,0
bush,0,0,0,5,NNP,George,0,0
bush,0,0,0,6,NNP,W.,0,0
bush,0,0,0,7,NNP,Bush,0,0
bush,0,0,0,8,NNP,February,0,0
bush,0,0,0,9,CD,27,0,1


## Extract vocab with minimal normalization

In [52]:
# Tokens that are neither punctuation nor numbers get a normalised term string;
# all other tokens keep a missing term_str.
WORDS = (tokens.punc == 0) & (tokens.num == 0)
# regex=True is required: the pattern is a character class, and a literal
# replace (the default since pandas 2.0) would never match it.
tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()\
    .str.replace(r'["_*.]', '', regex=True)
# Build the vocab columns explicitly from the counts. The column labels that
# value_counts().to_frame().reset_index() produces differ between pandas
# versions, so renaming them by label is not reliable.
term_counts = tokens.loc[tokens.punc == 0, 'term_str'].value_counts()
vocab = pd.DataFrame({'term_str': term_counts.index, 'n': term_counts.values})
vocab = vocab.sort_values('term_str').reset_index(drop=True)
vocab.index.name = 'term_id'

## Get priors for Vocab

In [53]:
vocab['p'] = vocab.n / vocab.n.sum()

## Add stems

In [54]:
# Attach the Porter stem of every vocabulary term.
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab.term_str.apply(stemmer.stem)

## Define stopwords

In [56]:
# NLTK's English stopword list, extended with the extra words configured above.
stopwords = set(nltk.corpus.stopwords.words('english')) | set(extra_stopwords)

## Flag stopwords in Vocab

In [57]:
# 1 when the term is in the stopword set, 0 otherwise. A direct membership
# test replaces the temporary lookup frame that was indexed by a set.
vocab['stop'] = vocab.term_str.isin(stopwords).astype('int')

## Add term_ids to Tokens 

In [58]:
# Look up each token's term_id by its term string; tokens without a vocab
# entry (e.g. those with no term_str) get -1.
term_id_by_term = vocab.reset_index().set_index('term_str')['term_id']
token_term_ids = tokens['term_str'].map(term_id_by_term)
tokens['term_id'] = token_term_ids.fillna(-1).astype('int')

# Save

In [59]:
# Write both tables to SQLite, replacing any existing ones.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection, so close it explicitly.
db = sqlite3.connect(db_file)
try:
    with db:
        tokens.to_sql('token', db, if_exists='replace', index=True)
        vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()

In [None]:
# END