# Synopsis

We attempt a Naive Bayes classifier to predict authorship of the Federalist papers.

#  Configuration

In [205]:
base_path = '/Users/rca2t/COURSES/DSI/DS5559/UVA_DSI_REPO'
local_lib = base_path + '/lib'
src_dir = base_path + '/labs/2019-02-28_Lab07/vierthaler-stylometry/fedpapers'

# Libraries

In [206]:
import glob
import re

import pandas as pd
from numpy.random import randint
import sys; sys.path.append(local_lib)
import textman.textman as tx

# Process

## Import raw review data

In [207]:
files = glob.glob(src_dir+'/*.txt')
corpus = pd.DataFrame([f.replace('.txt','').split('/')[-1].split('_') for f in files], 
                    columns=['chap_id','author'])
corpus['doc_content'] = [open(f, 'r', encoding='utf-8').read() for f in files]
corpus.chap_id = corpus.chap_id.astype('int')
corpus = corpus.set_index('chap_id')

In [208]:
corpus.head()

Unnamed: 0_level_0,author,doc_content
chap_id,Unnamed: 1_level_1,Unnamed: 2_level_1
55,Unknown,\n\n\n\nThe Total Number of the House of Repre...
49,Unknown,\n\n\n\nMethod of Guarding Against the Encroac...
10,Madison,\n\n\n\nThe Same Subject Continued\n\n(The Uni...
78,Hamilton,\n\n\n\nThe Judiciary Department\n\nFrom McLEA...
80,Hamilton,\n\nThe Powers of the Judiciary\n\nFrom McLEAN...


In [209]:
tokens, vocab = tx.create_tokens_and_vocab(corpus, src_col='doc_content')
tokens = tokens.join(corpus.author)

In [244]:
vocab.head()

Unnamed: 0_level_0,term,n,f,stem,sw,go
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,2,1e-05,0,True,False
1,1,65,0.000339,1,True,False
2,10,3,1.6e-05,10,True,False
3,11,6,3.1e-05,11,True,False
4,12,2,1e-05,12,True,False


In [210]:
tokens.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,term_str,term_id,author
chap_id,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,30,22,evince,evince,3094,Hamilton
18,90,7,seemed,seemed,7111,Madison
59,2,4,People,people,5752,Hamilton
20,68,24,contingencies,contingencies,1803,Madison
29,32,5,day,day,2038,Hamilton
78,120,8,courts,courts,1959,Hamilton
63,46,9,distinguish,distinguish,2625,Unknown
36,72,12,regulation,regulation,6628,Hamilton
43,19,35,needful,needful,5305,Madison
4,65,32,never,never,5330,Jay


In [303]:
LM = tokens.groupby('author').term_id.value_counts().to_frame().unstack().T.fillna(0)
LM.index = LM.index.droplevel(0)
LMP = (LM + 1) / (LM.sum() + len(vocab)) # Likelihoods
LML = np.log10(LMP[LMP > 0]).fillna(0)

In [327]:
LML.head()

author,Hamilton,Jay,Madison,Unknown
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
61,-4.158694,-4.09618,-3.97322,-4.285737
62,-4.158694,-4.09618,-4.450342,-4.285737
67,-4.459724,-4.09618,-3.848282,-4.285737
70,-3.915656,-3.619059,-4.450342,-4.285737
71,-3.915656,-4.09618,-4.450342,-3.984707


In [328]:
# Priors
# PRIORS = (LM.sum() / LM.sum().sum()).to_frame().rename(columns={0:'p'})
PRIORS = pd.DataFrame(corpus.author.value_counts() / len(corpus))
PRIORS.columns = ['p']
PRIORS['logp'] = np.log10(PRIORS.p)

In [329]:
PRIORS

Unnamed: 0,p,logp
Hamilton,0.6,-0.221849
Madison,0.2,-0.69897
Unknown,0.141176,-0.850238
Jay,0.058824,-1.230449


In [330]:
D = tokens.groupby(['chap_id']).term_id.value_counts().to_frame()\
    .rename(columns={'term_id':'n'}).unstack().fillna(0)
D.columns = D.columns.droplevel(0)
D = D.sort_index()

In [331]:
DBOOL = D.astype('bool').astype('int')

In [332]:
D.head()

term_id,61,62,67,70,71,73,75,76,79,87,...,8678,8680,8682,8685,8686,8687,8690,8692,8696,8698
chap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0


In [341]:
results = []
AUTH = 'Unknown'
# AUTH = 'Hamilton'
# AUTH = 'Madison'
# AUTH = 'Jay'

for doc_id in corpus[corpus.author == AUTH].index.values:
    for author in ['Jay', 'Madison', 'Hamilton']:
        x = (DBOOL.loc[doc_id] * LML[author]).sum() + PRIORS.loc[author, 'logp']
        results.append((doc_id, author, x))
        
df = pd.DataFrame(results, columns=['doc_id','author','prop'])
df = df.set_index(['doc_id','author'])
df = df.unstack()
df.columns = df.columns.droplevel(0)
df['prediction'] = df.idxmax(1)

In [342]:
df

author,Hamilton,Jay,Madison,prediction
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,-1611.63077,-1710.69113,-1623.947499,Hamilton
50,-1159.96842,-1222.609232,-1167.164884,Hamilton
51,-1668.618724,-1788.330292,-1677.316433,Hamilton
52,-1620.979048,-1715.152657,-1631.63652,Hamilton
53,-1772.258927,-1878.914312,-1791.658329,Hamilton
54,-1517.565803,-1623.335222,-1532.636542,Hamilton
55,-1812.069377,-1922.381768,-1833.451988,Hamilton
56,-1290.497025,-1372.57128,-1298.048667,Hamilton
57,-1944.789654,-2051.911864,-1967.640285,Hamilton
58,-1926.850691,-2026.405678,-1938.958855,Hamilton
