## Federalist Papers Authorship Attribution
Code by [Jon Luca](https://github.com/jonluca/Federalist-Papers-NLP), modified and annotated by Sarah Chen based off of JonLuca's 2018 [blog post](https://blog.jonlu.ca/posts/the-federalist-papers-author-identification-through-k-means-clustering).

For 1/27 meeting on digital humanities (accompanying slides [here](https://docs.google.com/presentation/d/1VShMCNWyJQcMZM_6OtLe9dVeZt0WZezXYvcbgY103z0/edit?usp=sharing))

In [1]:
import numpy as np
import nltk
nltk.download('punkt')
import glob
import os
import ntpath
import re
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

[nltk_data] Downloading package punkt to /Users/sarahchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Retrieve all papers from the corresponding subfolders and contenate text (accessible via Gutenberg)

In [2]:
papers = r"./papers/"
hamilton = sorted(glob.glob(os.path.join(papers, "hamilton/*")))
madison = sorted(glob.glob(os.path.join(papers, "madison/*")))
disputed = sorted(glob.glob(os.path.join(papers, "disputed/*")))

hamilton_papers = []
for fn in hamilton:
    with open(fn) as f:
        hamilton_papers.append(f.read().replace('\n', ' ').replace('\r',''))
hamilton_papers_all = ' '.join(hamilton_papers)

madison_papers = []
for fn in madison:
    with open(fn) as f:
        madison_papers.append(f.read().replace('\n', ' ').replace('\r',''))
madison_papers_all = ' '.join(madison_papers)

disputed_papers = []
disputed_papers_file_names = []
for fn in disputed:
    with open(fn) as f:
        disputed_papers.append(f.read().replace('\n', ' ').replace('\r',''))
        disputed_papers_file_names.append(ntpath.basename(fn))
disputed_papers_all = ' '.join(disputed_papers)

known_papers_all = hamilton_papers_all + " " + madison_papers_all # string of all papers (text concatenated)
known_papers = hamilton_papers + madison_papers # list of all papers

Generate some lexical features for each paper:
* Average number of words per sentence
* Lexical diversity score: the number of unique words / total number of words
* ; per sentence
* " per sentence
* , per sentence

In [3]:
def LexicalFeatures(papers, all_papers):
    """
    Compute feature vectors for word and punctuation features
    """
    num_papers = len(papers)
    # for every paper, calculate 2 lexical features and 3 punctuation features
    fvs_lexical = np.zeros((len(papers), 2), np.float64)
    fvs_punct = np.zeros((len(papers), 3), np.float64)
    for e, single_paper_text in enumerate(papers):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(single_paper_text.lower())
        words = word_tokenizer.tokenize(single_paper_text.lower())
        sentences = sentence_tokenizer.tokenize(single_paper_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s)) for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # Lexical diversity
        fvs_lexical[e, 1] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(';') / float(len(sentences))
        fvs_punct[e, 1] = tokens.count('"') / float(len(sentences))
        fvs_punct[e, 2] = tokens.count(',') / float(len(sentences))
        

    # apply whitening to decorrelate the features (a form of normalization, scaling each feature by std)
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)

    return fvs_lexical, fvs_punct

Look at the frequencies of parts of speech (POS) within each paper

In [14]:
def SyntacticFeatures(papers, all_papers):

    def token_to_pos(paper):
        """convert a paper to the POS tags for each word"""
        tokens = nltk.word_tokenize(paper)
        return [p[1] for p in nltk.pos_tag(tokens)]

    paper_pos = [token_to_pos(paper) for paper in papers]
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    # counts of how many times each POS appears in a given paper
    fvs_syntax = np.array([[paper.count(pos) for pos in pos_list] for paper in paper_pos]).astype(np.float64)

    # divide the POS counts for row by number of tokens in the paper to yield POS frequencies
    fvs_syntax /= np.c_[np.array([len(paper) for paper in paper_pos])]

    return fvs_syntax


Cluster papers using the generated features with K-means clustering. Very quick [explanation of K-means](https://en.wikitolearn.org/Course:Machine_Learning_for_Humans/Unsupervised_Learning/K-Means_Clustering)

In [5]:
def PredictAuthors(fvs):
    km = KMeans(n_clusters=2, init='k-means++', n_init=100, max_iter=300, verbose=0)
    km.fit(fvs)
    return km


Put it all together. Call the feature-generation functions we have defined on both the papers with known and disputed authorship.

In [16]:
known_set = list(LexicalFeatures(known_papers, known_papers_all))
known_set.append(SyntacticFeatures(known_papers, known_papers_all))

disputed_set = list(LexicalFeatures(disputed_papers, disputed_papers_all))
disputed_set.append(SyntacticFeatures(disputed_papers, disputed_papers_all))



Fit 3 different K-means models to the data for the lexical (vocab), lexical based on punctionation, and syntactical (based on part of speech frequencies) feature sets we generated.

In [23]:
classifications = [PredictAuthors(fvs) for fvs in known_set] 

Examine the 3 sets of results

In [24]:
results = list()
results.append([classifications[0].predict(disputed_set[0]),"Lexical Features"]) # Predict results of Lexical Features
results.append([classifications[1].predict(disputed_set[1]),"Lexical Features - Punctuation"]) # Predict results of Lexical Features, Punctuation
results.append([classifications[2].predict(disputed_set[2]),"Syntactic Features"]) # Predict results of their syntactic feature

In [9]:
all_results = []
for i in range(len(classifications)): # for each data set
    # Extract the label of the first paper
    # We know for a fact Hamilton wrote the first paper, so we can label the cluster containing paper #1 as his
    hamilton = classifications[i].labels_[0] 
    
    # go through all papers
    individual_classifier_results = []
    for j in range(len(results[i][0])):
        if results[i][0][j] == hamilton: 
            individual_classifier_results.append("Hamilton")
        else:
            individual_classifier_results.append("Madison")
    print(individual_classifier_results)
    all_results.append(individual_classifier_results)

['Hamilton', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Madison']
['Hamilton', 'Hamilton', 'Hamilton', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Hamilton', 'Hamilton', 'Madison', 'Madison']
['Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Hamilton', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison', 'Madison']


In [33]:
authorship_assignments = pd.DataFrame(np.array(all_results).T, columns=['Lexical Features','Lexical Features - Punctuation','Syntactic Features'])
authorship_assignments.index = ['paper '+str(name.split('.')[0]) for name in disputed_papers_file_names]
authorship_assignments

Unnamed: 0,Lexical Features,Lexical Features - Punctuation,Syntactic Features
paper 49,Hamilton,Hamilton,Madison
paper 50,Hamilton,Hamilton,Madison
paper 51,Madison,Hamilton,Madison
paper 52,Madison,Hamilton,Madison
paper 53,Madison,Madison,Madison
paper 54,Madison,Madison,Hamilton
paper 55,Madison,Madison,Madison
paper 56,Madison,Hamilton,Madison
paper 57,Madison,Hamilton,Madison
paper 58,Madison,Hamilton,Madison
