In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import numpy as np
import re

# Read the whole corpus file into one string.
with open('clustering_pg18.txt', 'r') as fin:
    lines = fin.read()

# Cut the text at every "FEDERALIST No." heading (the dots are optional).
# Element 0 is whatever precedes the first heading; each later element is
# the text that follows one heading.
paper_heading = re.compile(r'FEDERALIST\.? No\.?')
papers = paper_heading.split(lines)

# Sanity check: number of pieces, then the first 500 characters of piece 3.
print(f'N papers= {len(papers)}')
print(papers[3][:500])

N papers= 87
 3


The Same Subject Continued

(Concerning Dangers From Foreign Force and Influence)

For the Independent Journal.



JAY



To the People of the State of New York:

IT IS not a new observation that the people of any country (if,
like the Americans, intelligent and wellinformed) seldom adopt and
steadily persevere for many years in an erroneous opinion respecting
their interests. That consideration naturally tends to create great
respect for the high opinion which the people of America have so


In [7]:
# Label the authors.
# HAMILTON and JAY share label 0, so only two distinct label values (0 and 2)
# are produced.
# NOTE(review): presumably this merge is intentional (two clusters are fitted
# later) - confirm, and consider contiguous ids (0, 1).
catAuthor = {'HAMILTON':0, 'JAY':0, 'MADISON':2}

# Reverse lookup: label id -> author name(s). Names that share an id are
# joined with '/' so that no author is silently dropped from the lookup.
catAuthorLkup = {}
for author_name, author_id in catAuthor.items():
    if author_id in catAuthorLkup:
        catAuthorLkup[author_id] = f'{catAuthorLkup[author_id]}/{author_name}'
    else:
        catAuthorLkup[author_id] = author_name

# Prepare dataset for tf-idf.
# papers[0] (the text before the first heading) is skipped and 85 pieces are kept.
# NOTE(review): the recorded run reports 87 pieces, so piece 86 is left out
# by this slice - confirm that is intended.
Docs = papers[1:86]

# Each document is labelled with the FIRST author name that appears in its
# text; a document that mentions several names gets only that first one.
author_pattern = re.compile(r'(HAMILTON|JAY|MADISON)')
author_ids = []
for doc_number, doc_text in enumerate(Docs, start=1):
    found = author_pattern.search(doc_text)
    if found is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'No author name found in document {doc_number}')
    author_ids.append(catAuthor[found.group(0)])
Authors = np.array(author_ids)

# Sanity check
print(Authors)

[0 0 0 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default tf-idf vectorizer on the documents and transform them in one
# step; the result has one row per document and one column per feature.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(Docs)

n_docs, n_features = X_tfidf.shape
print(f'N data points= {n_docs}, M features= {n_features}')

N data points= 85, M features= 8693


In [11]:
from sklearn.cluster import KMeans

# Cluster the tf-idf rows into two groups.
# - random_state pins the centroid initialisation so that the cluster ids
#   (and every number derived from them below) are the same on every run.
# - n_init is set explicitly so the number of restarts does not depend on
#   the installed scikit-learn version's default.
# - The sparse matrix is passed as-is: KMeans accepts sparse input, whereas
#   .todense() produces an np.matrix, which recent scikit-learn versions reject.
Clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X_tfidf)

In [12]:
# Print the cluster id that k-means assigned to each document
# (one entry per row of the tf-idf matrix, in document order)
print(Clusters)

[1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


In [13]:
from sklearn.metrics.cluster import contingency_matrix

# Count documents per (cluster id, author label) pair.
# Rows follow the sorted distinct cluster ids; columns follow the sorted
# distinct author labels.
counts = contingency_matrix(Clusters, Authors)

# argmax yields a COLUMN POSITION, not an author label. The labels need not
# be 0..k-1 (e.g. {0, 2}), so translate the position back into the actual
# label; otherwise later comparisons against Authors and lookups by label
# would use the wrong value.
author_labels = np.unique(Authors)
cm = author_labels[np.argmax(counts, axis=1)]

print(f'Contingency: {cm}')

Contingency: [0 0]


In [14]:
# Report, for every cluster id, the author value that `cm` pairs it with
for cluster_id in range(len(cm)):
    author_id = cm[cluster_id]
    print(f'cluster id {cluster_id} matches with Authors {author_id} - {catAuthorLkup[author_id]}')

# Translate each document's cluster id into the author value from `cm`
Predicted = cm[Clusters]
print(Predicted)

cluster id 0 matches with Authors 0 - JAY
cluster id 1 matches with Authors 0 - JAY
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
# Accuracy: a document counts as correct when the author value that `cm`
# associates with its cluster id equals the document's own author label.
hits = cm[Clusters] == Authors
acc = np.count_nonzero(hits)

print(f'Clustering accuracy= {acc/len(Clusters):.3f}')

Clustering accuracy= 0.824


In [16]:
# Rand index (adjusted), computed on the raw cluster ids against the author
# labels - the cluster-to-author mapping is not involved here.
from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(Authors, Clusters)
print(f'Adjusted Rand index= {ari:.3f}')

Adjusted Rand index= -0.060
