In [1]:
# Imports
from pickle import dump, load
import nltk
from nltk import word_tokenize,FreqDist
import re
from nltk.corpus import wordnet as wn

In [2]:
doc_list =load(open("data/Jason_doc_content",'rb'))
len(doc_list)

7

In [3]:
# Start working on one document and associated comments
document = doc_list[0]
document.keys()

dict_keys(['text', 'comment_list'])

In [4]:
# Convert bs4 ResultSet to a list of strings
comments = []
for c in document['comment_list']:
    c = c.replace('\n',' ')
    comments.append(str(c))

In [5]:
len(comments)

420

In [6]:
# Modified from Brandon Rose:
def tokenize_text(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def stem_text(text):
    tokens = tokenize_text(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [7]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(tokenizer=stem_text,
                            stop_words='english',
                            ngram_range=(1,3),
                            min_df=0.2, max_df=0.8,
                            max_features=200000)

In [9]:
tfidf_matrix = tfidf_vec.fit_transform(comments)

### PCA PCA PCA

In [None]:
#implement PCA here

### K Means

In [10]:
from sklearn.cluster import KMeans

num_clusters = 12
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [59]:
from sklearn.metrics.pairwise import paired_distances

cluster_center_list = []
for c in clusters:
    cluster_center_list.append(km.cluster_centers_[c])

center_distances = paired_distances(tfidf_matrix, cluster_center_list)

In [82]:
import pandas as pd
comment_clusters = {'comment': comments, 'cluster': clusters, 'dist': center_distances}
comment_frame = pd.DataFrame(comment_clusters, index = [clusters] , columns = ['comment', 'cluster', 'dist'])

In [83]:
comment_frame['cluster'].value_counts()

2     71
1     66
7     55
6     45
4     41
3     28
10    26
9     25
5     21
0     18
11    14
8     10
Name: cluster, dtype: int64

In [99]:
print(comment_frame[comment_frame.cluster==3].max())
print(comment_frame[comment_frame.cluster==3].min())
print()
print(comment_frame[comment_frame.cluster==3])

comment    tRECLAM.ATIONManaging Water in the WestComment...
cluster                                                    3
dist                                                0.900449
dtype: object
comment    'RECLAMATIONManaging Water in the WestRevised ...
cluster                                                    3
dist                                               0.0864882
dtype: object

                                             comment  cluster      dist
3  Hello:  Please include the attached comments f...        3  0.900449
3  .ATIONManaging Water in the West.9nüèntSheetRe...        3  0.086488
3  RECLAMATIONManaging Water in the West.Jiiiiii1...        3  0.096412
3  -RECLAMATIONManaging Water in the WestRevised ...        3  0.352354
3  'RECLAMATIONManaging Water in the WestRevised ...        3  0.245822
3  C omr.çht S1amiRevised Proposed Rule on the us...        3  0.253019
3  RECLAMATIONManaging Water in the WestComment S...        3  0.086488
3  RECLAMATIONMan aging Wa te

In [84]:
print('Most Central Comments by Cluster\n')
for i in range(num_clusters):
    print('Cluster {}\n'.format(i))
    print(comment_frame[comment_frame.cluster==i].min().comment)
    print()

Most Central Comments by Cluster

Cluster 0

10 September 2008Denver Federal CenterP. 0. Box25007Denver, CO 80225-0007ôeooODear Sirs:Your idea or ruling out being able to transfer a dock permit is very unfair.We have had adock permit ever since there was one required. Our property is on the lake and we willnot be able to sell itforas much if we can't transfer the permit.We are getting old andneed to sell our property soon. Location has always been the important thing in buyingproperty and all of a sudden you are saying that is not allowable. If you are buyingproperty you would certainly buy where the location is the best and expect to pay morefor it.Especially on a lake is this so.Sincerely,#-Y7i)/Arthur and Lynn Norton178 N Adams PlaceNampa, ID83651

Cluster 1

Multiple ˜les are bound together in this PDF Package. Adobe recommends using Adobe Reader or Adobe Acrobat version 8 or later to work with  documents contained within a PDF Package. By updating to the latest version, you™ll enj

In [None]:
# Modified from Brandon Rose and
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def vocabulary_frame(text):
    tokens = tokenize_text(text)
    stems = stem_text(text)
    return pd.DataFrame({'words': tokens}, index = stems).drop_duplicates()

def extended_vocabulary_frame(texts):
    frames = []
    for t in texts:
        vf = vocabulary_frame(t)
        frames.append(vf)
    extended = pd.concat(frames).drop_duplicates()
    return extended

def km_print_top_words(model, num_clusters, vocab_frame, feature_names, n_top_words):    
    print("Top terms per cluster:\n")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] 
    for i in range(num_clusters):
        print("Cluster %d Words:" % i, end=' ')
        print(', '.join(vocab_frame.ix[feature_names[ind].split(' ')].values.tolist()[0][0]
                        for ind in order_centroids[i, :n_top_words]))
        print()

In [None]:
feature_names = tfidf_vec.get_feature_names()
comment_vf = extended_vocabulary_frame(comments)
km_print_top_words(km, num_clusters, comment_vf, feature_names, 6)

### LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation()
lda.fit(tfidf_matrix)

In [None]:
# Modified from:
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def lda_print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [None]:
lda_print_top_words(lda, feature_names, 20)