In [2]:
# Imports
from pickle import dump, load
import nltk
from nltk import word_tokenize,FreqDist
import re
from nltk.corpus import wordnet as wn

In [3]:
doc_list =load(open("data/Proxima_doc_content",'rb'))
len(doc_list)

6

In [4]:
# Start working on one document and associated comments
document = doc_list[0]
document.keys()

dict_keys(['comment_list', 'text'])

In [6]:
# Convert bs4 ResultSet to a list of strings
comments = []
for c in document['comment_list']:
    c = c.replace('\n',' ')
    comments.append(str(c))

In [8]:
comments[:2]

['    The idea of mountain biking in the CVNP  is very appealing. I visit the park to ski, hike, ride the towpath and train on the  hills that lead in and out of the park. So when I want to ride my mountain bike I have to go to the Cleveland Metroparks or other venues. The CNVP is a place to be active and enjoy the outdoors at the same time! From seeing the number of people that ride mountain bikes the visitation numbers would increase and bring in new people that have not been to the CNVP!',
 '     Alternative 5 proposes a limited off road, single-track bike trail of approximately 10 miles along the eastern rim of the Cuyahoga Valley National Park.   I think the CVNP should limit single-track bike trails within its federal boundaries to this east rim.  The plan suggests possible future off road bicycle development along the High Meadow/Buckeye trail area of the CVNP.   If the Cleveland Metroparks decides to put in a bike trail in the more remote southern section of the Brecksville Res

In [9]:
# Modified from Brandon Rose:
def tokenize_text(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def stem_text(text):
    tokens = tokenize_text(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(tokenizer=stem_text,
                            stop_words='english',
                            ngram_range=(1,3),
                            min_df=0.2, max_df=0.8,
                            max_features=200000)

In [14]:
tfidf_matrix = tfidf_vec.fit_transform(comments)

### K Means

In [15]:
from sklearn.cluster import KMeans

num_clusters = 12
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [18]:
import pandas as pd
comment_clusters = {'comment': comments, 'cluster': clusters}
comment_frame = pd.DataFrame(comment_clusters, index = [clusters] , columns = ['cluster', 'comment'])

In [19]:
comment_frame['cluster'].value_counts()

6     52
4     30
1     30
2     29
9     25
5     24
7     23
10    20
3     20
8     19
11    16
0     16
Name: cluster, dtype: int64

In [25]:
# Modified from Brandon Rose and
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def vocabulary_frame(text):
    tokens = tokenize_text(text)
    stems = stem_text(text)
    return pd.DataFrame({'words': tokens}, index = stems).drop_duplicates()

def extended_vocabulary_frame(texts):
    frames = []
    for t in texts:
        vf = vocabulary_frame(t)
        frames.append(vf)
    extended = pd.concat(frames).drop_duplicates()
    return extended

def km_print_top_words(model, num_clusters, vocab_frame, feature_names, n_top_words):    
    print("Top terms per cluster:\n")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] 
    for i in range(num_clusters):
        print("Cluster %d Words:" % i, end=' ')
        print(', '.join(vocab_frame.ix[feature_names[ind].split(' ')].values.tolist()[0][0]
                        for ind in order_centroids[i, :n_top_words]))
        print()

In [26]:
feature_names = tfidf_vec.get_feature_names()
comment_vf = extended_vocabulary_frame(comments)
km_print_top_words(km, num_clusters, comment_vf, feature_names, 6)

Top terms per cluster:

Cluster 0 Words: great, park, biking, people, use, natural

Cluster 1 Words: cuyahoga, valley, cuyahoga, park, national, national

Cluster 2 Words: area, biking, mountain, park, mountain, biking

Cluster 3 Words: new, support, bicycle, park, area, enjoy

Cluster 4 Words: ride, biking, mountain, mountain, cvnp, park

Cluster 5 Words: cvnp, use, biking, mountain, allow, mountain

Cluster 6 Words: mountain, mountain, mountain, biking, biking, park

Cluster 7 Words: park, biking, mountain, mountain, national, national

Cluster 8 Words: bikers, mountain, mountain, biking, mountain, park

Cluster 9 Words: use, park, bicycle, natural, new, biking

Cluster 10 Words: active, bicycle, national, national, park, communities

Cluster 11 Words: biking, ohio, mountain, mountain, support, park



### LDA

In [27]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation()
lda.fit(tfidf_matrix)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [28]:
# Modified from:
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def lda_print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [30]:
lda_print_top_words(lda, feature_names, 20)

Topic 0:
activ, bicycl, biker, mountain biker, communiti, park, mountain, nation park, nation, maintain, great, natur, support, cuyahoga valley, cuyahoga, valley, mountain bike, allow, bike, new

Topic 1:
area, new, park, bike, ohio, support, communiti, nation, use, mountain bike, mountain, bike trail, bicycl, maintain, nation park, great, mountain bike trail, cuyahoga, activ, valley

Topic 2:
bike, ride, mountain bike, mountain, bike trail, mountain bike trail, cvnp, park, area, peopl, ohio, great, enjoy, valley, communiti, support, maintain, activ, natur, use

Topic 3:
park, cuyahoga, valley, nation, nation park, cuyahoga valley, bike, mountain bike, mountain, allow, area, bike trail, use, mountain bike trail, ohio, new, support, great, bicycl, enjoy

Topic 4:
mountain, bicycl, activ, natur, park, mountain bike, communiti, ride, bike, mountain biker, cvnp, new, mountain bike trail, bike trail, allow, use, biker, area, great, enjoy

Topic 5:
mountain, bike, cvnp, park, mountain bike, 