Now let's bang out a processing routine for whole datafiles, a matrix analysis routine, and then dump them into scripts and get the Google instance grinding on a large sample.

In [1]:
import pickle
import datetime
import pandas as pd

In [2]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
from collections import Counter

In [5]:
# Collect the post records stored in the pickle files numbered 2000-2010.
post_dicts = []
for file_no in range(2000, 2011):
    pkl_path = f'postdata/jf_f12{file_no}.pkl'
    with open(pkl_path, 'rb') as handle:
        post_dicts.extend(pickle.load(handle))

In [6]:
# Number of post records loaded from the pickle files.
len(post_dicts)

1150

In [7]:
# Raw text of every post, in the same order as post_dicts.
post_text = [record['post'] for record in post_dicts]

In [8]:
# Identifier of every post, in the same order as post_dicts.
post_ids = [record['postid'] for record in post_dicts]

In [None]:
# Run every post through the spaCy pipeline (NER disabled) and rebuild it as
# a space-joined string of lemmas, skipping tokens flagged as stop words,
# tagged as punctuation, or that look like numbers.
parsed_posts = []
for doc in nlp.pipe(post_text, disable=["ner"]):
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.pos_ != 'PUNCT' and not token.like_num
    ]
    parsed_posts.append(' '.join(kept_lemmas))

In [None]:
# Number of parsed posts (one joined lemma string per input post).
len(parsed_posts)

In [None]:
# Build a bag-of-words count matrix from the lemmatised posts, additionally
# dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
doc_word_matrix = vectorizer.fit_transform(parsed_posts)
# Shape of the matrix: (number of documents, vocabulary size).
doc_word_matrix.shape

In [None]:
# Preview the first 10 rows of the document-word counts, labelled by post id
# (rows) and vocabulary term (columns).
# NOTE(review): `get_feature_names` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out`; switch if the environment is upgraded.
pd.DataFrame(doc_word_matrix.toarray(), 
             index=post_ids, columns=vectorizer.get_feature_names()).head(10)

In [None]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for 
# Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# Fit a 3-component truncated SVD on the document-word counts.
lsa = TruncatedSVD(3)
doc_topic_matrix = lsa.fit_transform(doc_word_matrix)
# Fraction of the variance explained by each component.
lsa.explained_variance_ratio_

In [None]:
# Component-by-term weights of the fitted LSA model, rounded to 3 decimals.
# Row labels are derived from the fitted model so that they stay in sync with
# the number of components requested when `lsa` was constructed.
topic_word_matrix = pd.DataFrame(
    lsa.components_.round(3),
    index=['c_' + str(i) for i in range(len(lsa.components_))],
    columns=vectorizer.get_feature_names(),
)
topic_word_matrix

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many of the top-weighted terms to print per topic.
        topic_names: Optional sequence of labels indexed by topic number. A
            missing or falsy label falls back to the topic's index.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic", topic_idx)
        # argsort is ascending, so walk backwards from the end to get the
        # largest weights first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[pos] for pos in ranked))

In [None]:
def output_topics(model, feature_names, no_top_words, topic_names=None):
    """Return the highest-weighted terms of every topic as a dictionary.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many of the top-weighted terms to keep per topic.
        topic_names: Optional sequence of labels indexed by topic number. A
            missing or falsy label falls back to the topic's index as key.

    Returns:
        dict mapping each topic's label (or index) to its list of top terms,
        ordered from highest to lowest weight.
    """
    topic_dict = {}
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; the reversed tail holds the largest weights.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in ranked]
        label = topic_names[topic_idx] if topic_names else None
        key = label if label else topic_idx
        topic_dict[key] = top_terms
    return topic_dict

In [None]:
# Print the 20 highest-weighted terms for each LSA component.
display_topics(lsa, vectorizer.get_feature_names(), 20)

In [None]:
def analyze_range(start_loc=0, num_files=10, topics=3, terms=20):
    """Run LSA over a consecutive range of pickled post files.

    Loads ``num_files`` pickle files starting at file index ``start_loc``
    from ``postdata/``, lemmatises the ``'post'`` text of every record with
    the module-level spaCy pipeline ``nlp`` (NER disabled), builds a
    bag-of-words count matrix and fits a truncated SVD on it. The explained
    variance ratio of each component is printed.

    Args:
        start_loc: Index of the first ``postdata/jf_f12<index>.pkl`` file.
        num_files: Number of consecutive files to load.
        topics: Number of SVD components to fit.
        terms: Number of top-weighted terms to report per component.

    Returns:
        The dictionary produced by ``output_topics``: component index mapped
        to its list of top terms.

    Raises:
        OSError: If one of the pickle files cannot be opened.
    """
    post_text = []
    for i in range(start_loc, start_loc + num_files):
        filename = 'postdata/jf_f12' + str(i) + '.pkl'
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(filename, 'rb') as cellar:
            post_text.extend(entry['post'] for entry in pickle.load(cellar))

    # Keep the lemma of every token that is not a stop word, punctuation or
    # number-like, and re-join them into one string per post.
    parsed_posts = [
        ' '.join(
            token.lemma_
            for token in doc
            if not token.is_stop
            and token.pos_ != 'PUNCT'
            and not token.like_num
        )
        for doc in nlp.pipe(post_text, disable=["ner"])
    ]

    vectorizer = CountVectorizer(stop_words='english')
    doc_word_matrix = vectorizer.fit_transform(parsed_posts)
    lsa = TruncatedSVD(topics)
    # Only the fitted components are needed here, not the document-topic
    # projection, so fit() is used instead of fit_transform().
    lsa.fit(doc_word_matrix)
    print("Topic ratios: ", lsa.explained_variance_ratio_)
    return output_topics(lsa, vectorizer.get_feature_names(), terms)

In [None]:
def display_analyze_range(start_loc=0, num_files=10, topics=3, terms=20):
    """Run ``analyze_range`` and print its topics instead of returning them.

    The loading, parsing and LSA fitting are delegated to ``analyze_range``
    (which also prints the explained variance ratios) so that the two entry
    points cannot drift apart. Each topic is then printed as a ``Topic <n>``
    header followed by its comma-separated top terms.

    Args:
        start_loc: Index of the first ``postdata/jf_f12<index>.pkl`` file.
        num_files: Number of consecutive files to load.
        topics: Number of SVD components to fit.
        terms: Number of top-weighted terms to print per component.

    Returns:
        None.
    """
    topic_terms = analyze_range(start_loc=start_loc, num_files=num_files,
                                topics=topics, terms=terms)
    for topic_idx, words in topic_terms.items():
        print("\nTopic", topic_idx)
        print(", ".join(words))
    return None

In [None]:
# Print 4-topic / 15-term summaries for five 50-file windows whose starting
# file indices are spaced 350 apart (0, 350, ..., 1400).
for start_record in range(0, 5 * 350, 350):
    display_analyze_range(
        start_loc=start_record,
        num_files=50,
        topics=4,
        terms=15,
    )
    print('\n')

In [None]:
# Collect the topic dictionaries of ten 50-file windows whose starting file
# indices are spaced 350 apart (0, 350, ..., 3150).
topic_list = [
    analyze_range(start_loc=window * 350, num_files=50, topics=4, terms=15)
    for window in range(10)
]

In [None]:
# Inspect the topic dictionary of the ninth window (start_loc = 8 * 350).
topic_list[8]

I have not tried analyzing the post data with NMF (non-negative matrix factorization) yet. Let me fit it on the same document-word matrix and compare its topics with the LSA ones.

In [None]:
# Fit a 3-component non-negative matrix factorisation on the same
# document-word counts that were used for LSA.
nmf_model = NMF(3)
nmf_doc_topic = nmf_model.fit_transform(doc_word_matrix)
# Component-by-term weights, rounded to 3 decimals. Row labels are derived
# from the fitted model so they stay in sync with the component count above.
nmf_topic_word = pd.DataFrame(
    nmf_model.components_.round(3),
    index=['c_' + str(i) for i in range(len(nmf_model.components_))],
    columns=vectorizer.get_feature_names(),
)
nmf_topic_word

In [None]:
# Top 10 terms per NMF topic.
display_topics(nmf_model, vectorizer.get_feature_names(), 10)

In [None]:
# Top 10 terms per LSA topic, for comparison with the NMF topics.
display_topics(lsa, vectorizer.get_feature_names(), 10)

I don't see a significant difference.

I may be able to implement my own list of stopwords / trashwords. Let me throw absolutely all my parsed & cleaned words into a massive Counter and study them from most common downward.

In [None]:
# Tally every retained lemma (not a stop word, not punctuation, not
# number-like) across all posts.
parsed_post_words = Counter()
for doc in nlp.pipe(post_text, disable=["ner"]):
    parsed_post_words.update(
        token.lemma_
        for token in doc
        if not token.is_stop and token.pos_ != 'PUNCT' and not token.like_num
    )

In [None]:
# Occurrence count of the lemma 'jeep' across all parsed posts.
parsed_post_words['jeep']

In [None]:
# The 150 most frequent lemmas, to pick out candidates for a custom stop list.
parsed_post_words.most_common(150)

In [9]:
# Add extra words and symbols to the spaCy pipeline's default stop-word set.
# NOTE(review): the token listing further down shows lemmas such as "be" and
# "get" still reported as non-stop words even though they are listed here;
# presumably `is_stop` is evaluated on the token's surface text rather than
# its lemma -- confirm against the spaCy documentation.
nlp.Defaults.stop_words |= {"like","post","get","look","go","not","good","need","think","quote",
            "originally","know","jeep","try","way","want","thing","$","yj","-",
            "sure","say","bad","/","take","Jeep","u","great","well","tell","be","lot",
            "have","fine","s","yeah","nice",}

In [34]:
# Snapshot the current stop-word set as a list so it can be passed to
# CountVectorizer's stop_words argument.
stoplist = list(nlp.Defaults.stop_words)

In [35]:
# Re-parse the posts, keeping the lemma of every token that is not a stop
# word, punctuation or number-like, then vectorise with the custom stop list.
vect_feed = []
for doc in nlp.pipe(post_text, disable=["ner"]):
    lemmas = [
        str(token.lemma_)
        for token in doc
        if not (token.is_stop or token.pos_ == 'PUNCT' or token.like_num)
    ]
    vect_feed.append(' '.join(lemmas))
cvect = CountVectorizer(stop_words=stoplist)
doc_word_clean = cvect.fit_transform(vect_feed)
doc_word_clean.shape

  'stop_words.' % sorted(inconsistent))


(1150, 4161)

In [38]:
# Check whether the custom stop word 'jeep' is still in the fitted vocabulary.
'jeep' in cvect.get_feature_names()

False

In [23]:
# Parse only the first post so its tokens can be inspected one by one.
doc = nlp(post_text[0])

In [24]:
# Report, for every token of the parsed post, which branch of the filtering
# logic it falls into (stop word, punctuation, number, or kept), together
# with its lemma.
# NOTE(review): lemmas such as "be" and "get" can show up as "Valuable word"
# even though they are in the stop list; presumably `is_stop` is evaluated on
# the token's surface text rather than its lemma -- confirm against the spaCy
# documentation.
for token in doc:
    if token.is_stop:
        print("Stopword:", token.lemma_)
    elif token.pos_ == 'PUNCT':
        print("Punctuation:", token.lemma_)
    elif token.like_num:
        print("Number:", token.lemma_)
    else:
        print("Valuable word:", token.lemma_)

Valuable word: alright
Punctuation: ,
Stopword: -PRON-
Valuable word: be
Stopword: at
Stopword: the
Valuable word: point
Stopword: be
Stopword: -PRON-
Valuable word: be
Valuable word: get
Valuable word: sick
Stopword: of
Stopword: this
Stopword: thing
Valuable word: run
Stopword: like
Valuable word: crap
Punctuation: ....
Stopword: how
Stopword: should
Valuable word: stuff
Stopword: be
Valuable word: hook
Stopword: up
Punctuation: ?
Stopword: -PRON-
Valuable word: run
Stopword: and
Stopword: -PRON-
Valuable word: drive
Stopword: but
Stopword: just
Stopword: not
Stopword: how
Stopword: i
Stopword: want
Punctuation: ...
Stopword: the
Valuable word: ignition
Valuable word: module
Stopword: be
Valuable word: disconnected
Punctuation: ...
Stopword: what
Stopword: else
Stopword: should
Stopword: i
Stopword: do
Punctuation: ?


Ugly confusing warning or not, sklearn, at least you cut out some (some!!!) of the stop words I told you to, which is absurdly more than I can say for spaCy.

Tomorrow I will have to resume by checking the results of LSA/SVD and NMF against the results from before the bonus trash-word removal, to see if I accomplished anything with all this pain.