In [2]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

Using TensorFlow backend.


### THESE SHOULD BE ALL THE RELATIVE PATHS

In [3]:
source_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/Capstone/books/clean' + '/'
outputs_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/Capstone/outputs' + '/'

In [4]:
class IterFile(object):
    '''
    class object to do the iterating on individual book txt documents, including file i/o.
    '''
    
    def __init__(self, filepath):
        self.filepath = filepath
        
    def _open_file(self):
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')
        
    def _close_file(self):
        self.file.close()
    
    def __iter__(self):
        '''
        overwrite iteration to include file i/o
        '''
        self._open_file()
        
        for line in self.file:
            yield line
        
        self._close_file()
    

In [4]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    Initial pass at text transformation
    Reimplemented later (v2 etc) as a caller of various subfunctions to do all the transformation
    '''
    fp = root + fname

    book_as_lst = []
    for line in IterFile(fp):
        if line == "\n":
            pass
        else: 
            line_lst= [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
            book_as_lst.extend(line_lst)
            
    #add in stop word removal and frequency threshhold
    return book_as_lst

In [5]:
temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
fileid_lst = temp_corp.fileids()
fileid_lst

['10-clean.txt',
 '100-clean.txt',
 '105-clean.txt',
 '108-clean.txt',
 '1080-clean.txt',
 '11-clean.txt',
 '1112-clean.txt',
 '1184-clean.txt',
 '12-clean.txt',
 '120-clean.txt',
 '1232-clean.txt',
 '1260-clean.txt',
 '1322-clean.txt',
 '1342-clean.txt',
 '135-clean.txt',
 '1399-clean.txt',
 '140-clean.txt',
 '1400-clean.txt',
 '1404-clean.txt',
 '14264-clean.txt',
 '147-clean.txt',
 '1497-clean.txt',
 '15399-clean.txt',
 '158-clean.txt',
 '16-clean.txt',
 '160-clean.txt',
 '161-clean.txt',
 '16382-clean.txt',
 '1661-clean.txt',
 '1727-clean.txt',
 '174-clean.txt',
 '1952-yellow_wallpaper-clean.txt',
 '19942-clean.txt',
 '20-clean.txt',
 '20203-clean.txt',
 '203-clean.txt',
 '205-clean.txt',
 '21279-clean.txt',
 '2148-clean.txt',
 '2174-clean.txt',
 '219-clean.txt',
 '224-clean.txt',
 '23-clean.txt',
 '236-clean.txt',
 '2500-clean.txt',
 '25305-clean.txt',
 '2591-clean.txt',
 '2600-clean.txt',
 '2680-clean.txt',
 '2701-moby-clean.txt',
 '28054-clean.txt',
 '2814-clean.txt',
 '2852-cle

## EDA and Saving helper funtions

To automate EDA steps

In [6]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, outputs_dir=outputs_dir):
    '''
    Save the outputs of the most recent eda step
    '''
    dictionary.save(outputs_dir + distinguishing_str + '.dict')
    corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    

### EDA items
* List of book lengths (total num of tokens for each book)
* Average number of tokens per book
* Number of words in corpus (dictionary length)
    * Dictionary (not viewed)
* Unique tokens per book
* Average number of unique tokens per book
    * Corpus (not viewe)
    
Save everything after eda step


### To summarize the 'simple tokenization' EDA step (#1):

In [7]:
output_v1 = eda(transform_txt_file_v1)

In [8]:
book_lengths1, avg_num_tokens1, dictionary1, dictionary_length1, unique_toks_per_fileid1, \
    avg_unique_toks1, corpus1 = output_v1 

In [9]:
print "Average number of tokens in a book: ", avg_num_tokens1
print "   "
print "Average unique tokens in a book: ", avg_unique_toks1
print "   "
print "Total number of words (dictionary length): ", dictionary_length1

Average number of tokens in a book:  132124
   
Average unique tokens in a book:  9004
   
Total number of words (dictionary length):  195104


In [10]:
##for pres, note the sparcity problem - 9000 vs. 195k == 186k empty

In [11]:
save_stuff('simple_tok', dictionary1, corpus1)

## EDA Step 2: + stop word removal

In [12]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
print stop

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelbrynsvold/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'unt

In [43]:
def transform_txt_file_v2(fname, root=source_dir, stop_words=stop):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            line = basic_tokenize(line)
            
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
        
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty
    
def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]    
            
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 

In [14]:
output_v2 = eda(transform_txt_file_v2)

In [15]:
book_lengths2, avg_num_tokens2, dictionary2, dictionary_length2, \
    unique_toks_per_fileid2, avg_unique_toks2, corpus2 = output_v2

In [16]:
print "Average number of tokens in a book: ", avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length2

Average number of tokens in a book:  66904
   
Average unique tokens in a book:  8882
   
Total number of words (dictionary length):  194961


In [17]:
#make graph of reduction

In [18]:
save_stuff('no_stopwords', dictionary2, corpus2)

## EDA Step 4: + frequency filters

In [19]:
def eda_w_filter(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary.filter_extremes(no_below=1)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, model, outputs_dir=outputs_dir):
    '''
    Save the outputs of the most recent eda step
    '''
    if dictionary != None:
        dictionary.save(outputs_dir + distinguishing_str + '.dict')
        
    if corpus != None:
        corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    
    if model != None:
        pass

In [20]:
outputs4 = eda_w_filter(transform_txt_file_v2)

In [21]:
book_lengths4, avg_num_tokens4, dictionary4, dictionary_length4, \
    unique_toks_per_fileid4, avg_unique_toks4, corpus4 = outputs4

In [22]:
print "Average number of tokens in a book: ", avg_num_tokens4, avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks4, avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length4, dictionary_length2

Average number of tokens in a book:  66904 66904
   
Average unique tokens in a book:  5461 8882
   
Total number of words (dictionary length):  100000 194961


In [23]:
save_stuff('frequency_filtered', dictionary4, corpus4, model=None)

## Dimensionality Reduction Summary:

In [24]:
book_lengths4, avg_num_tokens4, dictionary4, dictionary_length4, \
    unique_toks_per_fileid4, avg_unique_toks4, corpus4 = outputs4

In [30]:
print "Average total words per book: "
print "   ", "Initial (tokenized): ", avg_num_tokens1
print "   ", "Stop words removed: ", avg_num_tokens2
print "   ", "Frequency filtered: ", avg_num_tokens4
print "   "
print "Average unique words per book: "
print "   ", "Initial (tokenized): ", avg_unique_toks1
print "   ", "Stop words removed: ", avg_unique_toks2
print "   ", "Frequency filtered: ", avg_unique_toks4
print "   "
print "Vocabulary length: "
print "   ", "Initial (tokenized): ", dictionary_length1
print "   ", "Stop words removed: ", dictionary_length2
print "   ", "Frequency filtered: ", dictionary_length4


Average total words per book: 
    Initial (tokenized):  132124
    Stop words removed:  66904
    Frequency filtered:  66904
   
Average unique words per book: 
    Initial (tokenized):  9004
    Stop words removed:  8882
    Frequency filtered:  5461
   
Vocabulary length: 
    Initial (tokenized):  195104
    Stop words removed:  194961
    Frequency filtered:  100000


In [26]:
avg_num_tokens_lst = [avg_num_tokens1, avg_num_tokens2, avg_num_tokens4]
avg_unique_toks_lst =  [avg_unique_toks1, avg_unique_toks2, avg_unique_toks4]
vocab_size_lst =[dictionary_length1, dictionary_length2, dictionary_length4]
avg_num_tokens_lst, avg_unique_toks_lst, vocab_size_lst

([132124, 66904, 66904], [9004, 8882, 5461], [195104, 194961, 100000])

In [27]:
tokenized_lst = [avg_unique_toks1, avg_num_tokens1, dictionary_length1]
stop_words_removed_lst =  [avg_unique_toks2, avg_num_tokens2, dictionary_length2]
frequency_filtered_lst =[avg_unique_toks4, avg_num_tokens4, dictionary_length4]
tokenized_lst, stop_words_removed_lst, frequency_filtered_lst

([9004, 132124, 195104], [8882, 66904, 194961], [5461, 66904, 100000])

In [28]:
import matplotlib.pyplot as plt
#%matplotinline

In [29]:
n_groups = 3
pos = list(range(n_groups))

dim_red_1 = tokenized_lst

dim_red_2 = stop_words_removed_lst

dim_red_3 = frequency_filtered_lst


fig, ax = plt.subplots()

pos = list(range(n_groups))
bar_width = 0.25

opacity = 0.4
error_config = {'ecolor': '0.3'}

rects1 = plt.bar(pos, 
                 dim_red_1, bar_width,
                 alpha=opacity,
                 #color='b',
                 label='Initial')

rects2 = plt.bar([p + bar_width for p in pos], 
                 dim_red_2, bar_width,
                 alpha=opacity,
                 #color='b',
                 label='Stop Word Removal')

rects3 = plt.bar([p + bar_width*2 for p in pos], 
                 dim_red_3, bar_width,
                 alpha=opacity,
                 #color='b',
                 label= 'Frequency Filtering')

#plt.xlabel('Group')
plt.ylabel('Number of Words')
plt.title('Counts Over Dimensionality Reduction')
plt.xticks(index + bar_width / 2, ('Avg unique per book', \
                        'Avg total per book', 'Total vocab size'))
plt.legend()

plt.tight_layout()
plt.show()

NameError: name 'index' is not defined

In [33]:
all_transf_books_lst = [transform_txt_file_v2(f) for f in fileid_lst]
all_tokens = sum(all_transf_books_lst, [])
all_tokens

[u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'old',
 u'testament',
 u'king',
 u'james',
 u'version',
 u'bible',
 u'\n',
 u'\n',
 u'\n',
 u'\n',
 u'first',
 u'book',
 u'moses',
 u'called',
 u'genesis',
 u'\n',
 u'\n',
 u'1:1',
 u'beginning',
 u'god',
 u'created',
 u'heavens',
 u'earth',
 u'\n',
 u'1:2',
 u'earth',
 u'without',
 u'form',
 u'void',
 u'darkness',
 u'upon',
 u'face',
 u'deep',
 u'spirit',
 u'god',
 u'moved',
 u'upon',
 u'face',
 u'waters',
 u'\n',
 u'1:3',
 u'god',
 u'said',
 u'let',
 u'light',
 u'light',
 u'\n',
 u'1:4',
 u'god',
 u'saw',
 u'light',
 u'good',
 u'god',
 u'divided',
 u'light',
 u'darkness',
 u'\n',
 u'1:5',
 u'god',
 u'called',
 u'light',
 u'day',
 u'darkness',
 u'called',
 u'night',
 u'evening',
 u'morning',
 u'first',
 u'day',
 u'\n',
 u'1:6',
 u'god',
 u'said',
 u'let',
 u'firmament',
 u'midst',
 u'waters',
 u'let',
 u'divide',
 u'waters',
 u'waters',
 u'\n',
 u'1:7',
 u'god',
 u'made',


In [35]:
all_tokens_set = set(all_tokens)

In [37]:
tokens_once = set([word for word in all_tokens_set if all_tokens.count(word) == 1])
tokens_once
#texts = [[word for word in text if word not in tokens_once] for text in texts]

KeyboardInterrupt: 

In [44]:
b_l = transform_txt_file_v2(f)
b_l

[u'quixote',
 u'complete',
 u'miguel',
 u'de',
 u'cervantes',
 u'saavedra',
 u'translated',
 u'john',
 u'ormsby',
 u'contents',
 u'volume',
 u'chapter',
 u'treats',
 u'character',
 u'pursuits',
 u'famous',
 u'gentleman',
 u'quixote',
 u'la',
 u'mancha',
 u'chapter',
 u'ii',
 u'treats',
 u'first',
 u'sally',
 u'ingenious',
 u'quixote',
 u'made',
 u'home',
 u'chapter',
 u'iii',
 u'wherein',
 u'related',
 u'droll',
 u'way',
 u'quixote',
 u'dubbed',
 u'knight',
 u'chapter',
 u'iv',
 u'happened',
 u'knight',
 u'left',
 u'inn',
 u'chapter',
 u'v',
 u'narrative',
 u"knight's",
 u'mishap',
 u'continued',
 u'chapter',
 u'vi',
 u'diverting',
 u'important',
 u'scrutiny',
 u'curate',
 u'barber',
 u'made',
 u'library',
 u'ingenious',
 u'gentleman',
 u'chapter',
 u'vii',
 u'second',
 u'sally',
 u'worthy',
 u'knight',
 u'quixote',
 u'la',
 u'mancha',
 u'chapter',
 u'viii',
 u'good',
 u'fortune',
 u'valiant',
 u'quixote',
 u'terrible',
 u'undreamt-of',
 u'adventure',
 u'windmills',
 u'occurrences',
 u

In [None]:
#this goes to slow - see this post for frequency filtering (if time):
#https://stackoverflow.com/questions/24688116/how-to-filter-out-words-with-low-tf-idf-in-a-corpus-with-gensim

In [49]:
all_tokens_set = set(b_l)
%time
print "tokens set done"
tokens_once = set(word for word in all_tokens_set if b_l.count(word) <= 1)
%time
print "part two done"
print [word for word in b_l if word not in tokens_once]
%time

CPU times: user 20 µs, sys: 15 µs, total: 35 µs
Wall time: 64.1 µs
tokens set done


KeyboardInterrupt: 

## LDA Model

In [50]:
from gensim.models import ldamodel
from gensim.models import LdaMulticore

In [51]:
lda = ldamodel.LdaModel(corpus=corpus4,alpha='auto', id2word=dictionary2, \
                        num_topics=100, update_every=0, passes=20)
%time

KeyboardInterrupt: 

In [None]:
name = 'lda.model'
lda.save(outputs_dir + name)


In [None]:
print lda

In [54]:
lda_multi = LdaMulticore(corpus=corpus4, id2word=dictionary4, \
                        num_topics=100, passes=20, workers=4)
%time

Process PoolWorker-4:
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-1:
Process PoolWorker-2:
Process PoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Traceback (most recent call last):
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/rachelbrynsvold/anaconda/lib/python2.7/multiprocessing/process.py", l

KeyboardInterrupt: 

In [None]:
name = 'lda_multi.model'
lda_multi.save(outputs_dir + name)


In [None]:
print lda_multi

See the topics and their most significant terms

In [None]:
lda.show_topics()
#also lda.show_topics() - what is difference?

For a given document (in bow format), see most relevant topics

In [None]:
lda.get_document_topics(dictionary2.doc2bow(transform_txt_file_v2('1080-clean.txt')))

For a given term in vocab, see what topics are most likely/relevant

In [None]:
lda.get_term_topics(100, minimum_probability=0.00001)

Inverse of above

In [None]:
lda.get_topic_terms(1)

In [None]:
for tup in lda.get_topic_terms(0):
    print dictionary2[tup[0]]

In [None]:
#Document alignment
#softmax - zeroing - zero out all not in the top2 alignment
    #manually creat label from top word
    #need to include logic to use two topics if below certain threshhold
#projection - 
#cos sim