In [1]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

### NOTE: all of the paths below should be relative paths

In [2]:
# Directory the book .txt files are read from (hard-coded absolute path).
# The trailing slash matters: the transform functions build file paths as `root + fname`.
source_dir = '/home/ubuntu/data_download/clean_books/'

In [3]:
# Directory that saved dictionaries, corpora and models are written to (hard-coded absolute path).
# The trailing slash matters: output paths are built by plain string concatenation.
outputs_dir = '/home/ubuntu/Capstone/outputs/full_data/'

In [4]:
class IterFile(object):
    '''
    Re-iterable wrapper around one UTF-8 encoded book .txt file.

    Every call to iter() opens the file afresh, yields its lines one at a
    time (line terminators included) and closes the file again, so the same
    object can be looped over more than once.

    filepath: path of the text file to read.
    '''

    def __init__(self, filepath):
        self.filepath = filepath
        # Most recently opened handle; None until the first iteration starts.
        self.file = None

    def _open_file(self):
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')

    def _close_file(self):
        # Safe to call even if the file was never opened.
        if self.file is not None:
            self.file.close()

    def __iter__(self):
        '''
        overwrite iteration to include file i/o

        The handle is closed in a `finally` clause so it is released even
        when the consumer stops early (break / generator.close()) or an
        exception is raised while reading.
        '''
        self._open_file()
        # Keep a local reference: another iteration started on the same
        # object would rebind self.file while this generator is suspended.
        handle = self.file
        try:
            for line in handle:
                yield line
        finally:
            handle.close()
    

In [5]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    First-pass text transformation: read one book and return it as a flat
    list of tokens.

    Lines that are exactly "\\n" are skipped. Every other line is split on
    whitespace, and each piece is lowercased and stripped of leading and
    trailing punctuation.

    fname: file name of the book, appended directly to `root`.
    root:  directory prefix (must end with a path separator).

    Superseded by v2, which calls separate helper functions for each step.
    '''
    tokens = []
    for raw_line in IterFile(root + fname):
        if raw_line != "\n":
            words = raw_line.strip('\n').split()
            tokens.extend([word.lower().strip(punctuation) for word in words])

    # TODO: add stop word removal and a frequency threshold
    return tokens

In [6]:
# Use the NLTK corpus reader only to enumerate the .txt files under source_dir.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of relying on an invalid string escape.
temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
fileid_lst = temp_corp.fileids()
fileid_lst

['1.txt',
 '10.txt',
 '100.txt',
 '10000.txt',
 '10001.txt',
 '10002-8.txt',
 '10002.txt',
 '10003-8.txt',
 '10003.txt',
 '10004-8.txt',
 '10004.txt',
 '10005-8.txt',
 '10005.txt',
 '10006.txt',
 '10007.txt',
 '10008.txt',
 '10009.txt',
 '1001.txt',
 '10010.txt',
 '10011.txt',
 '10012.txt',
 '10013.txt',
 '10014.txt',
 '10015.txt',
 '10016.txt',
 '10017.txt',
 '10018.txt',
 '10019.txt',
 '1002.txt',
 '10020.txt',
 '10021.txt',
 '10022.txt',
 '10023.txt',
 '10024.txt',
 '10025.txt',
 '10026.txt',
 '10027.txt',
 '10028.txt',
 '10029.txt',
 '1003.txt',
 '10030.txt',
 '10031.txt',
 '10032.txt',
 '10033.txt',
 '10034.txt',
 '10035.txt',
 '10036.txt',
 '10037.txt',
 '10038.txt',
 '10039.txt',
 '1004.txt',
 '10040.txt',
 '10041.txt',
 '10042.txt',
 '10043.txt',
 '10044.txt',
 '10045.txt',
 '10046.txt',
 '10047.txt',
 '10048.txt',
 '10049.txt',
 '1005.txt',
 '10050.txt',
 '10051.txt',
 '10052.txt',
 '10056.txt',
 '10057.txt',
 '10058.txt',
 '10059.txt',
 '1006.txt',
 '10060.txt',
 '10062.txt',

## EDA and Saving helper functions

To automate EDA steps

In [None]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Tokenize every book with `transform_txt_file` and compute the EDA items.

    transform_txt_file: callable taking a file id and returning a token list.
    fileid_lst:         file ids to process.

    Returns a 7-tuple:
        book_lengths           - list of (file id, total token count)
        avg_num_tokens         - mean total tokens per book, truncated to int
        dictionary             - gensim Dictionary built from all books
        dictionary_length      - number of entries in the dictionary
        unique_toks_per_fileid - zip of file ids with bag-of-words lengths
        avg_unique_toks        - mean bag-of-words length, truncated to int
        corpus                 - list of bag-of-words vectors, one per book
    '''
    tokenized_books = [transform_txt_file(fileid) for fileid in fileid_lst]

    token_counts = [len(tokens) for tokens in tokenized_books]
    book_lengths = list(zip(fileid_lst, token_counts))
    avg_num_tokens = int(np.mean(token_counts))

    dictionary = corpora.Dictionary(tokenized_books)
    dictionary_length = len(dictionary)

    corpus = []
    for tokens in tokenized_books:
        corpus.append(dictionary.doc2bow(tokens))

    # The length of a bag-of-words vector is the number of distinct known tokens in the book.
    unique_toks_num_lst = [len(bow) for bow in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))

    return (book_lengths, avg_num_tokens, dictionary, dictionary_length,
            unique_toks_per_fileid, avg_unique_toks, corpus)


def save_stuff(distinguishing_str, dictionary, corpus, outputs_dir=outputs_dir):
    '''
    Write the dictionary and corpus of the latest EDA step to disk.

    Files are named <outputs_dir><distinguishing_str>.dict and
    <outputs_dir><distinguishing_str>_corpus.mm (Matrix Market format).
    '''
    prefix = outputs_dir + distinguishing_str
    dictionary.save(prefix + '.dict')
    corpora.MmCorpus.serialize(prefix + '_corpus.mm', corpus)
    

### EDA items
* List of book lengths (total num of tokens for each book)
* Average number of tokens per book
* Number of words in corpus (dictionary length)
    * Dictionary (not viewed)
* Unique tokens per book
* Average number of unique tokens per book
    * Corpus (not viewed)
    
Save everything after eda step


### To summarize the 'simple tokenization' EDA step (#1):

In [None]:
# Run the EDA helper with the v1 (simple tokenization) transform over every file id.
output_v1 = eda(transform_txt_file_v1)

In [None]:
# Unpack the 7-tuple returned by eda(); the order must match its return statement.
book_lengths1, avg_num_tokens1, dictionary1, dictionary_length1, unique_toks_per_fileid1, \
    avg_unique_toks1, corpus1 = output_v1 

In [None]:
# Headline numbers for step 1 (simple tokenization).
print "Average number of tokens in a book: ", avg_num_tokens1
print "   "
print "Average unique tokens in a book: ", avg_unique_toks1
print "   "
print "Total number of words (dictionary length): ", dictionary_length1

In [None]:
## For the presentation, note the sparsity problem: ~9,000 vs. ~195k == ~186k empty

In [None]:
# Persist the step 1 dictionary and corpus under the 'simple_tok' prefix.
save_stuff('simple_tok', dictionary1, corpus1)

## EDA Step 2: + stop word removal

In [None]:
# Fetch the NLTK stop word lists and keep the English list as a set,
# which remove_stop_words() uses for membership tests.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
print stop

In [None]:
def transform_txt_file_v2(fname, root=source_dir, stop_words=stop):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)

    fname:      file name of the book, appended directly to `root`.
    root:       directory prefix (must end with a path separator).
    stop_words: collection of tokens to drop, or None to keep every token.

    Returns the whole book as one flat list of tokens.
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        # Skip empty lines entirely. Previously the raw "\n" string fell
        # through to extend(), which added a "\n" token per blank line.
        if empty_line_check(line):
            continue

        tokens = basic_tokenize(line)

        if stop_words is not None:
            tokens = remove_stop_words(tokens, stop_words)

        book_as_lst.extend(tokens)

    return book_as_lst

def empty_line_check(line):
    '''
    Return True when the line consists of nothing but a newline, else False.
    '''
    return line == "\n"
    
def basic_tokenize(line):
    '''
    Turn one line of text into a list of tokens.

    Newlines are trimmed from both ends, the line is split on whitespace,
    and each piece is lowercased and then stripped of leading and trailing
    punctuation. A piece made only of punctuation becomes an empty string.
    '''
    tokens = []
    for word in line.strip('\n').split():
        lowered = word.lower()
        tokens.append(lowered.strip(punctuation))
    return tokens
            
def remove_stop_words(line, stop_words):
    '''
    Return the tokens of `line` that are not in `stop_words`, in their original order.
    '''
    kept = []
    for tok in line:
        if tok in stop_words:
            continue
        kept.append(tok)
    return kept

In [None]:
# Run the EDA helper again, this time with the v2 transform (tokenize + stop word removal).
output_v2 = eda(transform_txt_file_v2)

In [None]:
# Unpack the 7-tuple returned by eda(); the order must match its return statement.
book_lengths2, avg_num_tokens2, dictionary2, dictionary_length2, \
    unique_toks_per_fileid2, avg_unique_toks2, corpus2 = output_v2

In [None]:
# Headline numbers for step 2 (stop words removed).
print "Average number of tokens in a book: ", avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length2

In [None]:
#make graph of reduction

In [None]:
# Persist the step 2 dictionary and corpus under the 'no_stopwords' prefix.
save_stuff('no_stopwords', dictionary2, corpus2)

## EDA Step 4: + frequency filters

In [None]:
def eda_w_filter(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Same EDA computation as eda(), with the dictionary passed through
    filter_extremes(no_below=1) before the corpus is built.

    transform_txt_file: callable taking a file id and returning a token list.
    fileid_lst:         file ids to process.

    Returns the 7-tuple (book_lengths, avg_num_tokens, dictionary,
    dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus).
    The token counts in book_lengths / avg_num_tokens are taken before
    filtering, so only the dictionary and corpus based items reflect it.
    '''
    tokenized_books = [transform_txt_file(fileid) for fileid in fileid_lst]

    token_counts = [len(tokens) for tokens in tokenized_books]
    book_lengths = list(zip(fileid_lst, token_counts))
    avg_num_tokens = int(np.mean(token_counts))

    dictionary = corpora.Dictionary(tokenized_books)
    # Only no_below is set here; the other thresholds keep gensim's defaults.
    dictionary.filter_extremes(no_below=1)
    dictionary_length = len(dictionary)

    corpus = []
    for tokens in tokenized_books:
        corpus.append(dictionary.doc2bow(tokens))

    unique_toks_num_lst = [len(bow) for bow in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))

    return (book_lengths, avg_num_tokens, dictionary, dictionary_length,
            unique_toks_per_fileid, avg_unique_toks, corpus)


def save_stuff(distinguishing_str, dictionary, corpus, model=None, outputs_dir=outputs_dir):
    '''
    Save the outputs of the most recent eda step

    distinguishing_str: prefix used for the output file names.
    dictionary: gensim Dictionary to save as <prefix>.dict, or None to skip.
    corpus:     bag-of-words corpus to save as <prefix>_corpus.mm, or None to skip.
    model:      accepted but not saved by this function. Defaults to None so
                the three-argument calls written for the earlier definition
                of save_stuff keep working.
    outputs_dir: directory prefix (must end with a path separator).
    '''
    # Identity checks: `!= None` would go through the objects' own comparison methods.
    if dictionary is not None:
        dictionary.save(outputs_dir + distinguishing_str + '.dict')

    if corpus is not None:
        corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)

    if model is not None:
        # Placeholder: model persistence is not implemented here.
        pass

In [None]:
# Re-run the EDA with the v2 transform, now with the dictionary frequency-filtered.
outputs4 = eda_w_filter(transform_txt_file_v2)

In [None]:
# Unpack the 7-tuple returned by eda_w_filter(); the order must match its return statement.
book_lengths4, avg_num_tokens4, dictionary4, dictionary_length4, \
    unique_toks_per_fileid4, avg_unique_toks4, corpus4 = outputs4

In [None]:
# Side-by-side comparison: frequency-filtered value first, stop-words-removed value second.
print "Average number of tokens in a book: ", avg_num_tokens4, avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks4, avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length4, dictionary_length2

In [None]:
# Persist the filtered dictionary and corpus under the 'frequency_filtered' prefix; no model is passed.
save_stuff('frequency_filtered', dictionary4, corpus4, model=None)

## Dimensionality Reduction Summary:

In [None]:
# NOTE(review): this repeats the unpacking of outputs4 already done in the
# frequency-filter section above; the names end up bound to the same values.
book_lengths4, avg_num_tokens4, dictionary4, dictionary_length4, \
    unique_toks_per_fileid4, avg_unique_toks4, corpus4 = outputs4

In [None]:
print "Average unique words per book: "
print "   ", "Initial (tokenized): ", avg_num_tokens1
print "   ", "Stop words removed: ", avg_num_tokens2
print "   ", "Frequency filtered: ", avg_num_tokens4
print "   "
print "Average unique words per book: "
print "   ", "Initial (tokenized): ", avg_unique_toks1
print "   ", "Stop words removed: ", avg_unique_toks2
print "   ", "Frequency filtered: ", avg_unique_toks4
print "   "
print "Vocabulary length: "
print "   ", "Initial (tokenized): ", dictionary_length1
print "   ", "Stop words removed: ", dictionary_length2
print "   ", "Frequency filtered: ", dictionary_length4


In [None]:
# Group the summary numbers by metric; each list is ordered [step 1, step 2, filtered].
avg_num_tokens_lst = [avg_num_tokens1, avg_num_tokens2, avg_num_tokens4]
avg_unique_toks_lst =  [avg_unique_toks1, avg_unique_toks2, avg_unique_toks4]
vocab_size_lst =[dictionary_length1, dictionary_length2, dictionary_length4]
avg_num_tokens_lst, avg_unique_toks_lst, vocab_size_lst

In [None]:
# Group the same numbers by stage; each list is ordered
# [avg unique tokens per book, avg total tokens per book, vocabulary size],
# which is the order of the x tick labels in the bar chart.
tokenized_lst = [avg_unique_toks1, avg_num_tokens1, dictionary_length1]
stop_words_removed_lst =  [avg_unique_toks2, avg_num_tokens2, dictionary_length2]
frequency_filtered_lst =[avg_unique_toks4, avg_num_tokens4, dictionary_length4]
tokenized_lst, stop_words_removed_lst, frequency_filtered_lst

In [None]:
import matplotlib.pyplot as plt
#%matplotlib inline

In [None]:
# Grouped bar chart: one group per metric, one bar per reduction stage.
n_groups = 3
pos = list(range(n_groups))
bar_width = 0.25
opacity = 0.4

dim_red_1 = tokenized_lst
dim_red_2 = stop_words_removed_lst
dim_red_3 = frequency_filtered_lst

fig, ax = plt.subplots()

# align='center' is passed explicitly so the bar centres are p, p + w and
# p + 2w regardless of the installed matplotlib's default alignment.
rects1 = ax.bar(pos,
                dim_red_1, bar_width,
                align='center',
                alpha=opacity,
                label='Initial')

rects2 = ax.bar([p + bar_width for p in pos],
                dim_red_2, bar_width,
                align='center',
                alpha=opacity,
                label='Stop Word Removal')

rects3 = ax.bar([p + bar_width*2 for p in pos],
                dim_red_3, bar_width,
                align='center',
                alpha=opacity,
                label= 'Frequency Filtering')

ax.set_ylabel('Number of Words')
ax.set_title('Counts Over Dimensionality Reduction')
# Tick under the middle bar of each group. The original referenced an
# undefined name `index` here, which raised NameError.
ax.set_xticks([p + bar_width for p in pos])
ax.set_xticklabels(('Avg unique per book',
                    'Avg total per book', 'Total vocab size'))
ax.legend()

fig.tight_layout()
plt.show()

## LDA Model

In [None]:
from gensim.models import ldamodel
from gensim.models import LdaMulticore

In [None]:
lda = ldamodel.LdaModel(corpus=corpus4,alpha='auto', id2word=dictionary2, \
                        num_topics=100, update_every=0, passes=20)
%time

In [None]:
# Persist the trained single-core LDA model into outputs_dir.
name = 'lda.model'
lda.save(outputs_dir + name)


In [None]:
# Show the model's string representation.
print lda

In [None]:
lda_multi = LdaMulticore(corpus=corpus4,alpha='auto', id2word=dictionary4, \
                        num_topics=100, update_every=0, passes=20, workers=4)
%time

In [None]:
# Persist the trained multicore LDA model into outputs_dir.
name = 'lda_multi.model'
lda_multi.save(outputs_dir + name)


In [None]:
# Show the multicore model's string representation.
print lda_multi

See the topics and their most significant terms

In [None]:
lda.show_topics()
# NOTE(review): the comment below names the same call as the line above;
# a different method (presumably print_topics) was probably meant - confirm.
#also lda.show_topics() - what is difference?

For a given document (in bow format), see most relevant topics

In [None]:
# The model was trained on corpus4, whose term ids come from dictionary4, so
# the query document has to be encoded with dictionary4 too (not dictionary2).
lda.get_document_topics(dictionary4.doc2bow(transform_txt_file_v2('1080-clean.txt')))

For a given term in vocab, see what topics are most likely/relevant

In [None]:
# 100 is presumably a term id in the model's vocabulary - confirm which word it maps to.
lda.get_term_topics(100, minimum_probability=0.00001)

Inverse of above

In [None]:
# Top terms of topic 1; the cell below indexes each result as tup[0] to look up the word.
lda.get_topic_terms(1)

In [None]:
for tup in lda.get_topic_terms(0):
    print dictionary2[tup[0]]

In [None]:
#Document alignment
#softmax - zeroing - zero out all not in the top2 alignment
    #manually create label from top word
    #need to include logic to use two topics if below certain threshold
#projection - 
#cos sim