In [1]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
class IterFile(object):
    '''
    Iterable wrapper around a single UTF-8 text file (one book document).

    Each call to __iter__ opens the file afresh, yields its lines one at a
    time and closes the handle again, so an instance can be iterated more
    than once without holding a file open in between.
    '''

    def __init__(self, filepath):
        '''
        filepath: path of the text file to read. Nothing is opened until the
        object is iterated.
        '''
        self.filepath = filepath

    def _open_file(self):
        '''
        Open self.filepath for reading as UTF-8 and store the handle on self.file.
        '''
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')

    def _close_file(self):
        '''
        Close the handle currently stored on self.file.
        '''
        self.file.close()

    def __iter__(self):
        '''
        Yield the lines of the file (line endings included), doing the file
        i/o around the iteration.

        The handle is closed when the lines run out and also when iteration
        stops early (break, exception in the consumer, generator closed or
        garbage-collected).
        '''
        self._open_file()
        # Keep a local reference: a second iteration started on the same
        # instance rebinds self.file, and each iterator must close the handle
        # it opened itself rather than whichever one is stored last.
        handle = self.file
        try:
            for line in handle:
                yield line
        finally:
            handle.close()
    

In [6]:
# Directory holding the cleaned book .txt files; it is handed to the corpus
# reader and used as the default root when a book is loaded for transformation.
# NOTE(review): absolute, user-specific path - the notebook only runs on a machine with this layout.
source_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/books/clean/'
# Directory for saved outputs.
# NOTE(review): not referenced in the visible cells (save_stuff repeats the same literal as its default) - confirm it is used further down.
outputs_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'

In [5]:
# Collect the ids of every file under source_dir whose name matches the regex.
# The pattern is a raw string so that the regex escape \. is not parsed as an
# (invalid) string escape, which newer Python versions warn about.
temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
fileid_lst = temp_corp.fileids()
fileid_lst

['10-clean.txt',
 '100-clean.txt',
 '105-clean.txt',
 '108-clean.txt',
 '1080-clean.txt',
 '11-clean.txt',
 '1112-clean.txt',
 '1184-clean.txt',
 '12-clean.txt',
 '120-clean.txt',
 '1232-clean.txt',
 '1260-clean.txt',
 '1322-clean.txt',
 '1342-clean.txt',
 '135-clean.txt',
 '1399-clean.txt',
 '140-clean.txt',
 '1400-clean.txt',
 '1404-clean.txt',
 '14264-clean.txt',
 '147-clean.txt',
 '1497-clean.txt',
 '15399-clean.txt',
 '158-clean.txt',
 '16-clean.txt',
 '160-clean.txt',
 '161-clean.txt',
 '16382-clean.txt',
 '1661-clean.txt',
 '1727-clean.txt',
 '174-clean.txt',
 '1952-yellow_wallpaper-clean.txt',
 '19942-clean.txt',
 '20-clean.txt',
 '20203-clean.txt',
 '203-clean.txt',
 '205-clean.txt',
 '21279-clean.txt',
 '2148-clean.txt',
 '2174-clean.txt',
 '219-clean.txt',
 '224-clean.txt',
 '23-clean.txt',
 '236-clean.txt',
 '2500-clean.txt',
 '25305-clean.txt',
 '2591-clean.txt',
 '2600-clean.txt',
 '2680-clean.txt',
 '2701-moby-clean.txt',
 '28054-clean.txt',
 '2814-clean.txt',
 '2852-cle

In [7]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Run a transformation function over every book and return all the eda items.

    transform_txt_file: callable invoked once per file id; it must return that
        book as a list of tokens.
    fileid_lst: file ids to process. The default is the module-level
        fileid_lst as it was when this function was defined.

    Returns a 6-tuple:
        book_lengths            list of (fileid, number of tokens) pairs
        avg_num_tokens          mean number of tokens per book, as an int
        dictionary              corpora.Dictionary built from all the books
        dictionary_length       len() of that dictionary
        unique_toks_per_fileid  list of (fileid, bag-of-words length) pairs
        avg_unique_toks         mean bag-of-words length per book, as an int
    '''
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]

    # Token counts are computed once and reused for the per-book pairs and the mean.
    token_counts = [len(book) for book in all_transf_books_lst]
    book_lengths = list(zip(fileid_lst, token_counts))
    avg_num_tokens = int(np.mean(token_counts))

    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)

    # One bag-of-words per book; the length of each bag is what is reported
    # as that book's number of unique tokens.
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    unique_toks_num_lst = [len(bow) for bow in corpus]

    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # that would be empty after the caller looks at it once.
    unique_toks_per_fileid = list(zip(fileid_lst, unique_toks_num_lst))
    avg_unique_toks = int(np.mean(unique_toks_num_lst))

    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks


def save_stuff(dictionary, corpus, distinguishing_str, outputs_dir='/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'):
    '''
    Save the outputs of the most recent eda step.

    dictionary: object whose save(path) method is called to write
        <distinguishing_str>.dict
    corpus: corpus passed to corpora.MmCorpus.serialize and written to
        <distinguishing_str>_corpus.mm
    distinguishing_str: file-name prefix identifying this eda step
    outputs_dir: directory that receives both files; a trailing slash is
        optional
    '''
    # os.path.join instead of string concatenation, so an outputs_dir given
    # without a trailing separator still yields a path inside that directory.
    dict_path = os.path.join(outputs_dir, distinguishing_str + '.dict')
    corpus_path = os.path.join(outputs_dir, distinguishing_str + '_corpus.mm')

    dictionary.save(dict_path)
    corpora.MmCorpus.serialize(corpus_path, corpus)
    

In [11]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    Top-level function to call all of the subfunctions for text transformation.

    Reads the file line by line, drops empty lines and tokenizes the rest.

    fname: file name of the book, relative to root
    root: directory containing the book; the default is the module-level
        source_dir as it was when this function was defined

    Returns the whole book as one flat list of tokens.
    '''
    fp = os.path.join(root, fname)

    book_as_lst = []
    for line in IterFile(fp):
        # Tokenize only the lines that are NOT empty. The previous condition
        # was inverted (it kept just the empty lines), so every book came
        # back as an empty token list.
        if not empty_line_check(line):
            book_as_lst.extend(basic_tokenize(line))

    return book_as_lst
     

def empty_line_check(line):
    '''
    Check whether a line read from a text file is empty.

    A line counts as empty when nothing remains after stripping whitespace.
    That covers a bare newline, a CR+LF line ending (which reaches this
    function untranslated when the file is read without newline conversion)
    and lines holding only spaces or tabs.

    Returns True for an empty line, False otherwise.
    '''
    return not line.strip()

    
def basic_tokenize(line):
    '''
    Convert a line of text to a list of tokens.

    The line is split on whitespace; each piece is lowercased and has leading
    and trailing ASCII punctuation stripped (punctuation inside a word, such
    as an apostrophe, is kept). Pieces that consist of punctuation only are
    dropped rather than returned as empty strings.

    Returns a list of non-empty token strings, possibly empty.
    '''
    # split() with no argument already discards the trailing newline and any
    # other surrounding whitespace.
    stripped = (piece.lower().strip(punctuation) for piece in line.split())
    # Filter out the '' left behind by pieces like '--' or '...'.
    return [tok for tok in stripped if tok]
        
            
def remove_stop_words():
    '''
    Placeholder, not implemented yet: takes no arguments and returns None.
    '''
    return None
    
def lemmatize():
    '''
    Placeholder, not implemented yet: takes no arguments and returns None.
    '''
    return None
    
    

In [12]:
# Run the eda pass with the v1 transformation and unpack its six results.
(book_lengths,
 avg_num_tokens,
 dictionary,
 dictionary_length,
 unique_toks_per_fileid,
 avg_unique_toks) = eda(transform_txt_file_v1)