In [2]:
import os
from gensim import corpora, models, similarities, matutils, interfaces, utils
import logging
from nltk.corpus import stopwords
from collections import defaultdict
from nltk import word_tokenize

STOP_WORDS = stopwords.words('english')

DATA_PATH = ('./pmc_data/pmc_text_files/')
GENRE_FOLDERS = os.listdir(DATA_PATH)
ARTICLE_FILE_PATHS = []
ARTICLE_FILE_TITLES = []
ARTICLE_DOCUMENT_LIST = []



def load_article_paths():
    genre_folders_left = len(GENRE_FOLDERS)
    completed_genre_folders = 0 
    for genre_folder in GENRE_FOLDERS:
        completed_genre_folders += 1
        genre_folder_path = os.path.join(DATA_PATH, genre_folder)
        genre_file_list = os.listdir(genre_folder_path)
        for article_file_title in genre_file_list:
            article_file_path = os.path.join(genre_folder_path, article_file_title)
            if os.path.isfile(article_file_path):
                ARTICLE_FILE_TITLES.append(article_file_title)
                ARTICLE_FILE_PATHS.append(article_file_path)
#                 with open(article_file_path, 'rb') as f:
#                     document = f.read()
#                     ARTICLE_DOCUMENT_LIST.append(document)

                print "done with: ", article_file_title
                print "progress: ", completed_genre_folders / float(genre_folders_left)
            else:
                sub_article_folder_list = os.listdir(article_file_path)
                for sub_article_file_title in sub_article_folder_list:
                    sub_article_file_path = os.path.join(article_file_path, 
                            sub_article_file_title)
                    ARTICLE_FILE_TITLES.append(sub_article_file_title)
                    ARTICLE_FILE_PATHS.append(sub_article_file_path)
#                     with open(sub_article_file_path, 'rb') as f:
#                         document = f.read()
#                         ARTICLE_DOCUMENT_LIST.append(document)

def to_unicode_or_bust(
        obj, encoding='utf-8'):
    if isinstance(obj, basestring):
        if not isinstance(obj, unicode):
            obj = unicode(obj, encoding)
    return obj
                        
load_article_paths()

done with:  Arch_Dis_Child_2007_Apr_11_92(4)_298-303.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_Aug_18_93(8)_654-659.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_Jul_1_93(7)_566-569.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_Jul_1_93(7)_620-625.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_May_17_93(5)_373-378.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_Oct_2_93(10)_845-850.txt
progress:  0.5
done with:  Arch_Dis_Child_2008_Oct_30_93(10)_890-898.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Feb_14_94(2)_110-116.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Feb_19_94(2)_156-160.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Jan_25_94(1)_11-15.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Jan_9_94(1)_42-46.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Nov_3_94(11)_831-833.txt
progress:  0.5
done with:  Arch_Dis_Child_2009_Oct_29_95(6)_414-421.txt
progress:  0.5
done with:  Arch_Dis_Child_2010_Aug_12_96(1)_30-37.txt
progress:  0.5


In [2]:
logging.basicConfig(format='%(asctime)s : %(Levelname)s : %(message)s', level=logging.INFO)

In [3]:
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]


In [4]:
frequency = defaultdict(int)

In [5]:
texts = [[word for word in document.lower().split() if word not in STOP_WORDS]
         for document in documents]

# Let's try to only use gensim

### Tokenize Document

In [7]:
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

### Put into dictionary

- It is possible to add to the dictionary **gradually** this is **memory friendly**

In [19]:
dictionary = corpora.Dictionary()

In [20]:
dictionary.add_documents(texts[0:1])

In [23]:
for i in range(len(texts)):
    dictionary.add_documents(texts[i:i+1])

### Let's turn the dictionary into a corpus

Why are we doing this?,  
 - Right now we have a dictionary of counts of words 
 - what we need is a term document matrix

In [30]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(6, 1), (7, 1), (9, 1), (13, 1), (14, 1)],
 [(5, 1), (7, 2), (14, 1), (15, 1), (16, 1)],
 [(9, 1), (10, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(25, 1), (26, 1), (27, 1), (28, 1)],
 [(25, 1), (26, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)],
 [(8, 1), (26, 1), (29, 1)]]

### Try this one document at a time ( Streaming)

In [35]:
class MyCorpus(object):
    def __iter__(self):
        for article_file in ARTICLE_FILE_PATHS:
            yield dictionary.doc2bow(line.lower().split())

In [53]:
corpus_memory_friendly = MyCorpus()

In [115]:
[vector for vector in corpus_memory_friendly]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(5, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1)],
 [(7, 1), (10, 1), (12, 1), (16, 1), (17, 1), (18, 1)],
 [(6, 1), (9, 1), (10, 2), (18, 1), (19, 1), (20, 1), (21, 1)],
 [(9, 1),
  (12, 1),
  (13, 1),
  (15, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(9, 1), (16, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)],
 [(9, 1), (16, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)],
 [(9, 1),
  (19, 1),
  (31, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1)],
 [(8, 1), (11, 1), (35, 1), (36, 1)]]

In [56]:
dictionary = corpora.Dictionary(line.lower().split() for line in open('gensim_example/mycorpus.txt'))

Now let's put it into a bag of words

In [3]:
class PubmedCorpus(corpora.TextCorpus):
        
    def get_texts(self):
        for filename in self.input:
            with open(filename, 'rb') as f:
                file_string = to_unicode_or_bust(f.read().lower())
                file_tokenized = word_tokenize(file_string)
                file_no_stops = [word for word in file_tokenized if word not in STOP_WORDS]
                yield file_no_stops

In [4]:
pubmed_corpus = PubmedCorpus(ARTICLE_FILE_PATHS)