In [1]:
import pandas as pd
import re
import collections

## Idea: Profile the data
- Total documents
- Total word occurrences
- Vocab size
- Words occurring > 1000 times
- Words occurring once

#1. Lower case
#2. Tokenize
#3. Remove stop words
#4. Stemming

In [2]:
# Open the two raw dump files as UTF-8 text.
# These handles are later passed to get_preprocessed_docs(), which calls
# .readlines() on them, so each handle yields its content only once; re-run
# this cell before re-running the preprocessing cells.
# NOTE(review): the handles are never closed in the visible part of the notebook.
doc_dump_file = open('doc_dump.txt', "r", encoding="utf8")
nfdump_file= open('nfdump.txt', "r", encoding="utf8")

In [3]:
#ref.: https://stackoverflow.com/questions/29312508/how-do-i-remove-duplicate-words-from-a-list-in-python-without-using-sets

def get_unique_tokens(file):
    """Return the distinct processed tokens of a text file, in first-seen order.

    Every whitespace-separated token is passed through the regex substitution
    below, lower-cased and stripped; the result is kept unless it consists
    only of digits or has been seen before.

    Args:
        file: Path of the text file; it is read as UTF-8.

    Returns:
        list of str: the unique processed tokens, ordered by first occurrence.
    """
    vocab_tokens = []
    seen = set()  # O(1) membership test; scanning the result list was quadratic

    # The context manager guarantees the file is closed when reading is done.
    with open(file, "r", encoding="utf8") as input_file:
        for line in input_file:
            # split() without arguments already ignores leading/trailing whitespace.
            for word in line.split():
                # NOTE(review): this substitution *removes* every run of letters that
                # is delimited by word boundaries, so a purely alphabetic token
                # becomes '' and "foo," becomes ",". Confirm this is the intended
                # definition of a vocabulary token.
                word = re.sub(r'\b[^\W\d_]+\b', '', word)
                word = word.lower().strip()
                if word.isdigit() or word in seen:
                    continue
                seen.add(word)
                vocab_tokens.append(word)

    return vocab_tokens

In [4]:
def print_corpus_statistics(file, most_common):
    """Print word-frequency statistics for a text file.

    Words are runs of letters found by a regular expression and lower-cased
    before counting. The function prints the total number of word
    occurrences, the vocabulary size, the ``most_common`` most frequent words
    with their counts, and how many words occur more than 1000 times and
    exactly once.

    Args:
        file: Path of the text file to profile.
        most_common: Number of most frequent words to list.

    Returns:
        None; everything is written to standard output.
    """
    # ref. https://stackoverflow.com/questions/25985299/create-python-dictionary-from-text-file-and-retrieve-count-of-each-word
    # Read as UTF-8, the encoding get_unique_tokens() uses for the same file,
    # rather than whatever the platform default happens to be.
    with open(file, "r", encoding="utf8") as f:
        c = collections.Counter(
            word.lower()
            for line in f
            for word in re.findall(r'\b[^\W\d_]+\b', line))

    # NOTE(review): the vocabulary size comes from get_unique_tokens(), which
    # tokenises differently from the counter above, and not from len(c).
    vocab_size = len(get_unique_tokens(file))

    print("Total word occurences: %d" % sum(c.values()))
    print("Vocabulary size: %d" % vocab_size)

    print('Most common words:')
    for word, count in c.most_common(most_common):
        print('%s: %7d' % (word, count))

    # Only the totals are reported, so count directly instead of sorting the
    # whole counter and collecting the words into lists.
    occurring_1000_plus = sum(1 for count in c.values() if count > 1000)
    occurring_once = sum(1 for count in c.values() if count == 1)

    print("There are %d words occuring > 1000 times" % occurring_1000_plus)
    print("There are %d words occuring once" % occurring_once)

In [5]:
# Profile the document dump, listing its 10 most frequent words.
print_corpus_statistics('doc_dump.txt',10)

Total word occurences: 1291434
Vocabulary size: 24317
Most common words:
of:   52806
the:   52621
and:   51085
in:   34773
to:   23497
a:   19891
with:   14393
for:   12766
were:   10567
was:    9960
There are 164 words occuring > 1000 times
There are 5474 words occuring once


In [6]:
# Profile the nfdump file, listing its 10 most frequent words.
print_corpus_statistics('nfdump.txt',10)

Total word occurences: 6047074
Vocabulary size: 33872
Most common words:
the:  215366
and:  140541
to:  136859
of:  129305
a:  109183
i:  107960
http:   91192
in:   88789
is:   82461
org:   79254
There are 727 words occuring > 1000 times
There are 30710 words occuring once


# Preprocess all docs and queries

In [7]:
# Load the stop-word file into a set (one stripped entry per line) so that
# membership tests are O(1). The context manager closes the file; the bare
# open() inside the generator expression left the handle open.
with open('stopwords.large') as stopwords_file:
    stop_words = {line.strip() for line in stopwords_file}


In [9]:
def remove_digits(line):
    """Drop every whitespace-separated token that consists solely of digits.

    The surviving tokens are re-joined with single spaces.
    """
    kept = [token for token in line.split() if not token.isdigit()]
    return ' '.join(kept)

In [10]:
#preprocess_line(test)

In [11]:
def clean_url(text):
    """Remove URL fragments from ``text`` by applying four substitutions in order."""
    url_patterns = (
        # leading "//ww(w)..." address ending in "/" plus eight digits
        r'^//www?.*/([\d]{8})',
        # case http://www.ncbi.nlm.nih.gov/pubmed/15072585
        r'http://www.ncbi?(.{0,20})([\d]{8})',
        # leading "htt(p)://" address, greedily up to a later "/"
        r'^http?:\/\/.*[\r\n]*/',
        # case http://nutritionfacts.org/topics/heart-health/
        r'(http:)?//nutritionfacts.org(.{0,20})(/{1})',
    )
    for pattern in url_patterns:
        text = re.sub(pattern, '', text)
    return text

In [12]:
def preprocess_line(line):
    """Normalise one line of raw text into a cleaned, space-separated string.

    URLs are stripped from the whole line; then every whitespace-separated
    token has URLs stripped again, punctuation replaced by spaces, is
    lower-cased and trimmed, and is dropped when it is a stop word. Finally
    digit-only tokens are removed from the joined result.

    Args:
        line: Raw text.

    Returns:
        str: The cleaned text. Empty or whitespace-only input yields ''
        (this used to raise UnboundLocalError, because the joined line was
        only assigned inside the token loop).
    """
    words_to_keep = []
    for word in clean_url(line).split():
        # Run clean_url per token as well, so its start-anchored patterns also
        # apply to tokens that are not at the beginning of the line.
        word = clean_url(word)
        # NOTE(review): replacing punctuation with a space can leave an inner
        # space ("it's" -> "it s"); the stop-word test below is applied to that
        # whole string, not to its parts.
        word = re.sub(r'[^\w\s]', ' ', word)
        word = word.lower().strip()
        if word not in stop_words:
            words_to_keep.append(word)
    # Join once after the loop instead of rebuilding the string per token.
    return remove_digits(' '.join(words_to_keep))
    

In [13]:
def parse_docs(lines, keyword):
    """Turn raw dump lines into ``(docid, preprocessed_text)`` tuples.

    Each line is split at its first ``'http:'``: the part before it is the id
    field, the part after it is the text. Only lines whose id field contains
    ``keyword`` are kept; tabs are removed from the id and the text is run
    through ``preprocess_line``.

    Args:
        lines: Iterable of raw text lines.
        keyword: Substring the id field must contain for the line to be kept.

    Returns:
        list of (str, str) tuples, in input order.

    Raises:
        ValueError: If a non-blank line does not contain ``'http:'``.
    """
    data = []
    for line in lines:
        # A blank line (e.g. a trailing newline at the end of the file) holds
        # no record; without this guard the unpacking below raised ValueError.
        if not line.strip():
            continue
        docid, text = line.split('http:', 1)

        if keyword in docid:
            docid = docid.replace('\t', '')
            data.append((docid, preprocess_line(text)))
    return data

In [14]:
def get_preprocessed_docs(file, keyword):
    """Read all lines from the open file object ``file`` and return what parse_docs yields for ``keyword``."""
    return parse_docs(file.readlines(), keyword)

In [15]:
# Parse and preprocess the document dump, keeping the records whose id field
# contains 'MED'. This reads doc_dump_file to its end.
data_docs=get_preprocessed_docs(doc_dump_file, 'MED')

In [16]:
len(data_docs)

5371

In [28]:
data_docs[0]

('MED-1',
 'birth weight head circumference prenatal exposure acrylamide maternal diet european prospective mother child study newgeneris abstract background acrylamide common dietary exposure crosses human placenta classified probable human carcinogen developmental toxicity observed rodents objectives examined associations prenatal exposure acrylamide birth outcomes prospective european mother child study methods hemoglobin hb adducts acrylamide metabolite glycidamide measured cord blood reflecting cumulated exposure months pregnancy singleton pregnant women recruited denmark england greece norway spain maternal diet estimated food frequency questionnaires results acrylamide glycidamide hb adducts statistically significant reduction birth weight head circumference estimated difference birth weight infants highest versus lowest quartile acrylamide hb adduct levels adjusting gestational age country ci difference head circumference cm ci findings similar infants nonsmokers consistent cou

In [17]:
# Parse and preprocess the nfdump file, keeping the records whose id field
# contains 'PLAIN'. This reads nfdump_file to its end.
data_queries=get_preprocessed_docs(nfdump_file, 'PLAIN')

In [18]:
len(data_queries)

3437

# Split according to ids

## Split docs

In [19]:
# Read the training document ids from a header-less file and name the column 'id'.
train_ids=pd.read_csv('train.docs.ids', header=None)
train_ids.columns=['id']

In [20]:
# Plain Python list of the training document ids, later passed to get_split().
train_ids_list = train_ids['id'].tolist()

In [21]:
# Read the test document ids from a header-less file and name the column 'id'.
test_ids=pd.read_csv('test.docs.ids', header=None)
test_ids.columns=['id']

In [22]:
# Plain Python list of the test document ids, later passed to get_split().
test_ids_list = test_ids['id'].tolist()

In [23]:
# Read the dev document ids from a header-less file and name the column 'id'.
dev_ids=pd.read_csv('dev.docs.ids', header=None)
dev_ids.columns=['id']

In [24]:
# Plain Python list of the dev document ids, later passed to get_split().
dev_ids_list = dev_ids['id'].tolist()

In [37]:
def get_split(docs_file, train_ids_list, test_ids_list, dev_ids_list):
    """Distribute ``(doc_id, text)`` pairs over train, test and dev splits.

    Membership is tested against each id collection independently, so a
    document whose id appears in several collections is placed in several
    splits, and a document whose id appears in none is dropped.

    Args:
        docs_file: Iterable of ``(doc_id, text)`` pairs (despite the name, not
            a file object).
        train_ids_list: Ids belonging to the training split.
        test_ids_list: Ids belonging to the test split.
        dev_ids_list: Ids belonging to the dev split.

    Returns:
        tuple: ``(train_docs, test_docs, dev_docs)``, each a list of
        ``(doc_id, text)`` tuples in input order.
    """
    # Convert once to sets: a list lookup per document made the split
    # O(len(docs) * len(ids)); set lookups are O(1) with identical results.
    train_ids = set(train_ids_list)
    test_ids = set(test_ids_list)
    dev_ids = set(dev_ids_list)

    train_docs = []
    test_docs = []
    dev_docs = []

    for doc_id, text in docs_file:
        doc = (doc_id, text)
        # Independent checks (not elif): the splits are allowed to overlap.
        if doc_id in train_ids:
            train_docs.append(doc)
        if doc_id in test_ids:
            test_docs.append(doc)
        if doc_id in dev_ids:
            dev_docs.append(doc)
    return train_docs, test_docs, dev_docs

In [26]:
# Partition the preprocessed documents by the three document-id lists.
# NOTE: get_split() checks each id list independently, so a document can end
# up in more than one split; the recorded split sizes (3612 + 3162 + 3193)
# add up to more than the 5371 parsed documents.
train_docs, test_docs, dev_docs= get_split(data_docs, train_ids_list, test_ids_list, dev_ids_list)

In [29]:
len(train_docs)

3612

In [30]:
len(test_docs)

3162

In [31]:
len(dev_docs)

3193

## Split queries

In [34]:
# Read the training query ids from a header-less file, name the column 'id',
# and keep them as a plain Python list for get_split().
train_ids_q=pd.read_csv('train.queries.ids', header=None)
train_ids_q.columns=['id']
train_ids_list_q = train_ids_q['id'].tolist()

In [35]:
# Read the test query ids from a header-less file, name the column 'id',
# and keep them as a plain Python list for get_split().
test_ids_q=pd.read_csv('test.queries.ids', header=None)
test_ids_q.columns=['id']
test_ids_list_q = test_ids_q['id'].tolist()

In [36]:
# Read the dev query ids from a header-less file, name the column 'id',
# and keep them as a plain Python list for get_split().
dev_ids_q=pd.read_csv('dev.queries.ids', header=None)
dev_ids_q.columns=['id']
dev_ids_list_q = dev_ids_q['id'].tolist()

In [38]:
# Partition the preprocessed queries by the three query-id lists, reusing the
# same get_split() helper that is applied to the documents.
train_queries, test_queries, dev_queries= get_split(data_queries, train_ids_list_q, test_ids_list_q, dev_ids_list_q)

In [41]:
len(train_queries)

2594

In [45]:
train_queries[0]

('PLAIN-3',
 'breast cancer cells feed on cholesterol breast cancer cells feed cholesterol american women diagnosed breast cancer lifetime number compounds plant foods protect breast cancer variety mechanisms i ve talked benefits broccoli flaxseeds soy foods breast cancer survival vegetable flaxseeds breast cancer prevention breast cancer survival soy recent german study reported researchers found sunflower pumpkin seeds reduced breast cancer risk initially chalked association lignans seeds breast cancer survival lignan intake lignan lead didn t pan it s phytosterols found concentrated seeds optimal phytosterol source evidence phytosterols anticancer nutrients play role reducing breast cancer risk thought phytosterols lowered cholesterol phytosterols lower cholesterol cancer cholesterol increasing evidence demonstrates role cholesterol play development progression breast cancer cancer feeds cholesterol transformed cells ldl so called bad cholesterol it s capable stimulating growth huma

In [42]:
len(test_queries)

325

In [43]:
len(dev_queries)

325