In [1]:
import re
from functools import reduce
RESULT_FILE = 'query_results.txt'
WORD_FILE_DIR = 'word_files'

In [15]:
def file_reader(file):
    with open(file) as infile:
        for line in infile:
            print(line)

In [16]:
def write_result_to_file(result):
    # write to file here.
    file = open(RESULT_FILE, 'a')
    file.write(result +'\n')
    file.close()

In [26]:
def find_occurrence_in_articles(word):
    """
    Reads in the word file and grabs all the article ids.
    Returns all article ids for the word in a list.
    If no word found is found None is returned
    """
    try:
        article_ids = open(WORD_FILE_DIR + '/' + word + '.txt', 'r').read().split('\n')
        return article_ids[:len(article_ids)-1]
    except FileNotFoundError:
        print('No file found for word')
        return None

In [18]:
def find_common_articles(word_list):
    """
    Takes all words in query are argument and find all articels for each word.
    Returns a list of articles where all words intersects.
    """
    article_ids_list = []
    for word in word_list:
        occurences = find_occurrence_in_articles(word)
        if occurences is not None:
            article_ids_list.append(occurences)
        
    
    return find_intersection_of_all_article_ids(article_ids_list)

In [19]:
def find_intersection_of_all_article_ids(article_ids_list):
    """
    Returns the intersection of all the article ids
    """
    result = reduce(set.intersection, map(set, article_ids_list))
    return result

In [20]:
def query(string, *args):
    """
    Takes a string from the file as first parameter.
    The args parameter is the query it self in the format of:
    ['cat', [2, 4], 'hat']
    """
    if args is None:
        return
    
    for query in args:
        # seperate query into parameters
        start_word = query[0]
        end_word = query[2]
        lower_bound = query[1][0]
        upper_bound = query[1][1]
        
        # build regexs
        start_word_regex = r"\b" + re.escape(start_word) + r"\b"
        end_word_regex = r"\b" + re.escape(end_word) + r"\b"
        
        # find substring positions
        start_word_pos = re.search(start_word_regex, string)
        end_word_pos = re.search(end_word_regex, string)

        # calc distance between words
        char_between = end_word_pos.start() - start_word_pos.end()

        if char_between >= lower_bound and char_between <= upper_bound:
            result = string[start_word_pos.start(): end_word_pos.end()]
            # write result to file
            write_result_to_file(result)

In [21]:
test_string = 'I have a really nice cat in hat at home'
test_query = ['cat', [2, 4], 'hat']
test_query2 = ['I', [2, 6], 'a']
query(test_string, test_query, test_query2)

In [22]:
def strip_query_intervals(query):
    """
    Will remove any number of substrings from the string which is surrounded by [ ].
    """
    stripped_version = query
    if query.find('[') != -1:
        stripped_version = query[0:query.find('[')] + query[query.find(']')+1:len(query)]
        return strip_query_intervals(stripped_version)
    else:
        return stripped_version
        

In [23]:
def return_unique_query_words(query):
    """
    Making sure the list of ids only contain every word once so the same word wont be checked twice.
    Converting the list to set.
    """
    query_words = strip_query_intervals(query)

    return set(query_words.split())
    

In [24]:
def main_function(query):
    """
    This is the main function. Firstly the query is being stripped for the intervals and chopped into
    substrings containing only 1 word each. Then the articles occur in is found and intersected with the other
    words articles. Finally the function loops through all the intersected articles to see if the query holds.
    """
    all_word_article_ids = []
    
    for word in return_unique_query_words(query):
        all_word_article_ids.append(find_occurrence_in_articles(word))
    
    print(all_word_article_ids)
    common_article_ids = find_common_articles(all_word_article_ids)
    print(common_article_ids)
    
    #Run the query function for each article with the original query: query(article_body,query)
        

In [27]:
main_function('cat [2, 4] hat')

No file found for word
[['89', '12', 'test_xml/Wikipedia-20170926101621', 'test_xml/Wikipedia-20170926101621', 'test_xml/Wikipedia-20170926135213'], None]


TypeError: must be str, not list

In [14]:
if __name__ == "__main__":
    import sys
    query_string = (sys.argv[1])

    main_function(query_string)

FileNotFoundError: [Errno 2] No such file or directory: 'word_files/-f.txt'