In [1]:
import re
from functools import reduce
# File that write_result_to_file() appends query results to.
RESULT_FILE = 'query_results.txt'
# Directory holding one '<word>.txt' file per indexed word (read by find_occurrence_in_articles()).
WORD_FILE_DIR = 'word_files'
# Directory holding one '<article id>.txt' file per article (read by main_function()).
ARTICLE_DIR = 'articles'

In [2]:
def write_result_to_file(result):
    """
    Append one result line to RESULT_FILE.

    result: the text to store; a newline is added after it.
    The file is opened in append mode, so earlier results are kept.
    """
    # The context manager closes the handle even if the write fails.
    with open(RESULT_FILE, 'a') as result_file:
        result_file.write(result + '\n')

In [3]:
def find_occurrence_in_articles(word):
    """
    Reads the word file ``WORD_FILE_DIR/<word>.txt`` and returns the article
    ids it lists, one per line, as a list of strings.

    Blank lines are skipped, and the last id is kept whether or not the file
    ends with a newline.

    Raises FileNotFoundError if there is no word file for ``word``.
    """
    with open(WORD_FILE_DIR + '/' + word + '.txt', 'r') as word_file:
        lines = (line.rstrip('\n') for line in word_file)
        # Drop empty entries instead of blindly cutting off the last element,
        # which lost a real id when the file had no trailing newline.
        return [article_id for article_id in lines if article_id]

In [4]:
def find_common_articles(word_list):
    """
    Looks up the article ids for every word of the query and returns the
    ids of the articles in which all of the words occur.
    """
    ids_per_word = [find_occurrence_in_articles(term) for term in word_list]
    return find_intersection_of_all_article_ids(ids_per_word)

In [5]:
def find_intersection_of_all_article_ids(article_ids_list):
    """
    Returns the set of article ids that appear in every id collection
    of article_ids_list.
    """
    id_sets = [set(ids) for ids in article_ids_list]
    return set.intersection(*id_sets)

In [15]:
def query(string, *args):
    """
    Prints every span of ``string`` that satisfies one of the given queries.

    string: the text to search.
    args:   any number of queries, each in the format
            ['cat', [2, 4], 'hat'], i.e. [start_word, [lower, upper], end_word].

    A span matches when start_word is followed by end_word (both as whole
    words) and the number of characters between the end of start_word and
    the beginning of end_word lies within [lower, upper]. The matching span,
    from start_word through end_word, is printed.

    The text is consumed while searching: a printed span is cut out of the
    working copy, and a start word that does not lead to a match is cut out
    on its own. Later queries in the same call therefore see the shortened
    text. Nothing is returned.
    """
    for single_query in args:
        # separate query into parameters
        start_word = single_query[0]
        end_word = single_query[2]
        lower_bound = single_query[1][0]
        upper_bound = single_query[1][1]

        # An empty word would give a zero-length match and never make progress.
        if not start_word or not end_word:
            continue

        # whole-word patterns
        start_word_regex = re.compile(r"\b" + re.escape(start_word) + r"\b")
        end_word_regex = re.compile(r"\b" + re.escape(end_word) + r"\b")

        while True:
            # Loop on the whole-word match itself; a plain substring test
            # would keep looping on e.g. 'cat' inside 'category'.
            start_word_pos = start_word_regex.search(string)
            if start_word_pos is None:
                break

            # Only an end word located after the start word can close a span.
            end_word_pos = end_word_regex.search(string, start_word_pos.end())

            if end_word_pos is not None:
                char_between = end_word_pos.start() - start_word_pos.end()
                in_range = lower_bound <= char_between <= upper_bound
            else:
                in_range = False

            if in_range:
                print(string[start_word_pos.start(): end_word_pos.end()])
                # cut out the reported span so it is not found again
                string = string[:start_word_pos.start()] + string[end_word_pos.end():]
            else:
                # This start word cannot match; drop it and try the next one.
                # Every pass removes text, so the loop always terminates.
                string = string[:start_word_pos.start()] + string[start_word_pos.end():]

In [None]:
# Smoke test for query(): two queries are run against one sample sentence.
test_string = 'I have a really nice cat cat in hat at home cat in hat'
# Query format: [start_word, [lower, upper], end_word]; the bounds are
# compared with the number of characters between the two words.
test_query = ['cat', [2, 4], 'hat']
test_query2 = ['I', [2, 6], 'a']
query(test_string, test_query, test_query2)

In [7]:
def strip_query_intervals(query):
    """
    Removes every substring of the query that is surrounded by [ ],
    brackets included, and returns the remaining text.

    Each '[' is paired with the first ']' that follows it. A '[' without a
    closing ']' (or a ']' without an opening '[') is left in place; the
    earlier recursive version never terminated on such input.
    """
    return re.sub(r'\[[^\]]*\]', '', query)

In [8]:
def return_unique_query_words(query):
    """
    Returns the words of the query as a set, with the [ ] intervals removed,
    so that each word is only looked up once.
    """
    words = strip_query_intervals(query).split()
    return set(words)

In [9]:
def testIfNumber(x):
    try:
        int(x)
        return True
    except:
        return False

In [10]:
def _parse_int(token):
    """Returns token converted to int, or None if it is not an integer."""
    try:
        return int(token)
    except ValueError:
        return None


def preprocess_query(query):
    """
    Turns a query string such as 'cat [2, 4] noise' into the list form
    ['cat', [2, 4], 'noise'].

    Brackets and commas only act as separators. Two consecutive integers
    become one [lower, upper] interval; every other token is kept as a word.
    A single integer with no integer after it is dropped.

    The resulting list is printed and returned.
    """
    # Turning the punctuation into spaces lets '[2,4]' and '[2, 4]' parse
    # alike, and split() ignores repeated spaces.
    tokens = re.sub(r'[\[\],]', ' ', query).split()

    res_query = []
    idx = 0
    while idx < len(tokens):
        lower = _parse_int(tokens[idx])
        if lower is None:
            res_query.append(tokens[idx])
            idx += 1
            continue

        # Bounds check: the query may end on a number.
        upper = _parse_int(tokens[idx + 1]) if idx + 1 < len(tokens) else None
        if upper is None:
            idx += 1
        else:
            res_query.append([lower, upper])
            # Consume both numbers so adjacent intervals are not mixed up.
            idx += 2

    print(res_query)
    return res_query

In [13]:
def main_function(query_param):
    """
    This is the main function. First the intervals are stripped from the
    query and it is split into its unique words. Then the articles each word
    occurs in are looked up and intersected. Finally the query is run against
    the text of every article that contains all of the words.

    query_param: the query string, e.g. 'cat [2, 4] noise'.

    If a word file is missing, or the query contains no words, a message is
    printed and the function returns without searching any article.
    """
    try:
        word_list = return_unique_query_words(query_param)
        common_articles = find_common_articles(word_list)
    except FileNotFoundError:
        print('File not found!')
        return
    except TypeError:
        # raised when there is nothing to intersect (query without words)
        print('Could not find intersection of empty list')
        return

    if not common_articles:
        return

    # The parsed query is the same for every article, so build it once.
    preprocessed_query = preprocess_query(query_param)

    # Run the query function for each article. Sorting makes the output order
    # reproducible; set iteration order is arbitrary.
    for article in sorted(common_articles):
        with open(ARTICLE_DIR + '/' + article + '.txt', 'r') as article_file:
            text = article_file.read()
        query(text, preprocessed_query)


In [None]:
# Example run: look up the articles that contain both 'cat' and 'noise' and
# run the interval query against each of them.
main_function('cat [2, 4] noise')

['cat', [2, 4], 'noise']


In [None]:
if __name__ == "__main__":
    import sys

    # The query string is expected as the first command-line argument.
    # NOTE(review): inside a Jupyter kernel sys.argv presumably holds the
    # kernel's own launch arguments rather than a query - confirm this entry
    # point is only meant for the exported script.
    if len(sys.argv) > 1:
        main_function(sys.argv[1])
    else:
        print('Missing query: pass the query string as the first argument',
              file=sys.stderr)