In [20]:
# Authors: Rishiraj Kanungo and Bubba Schultz

# import dependencies
import numpy as np
import pandas as pd
import string

In [21]:
def open_file():
    """
    Prompt the user for a file name until a file can be opened for reading.

    The '.txt' extension is appended whenever the entered name does not
    already end with it. On failure a message is printed and the user is
    prompted again.

    Params:
    None

    Returns:
    The file object, opened in text read mode. The caller is responsible
    for closing it.
    """

    extension = '.txt'

    while True:
        user_input = input("Please enter the name of the file that you would like to open: \n")

        # Only a trailing '.txt' counts as the extension. A plain substring
        # test would wrongly skip the append for names such as 'my.txtnotes'
        # or paths that merely contain '.txt' in a directory component.
        if not user_input.endswith(extension):
            user_input += extension

        # keep asking until a file can actually be opened
        try:
            return open(user_input, 'r')
        except OSError:
            print("Could not open/read", user_input)

In [22]:
def read_data(fp):
    """
    Build an index from each word to the set of line numbers it appears on.

    Every line is stripped of ASCII punctuation, lower-cased and split on
    whitespace. Tokens that are not purely alphabetic, or that are shorter
    than two characters, are ignored. Line numbers start at 1.

    Params:
    fp: file pointer (any iterable yielding lines of text works)

    Returns:
    dict mapping each kept word (str) to a set of line numbers (int)
    """

    # dict to be returned
    d = {}

    # the translation table is the same for every line, so build it once
    strip_punct = str.maketrans('', '', string.punctuation)

    for lineNumber, line in enumerate(fp, start=1):
        words = line.translate(strip_punct).lower().split()

        for word in words:
            # Filter while reading instead of calling words.remove() during
            # iteration: removing from the list being looped over skips the
            # next element, which let tokens such as the second of two
            # adjacent numbers or single letters slip into the index.
            if word.isalpha() and len(word) >= 2:
                d.setdefault(word, set()).add(lineNumber)

    return d
        

In [23]:
def find_cooccurance(D, inp_str):
    """
    Print the line numbers on which every word of inp_str occurs together.

    inp_str is normalised the same way the indexed text is: ASCII
    punctuation is removed, the text is lower-cased and split on white
    space, and tokens that are not purely alphabetic or are shorter than
    two characters are dropped (such tokens are never indexed).

    The result is the intersection of the line sets of all remaining
    words. If no words remain, if any word is missing from D, or if the
    words never share a line, "Lines: None" is printed. Otherwise the
    line numbers are printed in ascending order, separated by ", ".

    Params:
    D: dictionary returned by read_data() (word -> set of line numbers)
    inp_str: zero or more words separated by white space

    Returns:
    None; the result is written to standard output.
    """

    # preprocess string to align with textfile preprocessing
    cleaned = inp_str.translate(str.maketrans('', '', string.punctuation)).lower()
    words = [word for word in cleaned.split() if word.isalpha() and len(word) >= 2]

    # A word that was never indexed cannot co-occur with anything, so the
    # whole query has no matching line rather than silently ignoring it.
    if words and all(word in D for word in words):
        common_lines = set.intersection(*(set(D[word]) for word in words))
    else:
        common_lines = set()

    # echo the query exactly as the user typed it
    print("The co-occurance for:", inp_str)
    if common_lines:
        # sets are unordered; sort so the output is stable and readable
        print("Lines:", ", ".join(str(number) for number in sorted(common_lines)))
    else:
        print("Lines: None")

In [24]:
# main driver function
def main():
    """
    Interactive driver: build the word index from a user-chosen file, then
    answer co-occurrence queries until the user enters "q" or "Q".

    Params:
    None
    """

    # The file is only needed while the index is built, so close it as soon
    # as read_data() has consumed it instead of leaving it open for the
    # whole interactive session.
    with open_file() as fp:
        d = read_data(fp)

    while True:
        inp_str = input("Enter space-separated words: \n")
        # a lone "q" in either case ends the session
        if inp_str in ("q", "Q"):
            break
        find_cooccurance(d, inp_str)

In [25]:
# Start the interactive session only when this code runs as the main module.
if __name__ == "__main__":
    main()

Please enter the name of the file that you would like to open: 
xxxx
Could not open/read xxxx.txt
Please enter the name of the file that you would like to open: 
einstein.txt
Enter space-separated words: 
The
The co-occurance for: The
Lines: 3, 4, 7
Enter space-separated words: 
can't
The co-occurance for: can't
Lines: 6
Enter space-separated words: 
nature
The co-occurance for: nature
Lines: 2
Enter space-separated words: 
cat
The co-occurance for: cat
Lines: None
Enter space-separated words: 

The co-occurance for: 
Lines: None
Enter space-separated words: 
Q
