##Datascience Homework 1##
Author: Aedan Wells

Date: 9/19/22

Brief: Program for a user to analyze specific portions of a large text file. The text file may be too large to hold in memory, so it is useful to have software that can process it efficiently. Analysis includes word queries, which report whether a word appears in the specified paragraphs and where, and paragraph queries, which return the number of words in the paragraph, its first and last words, and a random word.

This notebook accomplishes everything the homework specifies, including the requested functions, word/paragraph queries, parsing the text only once, the user prompt, and handling of illegal entries.

Notes:
*   When running this code, if you interrupt it from the keyboard instead of ending the program with the termination command, please rerun the first cell of the notebook. Otherwise, the global DICTIONARY will still hold the previous dictionary and produce strange output. If run as a normal Python file, this issue would not come up.
*   I solved the exponential-number problem posed in section 1 (123.45 and 1.2345E02 are both treated as 123.45). However, after seeing textfile2, I did not implement LaTeX-style numeric conversions (e.g. 5\times10^{-4}). I wanted to acknowledge the problems given to us but not handle every possible numerical representation, as that would be impossible.



In [None]:
#import random (used to pick the random word in paragraph queries)
import random as rand
#global word index, filled by analyze_paras and reset to {} when analyze finishes:
#maps each cleaned word -> {paragraph number -> list of word offsets in that paragraph}
DICTIONARY = {}

In [None]:
#clean up the word to make it common for application
#param word is the word being cleaned
#returns cleaned word
def clean_word(word):
  """Normalize a raw token so that equal words and equal numbers compare equal.

  The token is lowercased and surrounding punctuation is stripped. Tokens that
  contain a digit keep their interior periods (decimals); when such a token is
  written in exponent notation (e.g. 1.2345E02) it is converted to the plain
  decimal string of its value (e.g. 123.45).

  param word is the raw token being cleaned
  returns the cleaned token as a string
  """
  #same characters as before, with the backslash written explicitly instead
  #of relying on invalid escape sequences such as "\]"
  punctuation = ",$#?!<>\\][:/;()\"*&^%@"
  word = word.lower()

  #plain words: periods are punctuation too
  if not any(char.isdigit() for char in word):
    return word.strip("." + punctuation)

  #numbers: drop one sentence-ending period so decimals survive intact
  if word.endswith("."):
    word = word[:-1]
  word = word.strip(punctuation)

  #exponent notation: let float() parse it; tokens that merely contain an
  #'e' next to digits (e.g. "type01") are not numbers and are kept as text
  if "e" in word:
    try:
      value = float(word)
    except ValueError:
      return word
    #an overflowing exponent parses to infinity; keep the original text then
    if abs(value) != float("inf"):
      return str(value)
  return word

In [None]:
#generator function designed to return the content of the next paragraph from the file specified in its input.
#param filename is the name of the file being iterated on
#yields the next paragraph (a list of cleaned words) while paragraphs remain; stops at end of file
def gen_read_para(filename):
  """Generator yielding the paragraphs of a text file one at a time.

  A paragraph is a run of lines terminated by an empty line or by end of
  file. Each paragraph is yielded as a list of its words, in order, after
  each word has been passed through clean_word.

  param filename is the name of the file being iterated on
  yields a list of cleaned words for each non-empty paragraph
  """
  try:
    f = open(filename, 'r')
  except FileNotFoundError as missing_file_err:
    print(f'File is missing: *** {missing_file_err}***')
    exit(0)

  #the with-block closes the file at end of file and also when the generator
  #is closed or garbage-collected before the whole file has been consumed
  with f:
    para = []
    for line in f:
      if line == '\n':
        #an empty line ends the current paragraph, if one has content
        if para:
          yield para
          para = []
      else:
        #split the line into words and add their cleaned forms
        para.extend(clean_word(token) for token in line.split())
    #the last paragraph may end at EOF without a trailing empty line
    if para:
      yield para

In [None]:
#will return a list of words in the specified paragraph in their order of appearance
#param para_stream is the iterator from gen_read_para() function
#param para_num is the paragraph number we want the list of words from
#param prev_para_num last paragraph number to get the appropriate paragraph from the generator (no rereading)
#returns word_list, the list of words in the paragraph. Returns None if StopIteration is triggered
def get_words_in_para(para_stream, para_num, prev_para_num):
  """Advance the paragraph stream to paragraph para_num and return its words.

  The stream is moved forward (para_num - prev_para_num) paragraphs, so
  nothing is ever re-read.

  param para_stream is the iterator from gen_read_para()
  param para_num is the paragraph number whose words are wanted
  param prev_para_num is the paragraph number the stream was last advanced to
  returns the list of words of the paragraph reached, an empty list when no
  step is taken, or None when the stream runs out first
  """
  remaining = para_num - prev_para_num
  current = []
  while remaining > 0:
    try:
      current = next(para_stream)
    except StopIteration:
      #ran out of paragraphs before reaching the requested one
      return None
    remaining -= 1
  return current

In [None]:
#return all the words in all the paragraphs in the input list
#param para_stream is the iterator from gen_read_para() function
#param para_num_list is the list of paragraph numbers being analyzed
#returns para_dict, a dictionary of each paragraph pointing to a list of the words in the paragraph in order
def get_words_in_all_paras(para_stream, para_num_list):
  """Collect the words of every requested paragraph in a single forward pass.

  Paragraph numbers are 0-based (the stream position starts at -1). The
  requested numbers are de-duplicated and visited in ascending order so the
  stream only ever moves forward; a repeated or out-of-order number would
  otherwise produce a non-positive step and record an empty word list.

  param para_stream is the iterator from gen_read_para()
  param para_num_list is the list of paragraph numbers being analyzed
  returns para_dict, a dictionary of each paragraph number found in the stream
  pointing to the list of its words in order
  """
  para_dict = {}
  last_para = -1
  for para in sorted(set(para_num_list)):
    #negative numbers cannot name a paragraph
    if para < 0:
      continue
    tmp = get_words_in_para(para_stream, para, last_para)
    #None means the stream is exhausted, so no later paragraph exists either
    if tmp is None:
      break
    para_dict[para] = tmp
    last_para = para
  return para_dict

In [None]:
#Creates a global dictionary DICTIONARY in which every word w in the paragraphs 
#exist pointing to what paragraph they are in and where
#param para_stream is the iterator from gen_read_para() function
#param para_num_list is the list of paragraph numbers being analyzed
#returns paragraph_dict which is a dictionary of paragraph number pointing to the text in the paragraph
def analyze_paras(para_stream, para_num_list):
  """Index every word of the requested paragraphs into the global DICTIONARY.

  After the call, DICTIONARY[word][paragraph] is the list of offsets
  (0-based positions within the paragraph) at which word occurs.

  param para_stream is the iterator from gen_read_para()
  param para_num_list is the list of paragraph numbers being analyzed
  returns paragraph_dict, a dictionary of paragraph number pointing to the
  list of words in that paragraph
  """
  global DICTIONARY
  paragraph_dict = get_words_in_all_paras(para_stream, para_num_list)
  for para_number, words in paragraph_dict.items():
    for position, token in enumerate(words):
      #create the word entry and the per-paragraph offset list on first sight
      per_paragraph = DICTIONARY.setdefault(token, {})
      per_paragraph.setdefault(para_number, []).append(position)
  return paragraph_dict

In [None]:
#performs the $ w word query. Print what paragraphs they are in and where if it is in the document
#param word is the word being queried
#param word_query_count is the dictionary that holds how much a word is searched
def word_query(word, word_query_count):
  """Perform the $ w word query and print the result.

  Prints how many times the word has been queried, then either every
  paragraph/offset at which it occurs or "Word not found".

  param word is the word being queried
  param word_query_count is the dictionary that holds how often each word has
  been queried; it is updated in place
  """
  #DICTIONARY keys were produced by clean_word, so the query has to be
  #normalized the same way or e.g. "This" could never match "this"
  word = clean_word(word)

  #count this query (words never seen before start at 0)
  word_query_count[word] = word_query_count.get(word, 0) + 1
  print("Word has been queried " + str(word_query_count[word]) + " times")

  if word not in DICTIONARY:
    print("Word not found")
    print("\n")
    return

  print("The word has been found! " + word + " was found in the following paragraphs and offsets:")
  for para, offsets in DICTIONARY[word].items():
    for offset in offsets:
      print("Paragraph " + str(para) + " at offset " + str(offset))
  print("\n")

In [None]:
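#performs the # j paragraph query. Prints the paragraph's word count along with its first, last, and a random word
#param para_num is the paragraph number being queried
#param para_num_list is the list of paragraph numbers
#param paragraph_dict is the dictionary of paragraph number pointing to the list of words in that paragraph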
def para_query(para_num, para_num_list, paragraph_dict):
  """Perform the # j paragraph query and print the result.

  Prints the word count of the paragraph followed by its first word, last
  word and one randomly chosen word, each labelled with its 1-based position.

  param para_num is the paragraph number being queried (int or digit string)
  param para_num_list is the list of paragraph numbers that were analyzed
  param paragraph_dict is the dictionary of paragraph number pointing to the
  list of words in that paragraph
  """
  para_key = int(para_num)
  #the paragraph must have been requested and actually found in the file
  if (para_key not in para_num_list) or (para_key not in paragraph_dict):
    print("Paragraph not in provided list")
    print("\n")
    return

  word_list = paragraph_dict[para_key]
  count = len(word_list)
  if count == 0:
    print("Paragraph is empty")
    print("\n")
    return

  #prefer a word strictly between the first and last; paragraphs of one or
  #two words have no interior word, so any word may be picked there
  if count > 2:
    r = rand.randint(1, count - 2)
  else:
    r = rand.randint(0, count - 1)

  first_word = word_list[0]
  last_word = word_list[-1]
  random_word = word_list[r]
  #r is a 0-based index; print r + 1 so it matches the 1-based first/last labels
  print(str(count) + ", 1: '" + first_word + "', " + str(count) + ": '" + last_word + "',  " + str(r + 1) + ": '" + random_word + "'\n")

In [None]:
#creates the word_query_count dictionary for each word
#returns word_query_count dictionary all initiated to 0
def init_word_query_count():
  """Build the query counter for every word currently in DICTIONARY.

  returns word_query_count, a new dictionary with each indexed word set to 0
  """
  return {indexed_word: 0 for indexed_word in DICTIONARY}

In [None]:
#will initiate the analysis and fill the global dictionary / dictionaries.
#param file_name is the file name
#param para_num_list is the list of paragraph numbers being analyzed
def analyze(file_name, para_num_list):
  """Index the requested paragraphs, then answer user queries until '/'.

  Accepted entries are "$ word" (word query), "# number" (paragraph query)
  and "/" (terminate). Anything else, including an empty line or a command
  without its argument, is reported as improper input.

  param file_name is the file name
  param para_num_list is the list of paragraph numbers being analyzed
  """
  global DICTIONARY
  #grab para stream
  para_stream = gen_read_para(file_name)
  try:
    #fill the global dictionary and set up the per-word query counters
    paragraph_dict = analyze_paras(para_stream, para_num_list)
    word_query_count = init_word_query_count()

    entry = input('Enter analysis sequence: ')
    while entry != '/':
      tmp = entry.split()
      #a query needs a command symbol followed by an argument
      if len(tmp) < 2 or tmp[0] not in ('$', '#'):
        print("Improper input")
      elif tmp[0] == '$':
        word_query(tmp[1], word_query_count)
      elif tmp[1].isdigit():
        para_query(tmp[1], para_num_list, paragraph_dict)
      else:
        print("Paragraph needs to be given in number term")
      entry = input('Enter analysis sequence: ')
  finally:
    #runs on normal termination and on interruption alike, so a later run
    #never starts from a stale index
    para_stream.close()
    DICTIONARY = {}

In [None]:
#cleans up the paragraph number list. Sorts it, removes duplicates
#param para_num_list is user given list
#returns a cleaned up para_num_list
def clean_para_list(para_num_list):
  """Return the paragraph numbers without duplicates, in ascending order.

  param para_num_list is the user-given list; it is not modified
  returns a new, sorted, duplicate-free list
  """
  unique_numbers = set(para_num_list)
  return sorted(unique_numbers)

In [None]:
def main():
  """Prompt for the file and paragraph numbers, then start the analysis.

  Paragraph numbers are read line by line until an empty line is entered;
  tokens that are not made up of digits are ignored.
  """
  filename = input('Enter the filename to be analyzed: ')

  #collect paragraph numbers until the user submits an empty line
  requested = []
  line = input('Enter paragraph numbers: ')
  while line != '':
    requested.extend(int(token) for token in line.split() if token.isdigit())
    line = input('Enter paragraph numbers:')

  #sort and de-duplicate, then hand over to the interactive analysis
  analyze(filename, clean_para_list(requested))
  print("Terminated")

In [None]:
#start the interactive program only when this code is executed directly
#(script or notebook cell), not when it is imported as a module
if __name__ == "__main__":
  main()

Enter the filename to be analyzed: textfile2.txt
Enter paragraph numbers: 1
Enter paragraph numbers:
Enter analysis sequence: $ this
Word has been queried 1 times
The word has been found! this was found in the following paragraphs and offsets:
Paragraph 1 at offset 7
Paragraph 1 at offset 36


Enter analysis sequence: /
Terminated
