# Introduction
This module is designed to do the post-processing for OWR.

When a document (PDF/doc/docx) is read and its words are extracted, the letters come out as they appear in the document, in upper or lower case. However, the OWR CRNN model is designed to return only upper-case letters, hence this post-processing module is designed for upper-case text.


In [1]:
# Import Libraries
import numpy as np
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from nltk import word_tokenize
import re
from collections import Counter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def get_courseCode(text):
  '''
  Find the Course Code that precedes the LEVEL / CREDITS header words.

  Input:  Page text
  Output: (code, positions)
            code      - the matching word, or None when nothing matches
            positions - word indexes of LEVEL and CREDITS; the list can be
                        empty, or hold only the LEVEL index, when the pair
                        was not found
  Logic:  1. Join digits separated by a single space, then split into words
          2. Scan forward for LEVEL followed by CREDITS fewer than 3 words
             later (comparison ignores case)
          3. Walk backwards from LEVEL and take the first word that
                a. contains a 6 character token
                b. starts with 2 letters
                c. ends with 2 digits
  '''
  max_gap = 3
  level_word, credits_word = 'LEVEL', 'CREDITS'

  # "1 5" -> "15": digits split by a space are glued back together
  tokens = word_tokenize(re.sub(r'(\d) (\d)', r'\1\2', text))

  found = []
  for idx, token in enumerate(tokens):
    upper = token.upper()
    if upper == level_word.upper():
      found.append(idx)
    if upper == credits_word.upper() and found:
      found.append(idx)

    if len(found) > 1:
      # Accept the pair only when the two words are close together
      if found[1] - found[0] < max_gap:
        break
      found = []
    elif found and idx - found[0] > max_gap:
      # LEVEL was seen too long ago without CREDITS: forget it
      found = []

  # All three shape checks must hit on the same word
  shape_checks = (
      re.compile(r"\b\w{6}\b"),             # word with 6 positions
      re.compile(r"\b^[a-zA-Z]{2}\w+"),     # start with 2 characters
      re.compile(r"\w+\d{2}$"),             # end with 2 numbers
  )

  code = None
  if found:
    for candidate in reversed(tokens[:found[0] + 1]):
      if all(check.search(candidate) for check in shape_checks):
        code = candidate
        break

  return (code, found)

In [3]:
def get_courseTitle(text, code, pos):
  '''
  Return the Course Title: the words between the Course Code and LEVEL.

  Prerequisite: get_courseCode
  Input:  Page text, Course Code and Position of Level and Credit
  Output: Course Title (words joined by a space), or None when there is
          no word between the two positions
  Logic:  1. Start right after the first occurrence of the Course Code;
             word 0 is used as the anchor when the code is not in the text
          2. Stop right before pos[0], the Level position
  Note:   an empty pos list is padded in place with a 0, i.e. the list
          passed in by the caller is modified.
  '''
  # Same digit clean-up as get_courseCode so the word indexes line up
  tokens = word_tokenize(re.sub(r'(\d) (\d)', r'\1\2', text))

  code_at = tokens.index(code) if code in tokens else 0

  if not pos:
    pos.append(0)

  # The range is empty whenever the code does not come before LEVEL
  title_words = [tokens[i] for i in range(code_at + 1, pos[0])]

  return ' '.join(title_words) if title_words else None

In [4]:
def get_courseLevel(text, pos):
  '''
  Return the Course Level: the first whole-number word at or after LEVEL.

  Prerequisite: get_courseCode
  Input:  Page text and Position of Level and Credit (pos[0] is used as
          the index of the first word to inspect)
  Output: Level as int; 0 when pos is empty or no number is found before
          the AIM / AIMS heading
  Logic:  1. Inspect the words from pos[0] onwards
          2. A word starting with a digit that converts to int is the level
          3. Give up at the AIM / AIMS heading

  NOTE(review): get_courseCode computes pos on text whose digit-separating
  spaces were removed, while this function splits the raw text, so the
  indexes can differ when such spaces exist - confirm with the caller.
  '''
  if not pos:
    # LEVEL was never located, so there is nothing to anchor the search on
    return 0

  # Both spellings, as in the other extractors of this module
  stop_words = ('AIM', 'AIMS')
  starts_with_digit = re.compile(r"\b^\d{1,2}")

  for word in word_tokenize(text)[pos[0]:]:
    if starts_with_digit.search(word):
      try:
        return int(word)
      except ValueError:
        pass  # e.g. "5th": keep looking at the following words

    if word.upper() in stop_words:
      break

  return 0

In [5]:
def get_owr_courseLevelAndCredid(wordList):
  '''
  Extract the Course Level and Credits from a list of text lines.

  Input:  List of text lines; every line is split into words on its own
  Output: (level, credit) as ints, 0 for a value that was not found
  Logic:  1. Locate the LEVEL and CREDITS words of each line
          2. Level is read from the words after LEVEL (up to CREDITS when
             both are on the line); credits from the words after CREDITS
          3. Before the int conversion the upper-case letters
             S, A, L, O, I, Z, E, B are replaced by 5, 4, 1, 0, 1, 2, 3, 3
          4. The line holding AIM / AIMS is the last line processed
  '''
  stop_words = ('AIM', 'AIMS')
  letters_to_digits = str.maketrans('SALOIZEB', '54101233')
  leading_digits = re.compile(r"\b^\d{1,2}")   # start with 2 numbers

  def first_number(candidates, fallback):
    # First candidate that is an int after the letter replacement
    for candidate in candidates:
      fixed = candidate.translate(letters_to_digits)
      if leading_digits.search(fixed):
        try:
          return int(fixed)
        except ValueError:
          pass
    return fallback

  level, credit = 0, 0
  for line in wordList:
    tokens = word_tokenize(line)

    level_at = credits_at = -1
    reached_aim = False
    for idx, token in enumerate(tokens):
      upper = token.upper()
      if upper == 'LEVEL':
        level_at = idx
      elif upper == 'CREDITS':
        credits_at = idx
      if upper in stop_words:
        reached_aim = True
        break

    # extract level and credits; a value found earlier is kept when this
    # line yields nothing
    if level_at != -1:
      level_stop = credits_at if credits_at != -1 else len(tokens)
      level = first_number(tokens[level_at + 1:level_stop], level)
    if credits_at != -1:
      credit = first_number(tokens[credits_at + 1:], credit)

    if reached_aim:
      break

  return level, credit

In [6]:
def get_courseCredit(text, pos):
  '''
  Return the Course Credits: the first whole-number word at or after CREDITS.

  Prerequisite: get_courseCode
  Input:  Page text and Position of Level and Credit (pos[1] is used as
          the index of the first word to inspect)
  Output: Credits as int; 0 when pos holds fewer than two positions or no
          number is found before the AIM / AIMS heading
  Logic:  1. Inspect the words from pos[1] onwards
          2. A word starting with a digit that converts to int is the credit
          3. Give up at the AIM / AIMS heading

  NOTE(review): get_courseCode computes pos on text whose digit-separating
  spaces were removed, while this function splits the raw text, so the
  indexes can differ when such spaces exist - confirm with the caller.
  '''
  if len(pos) < 2:
    # get_courseCode returns fewer than two positions when the
    # LEVEL / CREDITS pair was not found; pos[1] would raise IndexError
    return 0

  # Both spellings, as in the other extractors of this module
  stop_words = ('AIM', 'AIMS')
  starts_with_digit = re.compile(r"\b^\d{1,2}")

  for word in word_tokenize(text)[pos[1]:]:
    if starts_with_digit.search(word):
      try:
        return int(word)
      except ValueError:
        pass  # e.g. "15pts": keep looking at the following words

    if word.upper() in stop_words:
      break

  return 0

In [7]:
def get_courseTutorHrs(text):
  '''
  Return the tutor-directed learning hours.

  Input:  Page text
  Output: Hours as int, 0 when no keyword or no number is found
  Logic:  1. Remember the last LEARNING / TUTOR / TUTOR-DIRECTED /
             TUTORDIRECTED word that appears before the AIM / AIMS heading
          2. From that word onwards, the first word starting with a digit
             that converts to int is the result
          3. Stop searching at the AIM / AIMS heading
  '''
  # search text
  search_text = ('LEARNING', 'TUTOR', 'TUTOR-DIRECTED', 'TUTORDIRECTED')
  stop_text = ('AIM', 'AIMS')
  starts_with_digit = re.compile(r"\b^\d{1,2}")

  # extract words
  words = word_tokenize(text)

  anchor = None
  for i, item in enumerate(words):
    upper = item.upper()
    if upper in search_text:
      anchor = i
    if upper in stop_text:
      break

  if anchor is None:
    return 0

  for word in words[anchor:]:
    if starts_with_digit.search(word):
      try:
        return int(word)
      except ValueError:
        pass  # starts with a digit but is not a number: keep looking

    # Membership test: comparing the word with the whole tuple using ==
    # is never true, which let the search run into the text after AIM
    if word.upper() in stop_text:
      break

  return 0

In [8]:
def get_courseSelfHrs(text):
  '''
  Return the self-directed learning hours.

  Input:  Page text
  Output: Hours as int, 0 when no keyword or no number is found
  Logic:  1. Remember the last LEARNING / SELF / SELF-DIRECTED /
             SELFDIRECTED word that appears before the AIM / AIMS heading
          2. From that word onwards, the first word starting with a digit
             that converts to int is the result
          3. When that gives 0, repeat the search starting one word
             earlier (the value may be written in front of the keyword)
          4. Every search stops at the AIM / AIMS heading
  '''
  # search text
  search_text = ('LEARNING', 'SELF', 'SELF-DIRECTED', 'SELFDIRECTED')
  stop_text = ('AIM', 'AIMS')
  starts_with_digit = re.compile(r"\b^\d{1,2}")

  # extract words
  words = word_tokenize(text)

  anchor = None
  for i, item in enumerate(words):
    upper = item.upper()
    if upper in search_text:
      anchor = i
    if upper in stop_text:
      break

  if anchor is None:
    return 0

  def first_number_from(start):
    # First int-convertible word from index start up to the AIM heading
    for word in words[start:]:
      if starts_with_digit.search(word):
        try:
          return int(word)
        except ValueError:
          pass  # starts with a digit but is not a number: keep looking

      # Membership test: comparing the word with the whole tuple using ==
      # is never true, which let the search run into the text after AIM
      if word.upper() in stop_text:
        break
    return 0

  self_hrs = first_number_from(anchor)
  if self_hrs == 0:
    # Clamp at 0: an index of -1 would wrap around to the last word
    self_hrs = first_number_from(max(anchor - 1, 0))

  return self_hrs

In [9]:
def get_owr_courseTutorAndSelfHrs(wordList):
  '''
  Extract tutor-directed and self-directed hours from a list of text lines.

  Input:  List of text lines; every line is split into words on its own
  Output: (tutor_hours, self_hours) as ints, 0 for a value not found
  Logic:  1. A line is a tutor line when it holds LEARNING, TUTOR,
             TUTOR-DIRECTED or TUTORDIRECTED, and a self line when it
             holds SELF, SELF-DIRECTED or SELFDIRECTED
          2. The hours are the last word of the line that is an int after
             the upper-case letters S, A, L, O, I, Z, E, B are replaced by
             5, 4, 1, 0, 1, 2, 3, 3
          3. The line holding AIM / AIMS is the last line processed
  '''
  tutor_words = ('LEARNING', 'TUTOR', 'TUTOR-DIRECTED', 'TUTORDIRECTED')
  self_words = ('LEARNING', 'SELF', 'SELF-DIRECTED', 'SELFDIRECTED')
  stop_words = ('AIM', 'AIMS')

  letters_to_digits = str.maketrans('SALOIZEB', '54101233')
  leading_digits = re.compile(r"\b^\d{1,2}")   # start with 2 numbers

  def last_number(tokens):
    # Right-most token that is an int after the letter replacement
    for token in reversed(tokens):
      fixed = token.translate(letters_to_digits)
      if leading_digits.search(fixed):
        try:
          return int(fixed)
        except ValueError:
          pass
    return None

  tutor_hours, self_hours = 0, 0
  for line in wordList:
    tokens = word_tokenize(line)

    tutor_line = self_line = reached_aim = False
    for token in tokens:
      upper = token.upper()
      # LEARNING is in both lists; the tutor list is checked first
      if upper in tutor_words:
        tutor_line = True
      elif upper in self_words:
        self_line = True
      if upper in stop_words:
        reached_aim = True
        break

    if tutor_line or self_line:
      hours = last_number(tokens)
      if hours is not None:
        if tutor_line:
          tutor_hours = hours
        if self_line:
          self_hours = hours

    if reached_aim:
      break

  return tutor_hours, self_hours


In [10]:
def get_endPoint4Exceptions(sentence):
  '''
  Return the offset at which one of the known end phrases starts.

  Input:  Sentence text
  Output: Offset of the first listed phrase that occurs in the upper-cased
          sentence (list order decides, not the position in the sentence);
          len(sentence) when none of the phrases occurs
  '''
  end_markers = ('LEARNING OUTCO MES', 'LEAR NING OUTCOMES',
                 'IDENTIFY AND EXPLAIN CONTEMPORARY', 'CRITICALLY ANALYSE ETHICAL ISSUES')

  upper_sentence = sentence.upper()
  for marker in end_markers:
    offset = upper_sentence.find(marker)
    if offset != -1:
      return offset

  return len(sentence)

def get_courseAim(text):
  '''
  Return the Course Aim text.

  Input:  Page text
  Output: Aim text, one line per sentence, without leading or trailing
          newlines; empty string when "AIM" does not occur
  Logic:  1. Start at the first sentence containing "AIM" (substring
             search, ignoring case)
          2. When a newline follows within 6 characters of that match,
             start at the newline instead
          3. Copy sentences until "LEARNING OUTCOMES", or one of the
             phrases known to get_endPoint4Exceptions, is met; the
             sentence holding the end phrase is cut at that phrase
  '''
  start_marker = 'AIM'
  end_marker = 'LEARNING OUTCOMES'

  collected = ''
  begin = -1
  started = False
  for sentence in sent_tokenize(text):
    if not started:
      begin = sentence.upper().find(start_marker)
      if begin == -1:
        continue
      started = True
      newline_offset = sentence[begin:].find('\n')
      if newline_offset != -1 and newline_offset < 6:
        begin += newline_offset

    finish = sentence.upper().find(end_marker)
    finished = finish != -1
    if not finished:
      finish = get_endPoint4Exceptions(sentence)
      finished = finish != len(sentence)

    collected += sentence[begin:finish] + '\n'

    # Every sentence after the first one is copied from its beginning
    begin = 0
    if finished:
      break

  return collected.strip("\n")

In [11]:
def get_owr_aim(wordList):
  '''
  Return the Course Aim from a list of text lines.

  Input:  List of text lines
  Output: The lines between the AIM / AIMS heading and the next end
          heading, joined by a space; empty string when there is no
          AIM / AIMS heading
  Logic:  1. The start is the last line equal to AIM or AIMS (ignoring
             case) seen before the end heading
          2. The end is the first end heading after the start; end
             headings before the start are ignored
          3. Without an end heading the aim runs to the last line
  '''
  search_start = ('AIM', 'AIMS')
  search_end = ('LEARNING OUTCOMES', 'LEARNING OUTCO MES', 'LEAR NING OUTCOMES',
                'IDENTIFY AND EXPLAIN CONTEMPORARY', 'CRITICALLY ANALYSE ETHICAL ISSUES')

  start_pos = -1
  # Default to "end of input": a -1 sentinel used as slice end would
  # silently drop the last line
  end_pos = len(wordList)
  for i, line in enumerate(wordList):
    upper = line.upper()
    if upper in search_start:
      start_pos = i
    elif start_pos != -1 and upper in search_end:
      end_pos = i
      break

  if start_pos == -1:
    return ''

  return ' '.join(wordList[start_pos + 1:end_pos])


In [12]:
def get_coursePrerequisite(text, course_credit, tutor_directed):
  '''
  Return the pre-requisite courses listed in front of the tutor hours.

  Input:  Page text, Course Credit and Tutor Directed hours
  Output: List of [course code, course title] pairs in the order they are
          met while reading backwards; empty list when nothing is found
  Logic:  1. Start at the first word equal to the tutor directed hours
             (the last word of the text when that value is absent)
          2. Read backwards, skipping header words, until a
             pre-requisite header word, the credit value or CREDITS is met;
             the very first word of the text is not examined
          3. Words ending with 2 digits open a course code; while the code
             is shorter than 6 characters the following words are joined
             to it; all other words belong to the title
          4. A code is complete when it contains a 6 character token,
             starts with 2 letters and ends with 2 digits
  '''
  ignore_text = ('LEARNING', 'HOURS', 'TUTOR', 'TUTOR-DIRECTED', 'TUTORDIRECTED',
                 'DIRECTED', '-DIRECTED', 'PRE-REQUISITES', '-REQUISITES',
                 'REQUISITES', 'PREREQUISITES', 'NONE', str(tutor_directed))

  stop_text = ('PRE-REQUISITES', '-REQUISITES', 'REQUISITES', 'PREREQUISITES',
               str(course_credit), 'CREDITS')

  # regular expression patterns for code
  six_positions = re.compile(r"\b\w{6}\b")           # word with 6 positions
  two_letters_first = re.compile(r"\b^[a-zA-Z]{2}\w+")  # start with 2 characters
  two_digits_last = re.compile(r"\w+\d{2}$")         # end with 2 numbers

  # extract words
  words = word_tokenize(text)
  tutor_word = str(tutor_directed)

  # Explicit start index instead of a leaked loop variable, which is
  # undefined when the text holds no words at all
  scan_from = len(words) - 1
  for i, item in enumerate(words):
    if item.upper() == tutor_word:
      scan_from = i
      break

  code = []
  title = []
  code_flag = False
  output = []
  for i in range(scan_from, 0, -1):
    upper = words[i].upper()

    if upper in stop_text:
      break

    if upper not in ignore_text:
      if two_digits_last.search(words[i]) or code_flag:
        code.append(words[i])
      else:
        title.append(words[i])

    if code:
      # Parts were collected back to front
      course_code = ''.join(reversed(code))

      if len(course_code) < 6:
        # Code still incomplete: the next words are code parts as well
        code_flag = True

      if (six_positions.search(course_code)
          and two_letters_first.search(course_code)
          and two_digits_last.search(course_code)):
        output.append([course_code, ' '.join(reversed(title))])
        code = []
        title = []
        code_flag = False

  return output

In [13]:
def get_owr_prerequisite(wordList):
  '''
  Return the pre-requisite courses from a list of text lines.

  Input:  List of text lines
  Output: List of [course code, course title] pairs; empty list when the
          pre-requisite header or the end header is not found
  Logic:  1. Find the line (or the word inside a line) holding a
             pre-requisite header and the line holding a tutor / learning
             hours header
          2. For every line from the start line up to, but excluding, the
             end line: the first word (or first two words) is the course
             code, the remaining words are the title
          3. In the course code, everything after the first 2 characters
             has the upper-case letters S, A, L, O, I, Z, E, B replaced by
             5, 4, 1, 0, 1, 2, 3, 3
  '''

  output = []

  search_start = ('PRE-REQUISITES', '-REQUISITES', 'REQUISITES', 'PREREQUISITES')
  search_end = ('LEARNING', 'HOURS', 'TUTOR', 'TUTOR-DIRECTED', 'TUTORDIRECTED',
                'DIRECTED', '-DIRECTED')

  start_pos = -1       # index of the first line to read
  start_sub_pos = -1   # index of the first word to read in that line
  end_pos = -1         # index of the line that ends the section
  for i in range(len(wordList)):
    # Whole line equals a header
    if wordList[i].upper() in search_start:
      start_pos =  i + 1
      start_sub_pos = 0
    elif wordList[i].upper() in search_end:
      end_pos = i

    # While one of the two positions is still missing, also look at the
    # single words of the line
    if start_pos == -1 or end_pos == -1 :
      words = word_tokenize(wordList[i])

      for j in range(len(words)):
        if words[j].upper() in search_start:
          # Header inside the line: reading starts right after it
          start_pos =  i
          start_sub_pos = j + 1
        if words[j].upper() in search_end:
          end_pos = i
        if start_pos != -1 and end_pos != -1:
          break

    if start_pos != -1 and end_pos != -1:
      break
  if start_pos != -1 and end_pos != -1:
    for i in range(start_pos, end_pos):
      words = word_tokenize(wordList[i])
      # Only the first line may start in the middle
      if i == start_pos:
        start = start_sub_pos
      else: start = 0

      title = []
      courseCode = ''
      tmpJ = -1   # index of a word already merged into the course code
      for j in range(start, len(words)):
        if j == tmpJ:
          continue

        if j == start:

          # These exact upper-case words mean the line holds no course
          if words[j] in ('NONE', 'NIL', 'LIL'):
            break

          courseCode = words[j]
          tmpJ = -1
          # A first word shorter than 4 characters is joined with the
          # next word, when there is one
          if len(words[j]) < 4 and (len(words) > j -1):
            if j+1 < len(words):
              courseCode = words[j] + words[j+1]
              tmpJ = j + 1
          # Joined code longer than 10 characters: keep the first word only
          if len(courseCode) > 10:
            courseCode = words[j]
            tmpJ = -1

          # The first 2 characters are kept; letters in the rest of the
          # code are replaced by digits
          courseCode = courseCode[0:2] + courseCode[2:].replace('S','5')
          courseCode = courseCode[0:2] + courseCode[2:].replace('A','4')
          courseCode = courseCode[0:2] + courseCode[2:].replace('L','1')
          courseCode = courseCode[0:2] + courseCode[2:].replace('O','0')
          courseCode = courseCode[0:2] + courseCode[2:].replace('I','1')
          courseCode = courseCode[0:2] + courseCode[2:].replace('Z','2')
          courseCode = courseCode[0:2] + courseCode[2:].replace('E','3')
          courseCode = courseCode[0:2] + courseCode[2:].replace('B','3')
        else: title.append(words[j])
      if len(courseCode) > 0:
        output.append([courseCode, ' '.join(title)])


  return output

In [14]:
def get_courseLearningOutcome(text):
  '''
  Return the numbered Learning Outcomes.

  Input:  Page text
  Output: List of [number, outcome text] pairs; empty list when the
          heading is missing or no numbered outcome follows it
  Logic:  1. The section starts at the first pair of neighbouring heading
             words (LEARNING, OUTCOMES and their split spellings)
          2. It ends at the first stop word after the heading, or at the
             end of the text when there is none
          3. A single-digit word opens a new outcome; a "." directly
             after the digit is dropped; all other words are outcome text
  '''
  # search text
  search_text = ('LEARNING', 'OUTCOMES', 'OUTCO', 'MES', 'LEAR', 'NING')
  stop_text = ('INDICATIVE', 'CONTENT', 'CONTE', 'NT', '•', 'ASSESSMENTS',
               'METHOD', 'WEIGHTING', 'WEIGHT', 'OUTCOME/S', 'OUTCOMES')

  # regular expression patterns for code
  single_digit = re.compile(r"\d{1}$")  # end with 1 numbers

  def is_outcome_number(word):
    return len(word) == 1 and single_digit.search(word) is not None

  # extract words
  words = word_tokenize(text)

  # Stop one word early so the look-ahead at i + 1 stays inside the list
  start = None
  for i in range(len(words) - 1):
    if words[i].upper() in search_text and words[i + 1].upper() in search_text:
      start = i + 1
      break

  if start is None:
    # No heading: nothing to extract (used to fail with IndexError)
    return []

  # Without a stop word the section runs to the end of the text
  end = len(words)
  for i in range(start + 1, len(words)):
    if words[i].upper() in stop_text:
      end = i
      break

  learning_outcome = []
  index = -1
  outcome = []
  for i in range(start, end):
    if is_outcome_number(words[i]):
      if index != -1:
        learning_outcome.append([index, ' '.join(outcome)])
        outcome = []
      index = int(words[i])
    elif index != -1:
      # Skip the "." that directly follows the outcome number
      if not (is_outcome_number(words[i - 1]) and words[i] == '.'):
        outcome.append(words[i])

  if index != -1:
    learning_outcome.append([index, ' '.join(outcome)])

  return learning_outcome



In [15]:
def get_owr_courseLearningOutcome(wordList):
  '''
  Return the Learning Outcomes from a list of text lines.

  Input:  List of text lines
  Output: List of [number, outcome text] pairs, numbered 1, 2, ... in the
          order met; [[0, '']] when no outcome is found
  Logic:  1. The section starts 2 lines after the last line equal to a
             known Learning Outcomes heading
          2. It ends at the first later line that holds a stop word
          3. A line whose first word has at most 2 characters opens a new
             outcome and that word is dropped; all other words are
             outcome text
  '''
  headings = ('LEARNING OUTCOMES', 'LEARNING OUTCO MES', 'LEAR NING OUTCOMES', 'LEARNING OUTCONES', 'OIES')
  stop_words = ('INDICATIVE', 'CONTENT', 'CONTE', 'NT', '•', 'ASSESSMENTS' ,
                'METHOD',  'WEIGHTING',  'WEIGHT', 'OUTCOME/S', 'OUTCOMES')

  body_start = None
  body_end = None
  for line_no, line in enumerate(wordList):
    if line.upper() in headings:
      body_start = line_no + 2

    if body_start is not None and line_no > body_start:
      if any(token.upper() in stop_words for token in word_tokenize(line)):
        body_end = line_no
        break

  first = body_start if body_start is not None else 0
  last = body_end if body_end is not None else 0

  output = []
  number = 0
  buffer = []
  for line in wordList[first:last]:
    tokens = word_tokenize(line)
    if tokens and len(tokens[0]) <= 2:
      # Short first word = outcome marker: close the previous outcome
      number += 1
      if number > 1:
        output.append([number - 1, ' '.join(buffer)])
        buffer = []
      buffer.extend(tokens[1:])
    else:
      buffer.extend(tokens)

  output.append([number, ' '.join(buffer)])

  return output

In [16]:
def get_courseAssessments(text):
  '''
  Return the assessment table of the course.

  Input:  Page text
  Output: (method, weight, learning) - three lists with one entry per
          table line that holds a "%" word:
            method   - the words in front of the weight, joined by a space
            weight   - the word directly in front of "%"
            learning - list of learning outcome numbers (as strings)
                       found after "%"; "ALL" gives '99' and a range such
                       as 1-3 gives '1', '2', '3'
  Logic:  1. The table starts after the last line of the table header
             (ASSESSMENTS, METHOD, WEIGHTING, ... words)
          2. It ends at the line holding SUCCESSFUL / COMPLETION / OF /
             COURSE words
          3. Lines without "%", or with nothing in front of it, are skipped
  '''
  method = []
  weight = []
  learning = []

  SpecialChar = '-'

  # search text
  text_array = text.splitlines()

  search_text = ('ASSESSMENTS', 'METHOD', 'WEIGHTING', 'WEIGHT', 'LEARNING', 'OUTCOME/S', 'OUTCOMES')
  end_text = ('SUCCESSFUL', 'COMPLETION', 'OF', 'COURSE')

  start_pos = []
  end_pos = []

  for i, item in enumerate(text_array):

    words = word_tokenize(item)
    for word in words:
      if word.upper() in search_text:
        # Header words are collected only while they stay within 3 lines
        # of each other; otherwise the collection restarts
        if max(start_pos, default = 0) + 3 > i:
          start_pos.append(i)
        else:
          start_pos.clear()
          start_pos.append(i)

      if word.upper() in end_text:
        if max(end_pos, default = 0) + 2 > i and len(start_pos) > 2:
          end_pos.append(i)
        else:
          end_pos.clear()
          end_pos.append(i)

      if len(end_pos) > 3:
        break
    if len(end_pos) > 3:
      break

  assess_start = max(start_pos, default = 0) + 1
  assess_end = max(end_pos, default = 0)

  for line in text_array[assess_start:assess_end]:
    # Close up "1 - 3" to "1-3" and split the commas off the numbers
    words = word_tokenize(line.replace(" -", "-").replace("- ", "-").replace(",", " , "))
    try:
      percentagePos = words.index('%')
    except ValueError:
      continue
    if percentagePos == 0:
      # No weight in front of "%": index -1 would pick the last word
      continue

    method.append(' '.join(words[:percentagePos - 1]))
    weight.append(words[percentagePos - 1])

    learningOutcome = []
    for word in words[percentagePos + 1:]:
      if word.upper() == 'ALL':
        learningOutcome.append('99')
      elif SpecialChar in word:
        bounds = word.split(SpecialChar)
        try:
          low, high = int(bounds[0]), int(bounds[1])
        except ValueError:
          # Hyphenated text such as "self-study" is not a number range
          continue
        learningOutcome.extend(str(number) for number in range(low, high + 1))
      elif word.isdigit():
        learningOutcome.append(word)
    learning.append(learningOutcome)

  return (method, weight, learning)

In [17]:
def get_courseCompletion(text):
  '''
  Return the "Successful completion of course" requirements.

  Input:  Page text
  Output: List of requirement texts (words joined by a space); holds one
          empty string when nothing is found
  Logic:  1. Find the assessment table header, then the lines with the
             SUCCESSFUL / COMPLETION / OF / COURSE words
          2. The section starts on the line after those words and ends at
             the line holding RESOURCES, or at the end of the text
          3. A line starting with the bullet character closes the current
             requirement, provided some text was collected already
  '''
  output = []
  specialChar = '•'

  # search text
  text_array = text.splitlines()

  search_text = ('ASSESSMENTS', 'METHOD', 'WEIGHTING', 'WEIGHT', 'LEARNING', 'OUTCOME/S', 'OUTCOMES')
  search_text2 = ('SUCCESSFUL', 'COMPLETION', 'OF', 'COURSE')
  # One-element tuple: with a plain string the "in" test below is a
  # substring test, so words such as OUR or SOURCE would end the section
  end_text = ('RESOURCES',)

  start_pos = []
  mid_pos = []
  end_pos = []

  for i, item in enumerate(text_array):

    for word in word_tokenize(item):
      upper = word.upper()

      if upper in search_text:
        # Restart the collection when the header words are spread over
        # lines that are too far apart
        if max(start_pos, default = 0) + 3 <= i:
          start_pos.clear()
        start_pos.append(i)

      if upper in search_text2 and len(mid_pos) <= 3:
        if not (max(mid_pos, default = 0) + 2 > i and len(start_pos) > 2):
          mid_pos.clear()
        mid_pos.append(i)

      if upper in end_text and len(mid_pos) > 2:
        end_pos.append(i)

      if len(mid_pos) > 3 and len(end_pos) > 0:
        break
    if len(mid_pos) > 3 and len(end_pos) > 0:
      break

  if len(end_pos) == 0:
    end_pos.append(len(text_array))

  assess_start = max(mid_pos, default = 0) + 1
  assess_end = max(end_pos, default = 0)

  outtext = []
  for sentance in text_array[assess_start:assess_end]:
    words = word_tokenize(sentance)
    for i in range(len(words)):
      if specialChar in words[i] and i == 0 and len(outtext) > 0:
        output.append(' '.join(outtext))
        outtext.clear()
      else:
        tmpStr = words[i].strip()
        if len(tmpStr) > 0:
          outtext.append(tmpStr)
  output.append(' '.join(outtext))

  return output

In [18]:
def most_common_value(lst):
    '''
    Return the value that occurs most often in lst.

    Input:  List of hashable values
    Output: Most frequent value; among equally frequent values the one
            met first wins; 0 for an empty list
    '''
    ranked = Counter(lst).most_common(1)
    return ranked[0][0] if ranked else 0

In [19]:
def get_owr_courseCompletion(text):
  '''
  Return the "Successful completion of course" requirements from OWR lines.

  Input:  Sequence of text lines (it is iterated and sliced line by line;
          it is not split here)
  Output: List of requirement texts (words joined by a space); holds one
          empty string when nothing is found
  Logic:  1. Find the assessment table header, then the lines with the
             SUCCESSFUL / COMPLETION / OF / COURSE words
          2. The section starts on the line after the most frequent line
             index recorded for those words and ends at the largest line
             index recorded for an end word, or at the end of the input
          3. A line whose first word contains 'IP' closes the current
             requirement, provided some text was collected already
  '''
  text_array = text

  output = []
  # NOTE(review): presumably how the bullet character is read by OWR -
  # confirm
  specialChar = 'IP'

  # search text
  #text_array = text.splitlines()

  search_text = ('ASSESSMENTS' , 'METHOD',  'WEIGHTING',  'WEIGHT', 'LEARNING',  'OUTCOME/S', 'OUTCOMES')
  search_text2 = ('SUCCESSFUL', 'COMPLETION', 'OF', 'COURSE')
  end_text = ('RESOURCES', 'NELY', 'NEW', 'ZEALAND', 'CERTIFICATE', 'INFORMATION', 'TECHNOLOGY', 'ESSENTIALS')

  start_pos = []   # line indexes of the assessment table header words
  mid_pos = []     # line indexes of the "successful completion" words
  end_pos = []     # line indexes of the end words

  for i, item in enumerate(text_array):



    # Whole line equals a header word: keep collecting while the lines
    # are less than 2 apart, otherwise restart the collection
    if item.upper() in search_text:
      if max(start_pos, default = 0) + 2 > i:
        start_pos.append(i)
      else:
        start_pos.clear()
        start_pos.append(i)

    words = word_tokenize(item)
    for word in words:

      # Single words are used for the header only while no header line
      # has been recorded yet
      if len(start_pos) < 1 and word.upper() in search_text:
        if max(start_pos, default = 0) + 2 > i:
          start_pos.append(i)
        else:
          start_pos.clear()
          start_pos.append(i)

      if word.upper() in search_text2:
        # At most 4 entries are recorded
        if not(len(mid_pos) > 3):
          # Accept only close to the previous entry and after a header
          # was found; otherwise restart with this line
          if max(mid_pos, default = 0) + 2 > i and len(start_pos) > 0:
            mid_pos.append(i)
          else:
            mid_pos.clear()
            mid_pos.append(i)

      # End words count only once more than 2 "completion" entries exist
      if word.upper() in end_text:
        if len(mid_pos) > 2:
          end_pos.append(i)


      if len(mid_pos) > 3 and len(end_pos) > 0 :
        break
    if len(mid_pos) > 3 and len(end_pos) > 0:
      break

  # No end word found: read to the end of the input
  if len(end_pos) == 0:
    end_pos.append(len(text_array))

  assess_start = most_common_value(mid_pos) + 1
  assess_end = max(end_pos, default = 0)

  outtext = []
  for sentance in text_array[assess_start:assess_end]:
    words = word_tokenize(sentance)
    for i in range(len(words)):
      # Marker as first word: close the requirement collected so far
      if specialChar in words[i] and  i == 0 and len(outtext) > 0:
        output.append(' '.join(outtext))
        outtext.clear()
      else :
        tmpStr = words[i]
        tmpStr = tmpStr.strip()
        if len(tmpStr) > 0:outtext.append(tmpStr)
  # The text collected last is always appended, even when it is empty
  output.append(' '.join(outtext))

  return output