# Installations

In [1]:
!pip install --upgrade spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!python -m spacy info

2023-05-09 23:51:47.666237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[1m

spaCy version    3.5.2                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.10.147+-x86_64-with-glibc2.31
Python version   3.10.11                       
Pipelines        en_core_web_sm (3.5.0)        



In [3]:
import spacy
print(spacy.__version__)

3.5.2


In [4]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [5]:
!pip install stanfordcorenlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanfordcorenlp
  Downloading stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl (5.7 kB)
Installing collected packages: stanfordcorenlp
Successfully installed stanfordcorenlp-3.9.1.1


In [6]:
!pip install language-tool-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting language-tool-python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1


# Imports

In [7]:
import re
import spacy
from spacy.matcher import Matcher
import uuid
import os
import json
from stanfordcorenlp import StanfordCoreNLP

In [8]:
# Download the Stanford CoreNLP model
!wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
!unzip stanford-corenlp-full-2018-10-05.zip

--2023-05-09 23:53:07--  http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip [following]
--2023-05-09 23:53:07--  https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip [following]
--2023-05-09 23:53:08--  https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... conn

# User Stories Generation Tool


### Helper Function

In [73]:
# The scentence can consist of:
# Subject (nsubj) - Verb - Object (dobj), example: [The dog chased the cat] "Active Sentence"
# Object (nsubjpass) - Verb - Subject (pobj), example: [The book was written by the author] "Passive Sentence"

def get_subject_verb_object_from_sentence(token, subject_list, verb_list, object_list):
  # If the token is a subject, add it to the subject_list
  if(token.dep_ == "nsubj"):
    subject = token
    subject_list.append(subject)

  # If the token is a prepositional object, add it to the subject_list
  elif(token.dep_ == "pobj"):
    subject = token
    subject_list.append(subject)

  # If the token is a direct object, add it to the object_list
  elif(token.dep_ == "dobj"):
    object = token
    object_list.append(object)

  # If the token is a passive subject, add it to the object_list
  elif(token.dep_ == "nsubjpass"):
    object = token
    object_list.append(object)

  # If the token is a verb, add it to the verb_list
  elif(token.pos_ == "VERB"):
    verb = token
    verb_list.append(verb)

In [74]:
def create_subject_verb_object_list(doc, sentence, subject_list, verb_list, object_list, subject_verb_object_list):
  # Check if the sentence is passive
  passive_sentence = False
  for object in object_list:
    if(object.dep_ == "nsubjpass"):
      passive_sentence = True
  if(passive_sentence == False):
    for subject in subject_list:
      if(subject.dep_ == "pobj"):
        subject_list.remove(subject)


  # Step #1: iterate over each verb in the verb list
  for verb in verb_list:
    subject_verb_object_dictionary = {"subject": "a user", "verb": verb, "object": "", "isNegative_sentence": False}

    # Iterate over the children of the verb 
    for child in verb.children:
      # if the child is a negation, set the isNegative flag in the dictionary to True
      if(child.dep_ == "neg"):
        subject_verb_object_dictionary["isNegative_sentence"] = True

    # verb_subtree_list = list(verb.subtree)
    # verb_index = verb_subtree_list.index(verb)

    # get the index of the first word in the sentence 
    first_word_in_sentence_index_in_doc = sentence[0].i
    # get the index of the last word in the sentence   
    last_word_in_sentence_index = sentence[-1].i 
    # get the index of the verb in the doc
    verb_index_in_doc = verb.i


    # The sentence is passive
    if(passive_sentence == True):
      #print("The sentence is passive")
      verb_ancestor = ""
      # Extracting the subject of the passive sentence
      for token in doc[verb_index_in_doc:last_word_in_sentence_index+1]:
        if(token in subject_list and token.dep_ == "pobj" and ((list(token.ancestors))[0]).lemma_ == "by"):
          token_ancestors_list = list(token.ancestors)
          for ancestor in token_ancestors_list:
            if(ancestor.pos_ == "VERB"):
              verb_ancestor = ancestor
              break
          if(str(verb_ancestor) != ""): 
            if(verb_ancestor == verb):
              subject_verb_object_dictionary["subject"] = token
              break
            elif(verb in list(verb_ancestor.conjuncts)):
              subject_verb_object_dictionary["subject"] = token
              break 
            else:
              break

      # Extracting the object of the passive sentence
      temp_reversed_list = list(doc[first_word_in_sentence_index_in_doc:verb_index_in_doc+1])
      for token in temp_reversed_list[::-1]:
        if(token in object_list and token.dep_ == "nsubjpass"):
          token_ancestors_list = list(token.ancestors)
          for ancestor in token_ancestors_list:
            if(ancestor.pos_ == "VERB"):
              verb_ancestor = ancestor
              break
          if(str(verb_ancestor) != ""): 
            if(verb_ancestor == verb):
              subject_verb_object_dictionary["object"] = token
              break
            elif(verb in list(verb_ancestor.conjuncts)):
              subject_verb_object_dictionary["object"] = token
              break 
            else:
              break

    # The sentence is active
    elif(passive_sentence == False):
      #print("The sentence is active")
      verb_ancestor = ""

      # Extracting the object of the active sentence  
      for token in doc[verb_index_in_doc:last_word_in_sentence_index+1]:
        if(token in object_list and token.dep_ == "dobj"):
          token_ancestors_list = list(token.ancestors)
          for ancestor in token_ancestors_list:
            if(ancestor.pos_ == "VERB"):
              verb_ancestor = ancestor
              break
          if(str(verb_ancestor) != ""): 
            if(verb_ancestor == verb):
              subject_verb_object_dictionary["object"] = token
              break
            elif(verb in list(verb_ancestor.conjuncts)):
              subject_verb_object_dictionary["object"] = token
              break           
            else:
              break

      # Extracting the subject of the active sentence 
      be_ancestor = False   
      temp_reversed_list = list(doc[first_word_in_sentence_index_in_doc:verb_index_in_doc+1])
      for token in temp_reversed_list[::-1]:
        if(token in subject_list and token.dep_ == "nsubj"):
          token_ancestors_list = list(token.ancestors)
          for ancestor in token_ancestors_list:
            if(ancestor.pos_ == "VERB"):
              verb_ancestor = ancestor
              break
            elif(ancestor.lemma_ == "be"):
              be_ancestor = True
              break  
          if(str(verb_ancestor) != ""):            
            if(verb_ancestor == verb):
              subject_verb_object_dictionary["subject"] = token
              break
            elif(be_ancestor == True):
              subject_verb_object_dictionary["subject"] = token
              break    
            elif(verb in list(verb_ancestor.conjuncts)):
              subject_verb_object_dictionary["subject"] = token
              break        
            else:
              break

    # If the sentence has an object then it is added to the subject_verb_object_list 
    if(str(subject_verb_object_dictionary["object"]) != ""):
      subject_verb_object_list.append(subject_verb_object_dictionary)

In [75]:
def set_object_pronouns_in_title(subject, object):
  # substitute "his" with the appropriate pronoun
  object = re.sub(r'\bhis\b', subject, object)
  # substitute "her" with the appropriate pronoun
  object = re.sub(r'\bher\b', subject, object)
  # substitute "their" with the appropriate pronoun
  object = re.sub(r'\btheir\b', subject, object)
  # remove extra spaces and parentheses
  object = object.replace(" ,", ",").replace("( ", "(").replace(" )", ")")
  # return the modified object
  return object

In [76]:
def remove_unclosed_brackets(sentence):
  open_bracket_index = sentence.find('(')
  # check if the unclosed bracket is found
  if open_bracket_index != -1:
    # search for the closing bracket
    close_bracket_index = sentence.find(')', open_bracket_index)  
    # check if a closing bracket is found
    if close_bracket_index != -1:
      # if a closing bracket is found, keep the original sentence
      corrected_sentence = sentence  
    else:
      # if no closing bracket is found, remove the unclosed bracket
      corrected_sentence = sentence[:open_bracket_index] + sentence[open_bracket_index+1:]  
  else:
    # if no unclosed bracket is found, keep the original sentence
    corrected_sentence = sentence

  return corrected_sentence  

In [77]:
def fix_unclosed_brackets(sentence):
  open_bracket_index = sentence.find('(')
  # check if the unclosed bracket is found
  if open_bracket_index != -1:
    # search for the closing bracket
    close_bracket_index = sentence.find(')', open_bracket_index)  
    # check if a closing bracket is found
    if close_bracket_index != -1:
      # if a closing bracket is found, keep the original sentence
      corrected_sentence = sentence  
    else:
      # if no closing bracket is found, add the missing bracket
      corrected_sentence = sentence + ")" 
  else:
    # if no unclosed bracket is found, keep the original sentence
    corrected_sentence = sentence

  return corrected_sentence  

In [78]:
def check_if_subject_incorrect(doc, sentence, subject, verb, subject_list):
  # get the index of the first word in the sentence
  first_word_in_sentence_index = sentence[0].i
  # get the index of the verb
  verb_index = verb.i
  # check if the subject is "a user"
  if(subject == "a user"):
    # look at each token from the first word in the sentence up to and including the verb
    for token in doc[first_word_in_sentence_index:verb_index + 1]:
      # check if the token is in the subject list, is a verb, and the next word is "to" and the word after that is the verb we're looking for
      if((token in subject_list) and (token.nbor().pos_ == "VERB") and (token.nbor().nbor().lemma_ == "to") and (token.nbor().nbor().nbor() == verb)):
        # if the conditions are met, set the subject to the text of the token
        subject = token.text
        # return the corrected subject
        return subject
  # if the subject doesn't need correction, return the original subject
  return subject

In [79]:
def set_subject_and_pronouns(subject, object):
  # set default pronoun to "I"
  pronoun = "I"

  subject = subject.lower()
  
  # check if subject is a specific pronoun and set subject and pronoun accordingly
  if(subject == "i" or subject == "he" or subject == "she" or subject == "you"):
    subject = "a user"
    pronoun = "I"
  elif(subject == "we" or subject == "they" or subject == "users"):
    subject = "users"
    pronoun = "We"

  # replace possessive pronouns with the correct form
  object = re.sub(r'\bhis\b', 'my', object)
  object = re.sub(r'\bher\b', 'my', object)
  object = re.sub(r'\btheir\b', 'our', object)
  
  # remove extra spaces and parentheses
  object = object.replace(" ,", ",").replace("( ", "(").replace(" )", ")")
  
  # return updated subject, pronoun, and object
  return subject, pronoun, object

In [80]:
def deal_with_both_in_subject_if_found(doc, sentence, subject):
  # check if "both" is found in the subject
  both_found_in_subject = False
  subject_list = subject.split()
  for word in subject_list:
    if(word.lower() == "both"):
      both_found_in_subject = True
  if(both_found_in_subject == True):
    # get the index of the first token in the sentence
    first_token_in_sentence_index = sentence[0].i
    # get the index of the token with lemma "both" in the doc
    for token in sentence:
      if(token.lemma_ == "both"):
        both_index_in_doc = token.i
    # find the conjuncts in the subject and replace "both" with the conjuncts
    for token in doc[first_token_in_sentence_index:both_index_in_doc]:
      if(token.dep_ == "conj"):
        # get the raw text of the conjuncts
        conjuncts_raw_tokens = []
        for item in token.conjuncts[0].subtree:
          conjuncts_raw_tokens.append(item.text)
        conjuncts_raw_text = " ".join(conjuncts_raw_tokens)
        # get the index of "both" in the subject
        both_index_in_subject = subject_list.index('both')
        # replace "both" with the conjuncts in the subject
        subject = f"{' '.join(subject_list[:both_index_in_subject + 1])} ({conjuncts_raw_text.lower()}) {' '.join(subject_list[both_index_in_subject + 1:])}"
  return subject


In [81]:
def deal_with_the_sentence_after_object(doc, sentence, subject, verb, object, original_object, subject_verb_object_list):
  # print("object: {}".format(object)) 
  # get the subtree of the original object
  original_object_subtree = list(original_object.subtree)
  # get the last item in the original object's subtree    
  last_item_in_object_subtree = original_object_subtree[-1]  
  # get the index of the first word in the sentence 
  first_word_in_sentence_index = sentence[0].i 
  # get the index of the last word in the sentence   
  last_word_in_sentence_index = sentence[-1].i    
  # iterate through the tokens in the sentence
  for token in doc[first_word_in_sentence_index:last_word_in_sentence_index+1]:  
    # if the current token matches the last item in the original object's subtree 
    if(token.text == last_item_in_object_subtree.text):
      if(token in list(verb.subtree)):    
        # set the index of the last word in the object to the index of the current token
        last_word_in_object_index = token.i
        break   

  # print(doc[last_word_in_object_index:last_word_in_sentence_index+1]) 
  for token in doc[last_word_in_object_index:last_word_in_sentence_index+1]:
    # check if the token is an ADP (preposition)
    if(token.pos_ == "ADP" and token.lemma_ != "to" and token.lemma_ != "so" and token.lemma_ != "that"):
      # get the subtree of the verb
      verb_subtree_of_object = list(verb.subtree)
      if(original_object in verb_subtree_of_object):
        # get the index of the original object in the verb subtree
        object_in_verb_subtree_index = verb_subtree_of_object.index(original_object)
        verb_subtree_of_object_list = verb_subtree_of_object[object_in_verb_subtree_index:]
      else:
        verb_subtree_of_object_list = verb_subtree_of_object
      
      # loop through the tokens in the verb subtree that come after the original object
      for adp_item in verb_subtree_of_object_list:
        # check if the token is an preposition
        if(adp_item.pos_ == "ADP"):
          # create a list of the raw tokens in the preposition subtree
          adp_item_raw_tokens = []
          for item in adp_item.subtree:
            adp_item_raw_tokens.append(item.text)
          # join the raw tokens to create the raw text of the preposition
          adp_item_raw_text = " ".join(adp_item_raw_tokens)
          # remove extra spaces and parentheses
          adp_item_raw_text = adp_item_raw_text.replace(" ,", ",").replace("( ", "(").replace(" )", ")")
          # loop through the ancestors of the preposition token to find the verb
          for item in list(adp_item.ancestors):
            if(item.pos_ == "VERB"):
              adp_verb = item
              break
          # check if the verb of the preposition is the same as the input verb
          if(verb == adp_verb):
            # check if the raw text of the preposition is not already in the object
            if(adp_item_raw_text.lower() not in object.lower()):
              # check if the subject is not in the raw text of the preposition
              if(subject.lower() not in adp_item_raw_text.lower()):
                # add the raw text of the preposition to the object
                # print("adp_item_raw_text: {}".format(adp_item_raw_text))
                object = f"{object} {adp_item_raw_text}"
                return object
            else:
              return object


    # Check if the token is the "to" lemma
    elif(token.lemma_ == "to"): 
      # print("token: {}".format(token)) 
      # Initialize a list to store the tokens of the "to" subtree
      to_sentence_raw_tokens = [] 
      # Loop over the subtree of the head of the "to" token
      for item in token.head.subtree: 
        # Append the text of each token to the list
        to_sentence_raw_tokens.append(item.text)
        # Join the list of tokens into a string
        to_sentence_raw_text = " ".join(to_sentence_raw_tokens)
      # Split the string into a list of words 
      to_sentence_raw_text_list = to_sentence_raw_text.split() 
      # Find the index of the "to" word in the list
      to_index = to_sentence_raw_text_list.index("to") 
      # Append the words after "to" to the object
      object = f"{object} {' '.join(to_sentence_raw_text_list[to_index:])}" 
      # Return the modified object string
      return object 

    # Check if the token is the conjunction "so" and it is not followed by "that"
    elif(token.lemma_ == "so" and token.nbor().text != "that"):  
      so_sentence_raw_tokens = []
      # Iterate over the subtree of the head of the "so" token
      for item in token.head.subtree:  
        so_sentence_raw_tokens.append(item.text)
        so_sentence_raw_text = " ".join(so_sentence_raw_tokens)
      # Convert the raw text to a list of words
      so_sentence_raw_text_list = so_sentence_raw_text.split()
      # Get the index of the "so" conjunction  
      so_index = so_sentence_raw_text_list.index("so")  
      # Append the words after the "so" conjunction to the object phrase
      object = f"{object} {' '.join(so_sentence_raw_text_list[so_index:])}"  
      # Return the modified object string      
      return object  


    elif(token.lemma_ == "that"):
      # create an empty list to store the raw tokens of the sentence after 'that'
      that_sentence_raw_tokens = []
      # iterate over the subtree of the head of the token
      for item in token.head.subtree:
        # append the text of each item in the subtree to the list of raw tokens
        that_sentence_raw_tokens.append(item.text)
        # join the list of raw tokens with spaces to create a string of the entire sentence
        that_sentence_raw_text = " ".join(that_sentence_raw_tokens)
      # split the string of the entire sentence into a list of individual words
      that_sentence_raw_text_list = that_sentence_raw_text.split()
      # find the index of 'that' in the list of individual words
      that_index = that_sentence_raw_text_list.index("that")
      # check if the word before 'that' is 'so'
      if(that_sentence_raw_text_list[that_index - 1].lower() == "so"):
        # if it is, add 'so' and the words after 'so' to the object
        object = f"{object} {' '.join(that_sentence_raw_text_list[that_index - 1:])}"
      else:
        # if it's not, add 'that' and the words after 'that' to the object
        object = f"{object} {' '.join(that_sentence_raw_text_list[that_index:])}"
      # return the updated object
      return object

      
    # If the current token is an opening bracket and its head word is in the object
    elif(token.lemma_ == "("):
      if(str(token.head) in object):
        # Get the index of the opening bracket
        open_bracket_index = token.i
        # Find the closing bracket in the rest of the sentence
        for token in doc[open_bracket_index:last_word_in_sentence_index+1]:
          if(token.lemma_ == ")"):
            closed_bracket_index = token.i
            break
        # Get the span of tokens between the brackets
        brackets_span = doc[open_bracket_index:closed_bracket_index+1]
        # Convert the span into a list of tokens and then into a sentence
        brackets_span_list = [t.text for t in brackets_span]
        brackets_span_sentence = ' '.join(brackets_span_list)
        # Get the span of tokens inside the brackets
        inside_brackets_span = doc[open_bracket_index+1:closed_bracket_index]
        # Convert the inside span into a list of tokens and then into a sentence
        inside_brackets_span_list = [t.text for t in inside_brackets_span]
        inside_brackets_span_sentence = ' '.join(inside_brackets_span_list)
        inside_brackets_span_sentence = inside_brackets_span_sentence.replace(" ,", ",").replace("( ", "(").replace(" )", ")")
        # If the inside span sentence is not already in the object, add the brackets span sentence
        if(inside_brackets_span_sentence not in object):
          object = f"{object} {brackets_span_sentence}"
          # Clean up the object string by replacing multiple spaces and parentheses with single spaces and parentheses
          object = object.replace("  ", " ").replace("( ", "(").replace(" )", ")")
          return object
      # Exit the loop once the first opening bracket whose head is in the object is found
      break

  # If none of condtion above is satsified, return the original object string
  return object

In [82]:
# def add_missing_words_after_object(verb, object):
#   verb_subtree_list = [t.text for t in verb.subtree]
#   object_list = object.split()
#   print(verb_subtree_list)
#   print(object_list)

#   last_word_in_object = object_list[-1]
#   print(last_word_in_object)


#   last_word_in_object_index_in_verb_subtree_list = verb_subtree_list.index(last_word_in_object)
#   print(last_word_in_object_index_in_verb_subtree_list)

#   item_index = verb_subtree_list.index(verb_subtree_list[-1])
#   print(item_index)
#   for item in verb_subtree_list:
#     if(item == "." or item == ","):
#       item_index = verb_subtree_list.index(item)
#       print(item_index)

#   missing_words_list = verb_subtree_list[last_word_in_object_index_in_verb_subtree_list + 1 : item_index + 1]


#   print(verb_subtree_list[last_word_in_object_index_in_verb_subtree_list + 1 : item_index + 1])
#   missing_words_text = " ".join(verb_subtree_list[last_word_in_object_index_in_verb_subtree_list + 1 : item_index + 1])
#   object = f"{object} {missing_words_text}"
#   return object

In [83]:
def validate_subject_verb_object_list(doc, sentence, subject_list, verb_list, object_list, subject_verb_object_list, user_stories_components_list):

  for subject_verb_object_dict in subject_verb_object_list:

    subject, pronoun, verb, verb_in_title, object, object_in_title = "", "", "", "", "", ""

    subject = subject_verb_object_dict["subject"]

    if(str(subject) != "a user"):
      # If the subject consists of more than one token, join all the token to form one subject
      subject_raw_tokens = []
      for item in subject.subtree:
        subject_raw_tokens.append(item.text)
        subject_raw_text = " ".join(subject_raw_tokens)

      subject = subject_raw_text.replace(" ,", ",").replace("( ", "(").replace(" )", ")")

    object = subject_verb_object_dict["object"]

    # If the object consists of more than one token, join all the token to form one object
    object_raw_tokens = []
    for item in object.subtree:
      object_raw_tokens.append(item.text)
      object_raw_text = " ".join(object_raw_tokens)

    object = object_raw_text.replace(" ,", ",").replace("( ", "(").replace(" )", ")")

    object_in_title = set_object_pronouns_in_title(subject, object)
    object_in_title = remove_unclosed_brackets(object_in_title)
    object_in_title = object_in_title.rstrip(',')


    verb_in_title = subject_verb_object_dict['verb'].lemma_


    subject = check_if_subject_incorrect(doc, sentence, subject, subject_verb_object_dict['verb'], subject_list)
    subject, pronoun, object = set_subject_and_pronouns(subject, object)
    subject = deal_with_both_in_subject_if_found(doc, sentence, subject)


    object = remove_unclosed_brackets(object)
    object = deal_with_the_sentence_after_object(doc, sentence, subject, subject_verb_object_dict['verb'], object, subject_verb_object_dict['object'], subject_verb_object_list)
    object = fix_unclosed_brackets(object)
    # object = add_missing_words_after_object(subject_verb_object_dict['verb'], object)
    object = object.rstrip(' .').replace("  ", " ").rstrip(' ,')


    verb = subject_verb_object_dict['verb'].lemma_


    isNegative_sentence = subject_verb_object_dict["isNegative_sentence"]

    user_stories_components_dictionary = {"subject": subject, "pronoun": pronoun, "verb": verb, "verb_in_title": verb_in_title, "object": object, "object_in_title": object_in_title, "isNegative": isNegative_sentence}

    user_stories_components_list.append(user_stories_components_dictionary)

In [84]:
# This function takes a parsed document and a sentence within it as input
def check_if_negative_sentence(doc, sentence):
  # Set the variable 'negative_sentence' to False initially
  negative_sentence = False
  # Initialize an empty list 'verbs_finder'
  verbs_finder = []
  # Iterate over each token in the input sentence
  for token in sentence:
    # Check if the token is a negation modifier
    if(token.dep_ == "neg"):
      # If it is a negation modifier, iterate over each word in the sentence up to the negation modifier
      for word in doc[sentence[0].i:token.i]:
        # If the word is a verb, append it to 'verbs_finder'
        if(word.pos_ == "VERB"):
          verbs_finder.append(word)
      # If there are no verbs in 'verbs_finder', it means that the negation modifier is negating the entire sentence
      if(len(verbs_finder) == 0):
        # In this case, set 'negative_sentence' to True
        negative_sentence = True
  # Return the value of 'negative_sentence'
  return negative_sentence

In [85]:
def generate_user_stories_from_sentence(doc, sentence, subject_verb_object_list, user_stories_components_list, user_stories_list):

  print("subject_verb_object_list: {}".format(subject_verb_object_list))

  print("user_stories_components_list: {}".format(user_stories_components_list))

  negative_sentence = check_if_negative_sentence(doc, sentence)

  for user_stories_components_dict in user_stories_components_list:
    # generate a unique id using uuid
    id = uuid.uuid4()
    user_stories_dictionary = {"userStoryTitle": "", "userStoryDescription": "", "userStoryID": id}

    subject = user_stories_components_dict["subject"]
    pronoun = user_stories_components_dict["pronoun"]
    verb = user_stories_components_dict["verb"]
    verb_in_title = user_stories_components_dict["verb_in_title"]
    object = user_stories_components_dict["object"]
    object_in_title = user_stories_components_dict["object_in_title"]

    user_stories_dictionary["userStoryTitle"] = f"{verb_in_title.title()} {object_in_title.title()}"

    
    if(user_stories_components_dict["isNegative"] == False and negative_sentence == False):
      user_stories_dictionary["userStoryDescription"] = f"As {subject}, {pronoun} want to be able to {verb} {object}."

    elif(user_stories_components_dict["isNegative"] == False and negative_sentence == True):
      user_stories_dictionary["userStoryDescription"] = f"As {subject}, {pronoun} should not be able to {verb} {object}."

    elif(user_stories_components_dict["isNegative"] == True):
      user_stories_dictionary["userStoryDescription"] = f"As {subject}, {pronoun} should not be able to {verb} {object}."

    user_stories_list.append(user_stories_dictionary)
    
  print("user_stories_list: {}".format(user_stories_list))

### Case Studies 

In [86]:
text_1 = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution. Repeats in DNA can be broken into a number of different major classes such as LINEs, SINEs and LTRs. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, that will all need repeat annotation. Currently we have two potential approach to annotating repeats. The first is building a repeat library for a species (using RepeatModeler) and then annotating the repeats on the genome (using RepeatMasker). This method both finds and classifies the repeats and finds lineage specific repeats, however building a repeat library is computationally costly. The second approach is to use an extremely fast k-mer approach (REpeatDetector, aka Red), to mask the genome in a fraction of this time. The downside is that this approach does not classify repeats and so is not very informative for researchers studying repeat evolution. In this project we want to explore Deep Learning in order to help classify repeats. We have large existing training sets across hundreds of species, spanning billions of classified repeats. As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label it according to the class of repeats it belongs. You will explore the most efficient approach in terms of both preparing the training data and constructing the network. If the training is successful, we will then test the resulting model from a perspective of compute efficiency, i.e. does the model produce similar results to our existing method of classification (i.e. building a repeat library for the species and then using it to find and classify repeats) and what is the relative compute cost in each approach. Depending on the success and progress related to the above, there may also be the opportunity to take the project a step further, in terms of generative repeat library construction, i.e. given a fast k-mer derived set of repeat sequences and their coordinates on the genome, is it possible to generate a repeat library. This would be highly experimental and only considered after fast and excellent progress on the core project."

text_2 = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, that will all need repeat annotation. Currently our repeat annotation pipelines are run via an in-house workflow management system, eHive. eHive is Perl based and nearing end of life and as a result we are transitioning much or our infrastructure to other workflow managers such as Nextflow. In this project you will work together with us to help redesign our repeat annotation pipeline. We will identify all the existing components, decide what to keep and what to remove and then come up with a final workflow. You will then implement this workflow using Nextflow and test the deployment both locally and on our various cloud partners. Time permitting we will work on costing the pipeline using a variety of species to come up with a cost per gigabase of sequence to mask repeats. Similarly, if there is additional time, we will look at large scale deployment of the pipeline on our species to build a consistent set of repeat resources for public use."

text_3 = "Protein-coding genes form the basis of many scientific analyses. They have directly links to important real world problems such as human health, food security and ecosystem conservation. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, and these genomes need structural annotation of genes."

text_4 = "Testing is a crucial part of the software development process, as it helps to ensure the quality and functionality of the software. It helps to identify bugs, improve the user experience, and it ensures that the software meets the specified requirements and standards. Query languages have well-defined syntax and require specific behavior of the database system. Instead of adding an entire new feature, this project aims at improving the test coverage for the cypher query language in Polypheny-DB. By following the official documentation of the openCypher query language and by systematically adding test cases, existing bugs can be identified, and future regressions can be avoided"

text_5 = "For some applications, especially for those making use of the multimedia and file storage capabilities of Polypheny-DB, it is useful to represent and interact with a table (or the result of an arbitrary query) as file system. With Query to File we already have a prototype implementation of this using FUSE and running on the client computer. The idea of this project is to integrate this concept directly into Polypheny-DB. Instead of an application running on the local machine,"

text_6 = "Currently, there is a JDBC driver and a Python connector for Polypheny-DB. In this project, support for other languages or frameworks shall be added. This project is explicitly for developers with experience with interacting with databases in a specific language or framework. Feel free to link references to experience with that language or framework in your proposal."

text_7 = "Polypheny-DB visualizes query plans in its user interface. Although this feature is very powerful and provides various insights, there is potential for visual improvements. This project idea is about visually improving the plan view. This might include adding the estimated number of row to the edges or making the thickness of the edges depending on it. A proposal for this project idea should include a concept on the planned changes."

text_8 = "CouchDB is a popular document-oriented database system. It features an HTTP query interface that allows querying and manipulating data. The idea of this project is to build a query interface for Polypheny-DB that adheres to the specification of the CouchDB query API. This would allow to seamlessly replace an CouchDB database with Polypheny-DB or to use applications written for ChouchDB with Polypheny-DB."

text_9 = "SPARQL is a query language for RDF graphs and is one of the key technologies of the semantic web. It is used to perform operations such as selecting, inserting, updating, and deleting data. SPARQL is similar to SQL, but it is specifically designed for querying RDF data. The idea of this project is to add native support for SPARQL to Polypheny-DB. Since Polypheny-DB uses the Label-Property-Graph (LPG) model to represent graphs, implementing some kind of mapping will be necessary."

text_10 = "Data source adapters allow to map existing data into the schema of Polypheny-DB. This allows to query the mapped data using the available query languages and features of Polypheny-DB. Furthermore, imported entities can be combined (e.g., joined or unioned) with other tables. The goal of this project is to add an adapter similar to the CSV adapter but for XML files. You can also come up with your own idea for a data source adapter. Data source adapters do not necessarily "

text_11 = "This project is similar to the data source adapter for XML Files. However, instead of querying a static file, the idea of this project is to build an adapter that accesses WikiData. WikiData is a free, open, multilingual knowledge graph maintained by the Wikimedia Foundation. It is a structured database of data that can be linked and reused across multiple Wikipedia projects and other websites. The data in WikiData includes information about entities such as people, places, organizations, and works of art, as well as information about relationships between entities and facts about those entities. The whole database can be downloaded in different formats. This project has the potential to be extended into a large project."

text_12 = "LDAP (Lightweight Directory Access Protocol) is a widely used, open and vendor-neutral, industry standard application protocol for accessing and maintaining distributed directory information services over an Internet Protocol (IP) network. It provides a common interface for accessing and manipulating directory information, such as usernames and passwords, email addresses, and other directory-based information. By adding support for querying Polypheny-DB using LDAP, Polypheny-DB can seamlessly be integrated in applications using LDAP."

text_13 = "Generating the expected HTTP responses is a difficult task. The student is expected to study Jenkins core to identify ways to extract them. For example, they could be extracted from Javadocs and annotations. As part of the community bonding and student project proposal phase, the student is expected to make a few proposals on how to specify and generate the REST API for the Jenkins core and for the plugins. In the case the student finds that it is not possible to generate the REST API from a specification, the student should identify why this is not possible. We also ask the student to explore and propose a way to have REST API of plugins be generated from a REST API specification. For example, some auto-code could populate what the javadoc would look like in an empty-plugin used by the maven plugin generator. The student is also expected to study and propose how the REST API documentation generation could be part of the REST API generator. It might be helpful to automatically generate some code for the REST API when the plugin developer creates a plugin for the first time using the plugin skeleton generator. Any methodology created to handle the REST API should be built into the skeleton generator. The jenkins core REST API and the plugins own REST API need to be versioned separately. It is suggested to focus first on generating the specification, then later look at the versioning of the REST API. Nested objects make versioning challenging. Jenkins users should be easily able to see the REST APIs available for their installed Jenkins. For Jenkins core, this could be done with a URL like: http://localhost/rest/api/1.0. The plugins would have their own REST API path with a version number like: http://localhost/plugin/rest/api/1.0. Plugins and the core would thus have their own version number, and an additional REST API version number. Automated API documentation using the OpenAPI 3.0 specification is part of identifying the API specs."

text_14 = " Moira’s users are able to set up new delivery channels and contacts to be used with those channels. However Moira doesn’t check if the channel configuration is valid and alerts can be actually sent. A user may provide a non-existent Slack user name, block Moira’s bot in Telegram, etc. As a result, such user wouldn’t be able to receive alerts. The bad thing is that sometimes invalid configuration would cause Moira’s bots to be banned for a certain period of time. This effectively means a denial-of-service for alerts which is highly undesirable. The aim of this project is to implement health checks when delivery channels and contacts are set up. To do so, one should enhance the delivery channel and contact setup flow: send a test alert, verify that it’s received, don’t let to save an invalid configuration otherwise. Certain modifications of the web UI may be required."

text_15 = "Moira designed to be API-first solution and all the setup of alerting must be done via HTTP API. Unfortunately Moira’a API right now is not follow all the principles of REST. This means that HTTP methods somewhere are not used correctly and URL paths somewhere are not describe the resources in a right way. Additionally some of the endpoints provide the data which schema is overcomplicated and contains wrong attributes. The great solution for this type of issues will be to use JSON API standard. The aim of this project is to define methods of API that do not follow to the and change it using the REST and JSON API principles."

text_16 = "Explanation. Moira is a huge and complicated software and it operates with a huge amount of data. Sometimes for statistics and troubleshooting we need to define some metrics that will more precise tell us which amount of load Moira is carrying on. The example of this metrics is: amount of triggers with tagged metrics, amount of triggers with huge amount of metrics, amount of triggers with and without subscriptions, etc.To achieve this goal we can create a new microservice that will collect this data from storage and export it to graphite or implement this metrics to existing services."

text_17 = "To provide best user experience Moira’s web UI were developed with accuracy and meant to be as much minimalistic and laconic as possible. But still there are exist pages that do not look perfectly in mobile version of web interface. The example of this pages are following pages: Main page and navigation on it, Subscriptions page, Trigger page and trigger edit pages, Teams page. The aim of this project is to add this pages to mobile version of Moira’s web UI and build the UI with best user experience in mobile environment."

text_18 = "On-call engineers are badly affected by noisy triggers that generate alerts multiple times a day. Attention to alerts reduces greatly, and chances to miss one important alert grow. One badly configured flapping trigger can affect the entire workflow. Our documentation contains an entire page dedicated to this problem with some tips on mitigation. But we can do more. The aim of this project is to help Moira users identify noisy triggers. To do so, one should research and define a metric of trigger noisiness, and then create a UI page that demonstrates worst triggers to the user."

text_19 = "Moira’s web UI is nice and widely used. However, users don’t always want to create triggers, subscriptions, and contacts manually. They would like to be able to automate routine tasks with the tools like Ansible which they already use to bootstrap database and application clusters. For this kind of automation, Moira should have a well-documented API and a number of client libraries for all popular languages. At this point, Moira doesn’t have any API documentation. To use the API, one should study Moira’s source code or an existing client library source code to understand how the API works and reverse-engineer contracts of its methods. The aim of this project is to provide an always up-to-date documentation of Moira’s API and a few client libraries. To do so, one should create an OpenAPI description of API, generate a number of client libraries for popular programming languages with Swagger tools, and setup a process so the documentation and the clients are updated when a new API version is released."

text_20 = "Genes form the basis of many scientific analyses. They have directly links to important real world problems such as human health, food security and ecosystem conservation. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, and these genomes need structural annotation of genes. Genes are made up of many different features, but exons are arguably the key feature as they represent the blocks of the genome that are transcribed in to RNA, which may form functional structures, regulate the expression of other genes or encode proteins. Under certain conditions exons may be included or skipped in the transcribed RNA, sometimes leading to different functional outcomes. A particular permutation of exons that forms a transcribed RNA is known as a transcript. While there is often on particular transcript that represents the normal state of the gene, and thus is most prevalent, it is very common to have alternative transcripts expressed, particularly in higher eukaryotes. These may be expressed in different tissues or points in time, or simply expressed continuously but at a lower level to the dominant transcript. It is important to have as complete a representation of the full set of transcripts in a gene as possible. Short read sequencing is a common method for finding alternative transcript structures, however the nature of the technology means we cannot be certain that the permutations of exons we infer from short read data actually exist in reality. Long read data allows us to directly observer full length RNA and thus should allow us to confidently identify alternative transcripts, but the technology is less common place and also does not capture as many genes as short read data. There are also frequently fragmented data present. The objective of this project will be to examine methods of better representing potential full length transcripts via deep learning. We will preform our test in mammals, where there are several high-quality reference annotations (human and mouse in particular). We will take genes from mammals where large quantities of long read data are available and identify high confidence sets of alternative transcripts. We will then utilise the union of the exons described in these transcripts to attempt to help train a model capable of validating alternative transcripts. There are two approaches we could take, the first would be to find the longest possible exon chain, assume this is the dominant transcript and automatically generate a set of alternative transcripts with exon skipping, where the model would produce a binary output as to whether or not a permutation was valid. This approach would be sraightforward, but as some genes can have many exons, this could generate many permutations. The other approach would be to try and build a generative model, where the input is the union of all unique exons across the input set, while the output would be a set of transcripts and the exons contained in each. This would be more robust, but would require a more complex model. The project will involve you working with us to identify suitable training data from our existing annotations and assessing and implementing a suitable approach to using the data to train a model. Your work will help decide which approach is most viable and you will be responsible for implementing and training the corresponding model. We will test the resulting model in terms of how accurately it can validate true alternative transcripts in both gold standard and non-model mammalian species. Time permitting we may consider extending past mammals into other eukaryotes to see how generalisable it is"

text_21 = "Untranslated regions (UTRs) represent the boundaries of protein-coding genes. These regions are important for understanding where one gene ends and a neighbouring gene starts. UTR regions sometimes house features that regulate the expression of the gene in addition to being key to analysing the expression of the gene when using single cell data. Annotating UTRs is difficult. It is clear from long and short read transcriptomic data that there is rarely a precise start/end to the UTRs of a gene. There are usually regions where there transcriptional machinary is more likely to attach or detach. In particular, short read data (which is most frequently available) is naturally imprecise for determining the start/end of the UTR as each read represents a small fragment of the gene. If the sampling of these small fragments is uneven, it leads to incorrect identification of the start/end. At the same time the cellular machinery for transcription is able to identify these binding/release regions despite not fundamentally changing across eukaryotes, so it should be possible to directly identify their approximate locations directly from the genome sequence. In this project we will explore the use of long read data and high-quality reference annotations to train a model to predict the location of a UTR start or end from a sequence adjacent to a coding region start/end. While it will not be possible to do this for all UTRs, particularly ones that are very long or have large introns contained within, we will be able to train to predict simple UTR start/ends within a fixed window. This will assist with better representation of UTRs, particularly in species lacking transcriptomic data. We will work together to build a training set consisting of genes where we are confident we have captured repesentative UTR boundaries. When several possible boundaries in one of these genes are present, we will select the longest UTR boundary, unless it is infrequently observed relative to the number of long reads mapped to the gene (in which case the boundary will be set to be a balance of the longest UTR observed in more than 20 percent of the reads). We will use as much of the sequence of the flanking region as possible along with the coordinate of the selected boundary, to then train the model to predict the boundary coordinates. You will be responsible for building the network and testing different hyperparameters during training. We will then compare to gold standard reference annotations and look at the approximate distance between the predicted and true boundaries to evaluate the model."

text_22 = "MGnify is a freely available hub for the analysis and exploration of metagenomic, metatranscriptomic, amplicon and assembly data. The resource provides rich functional and taxonomic analyses of user-submitted sequences, as well as analysis of publicly available metagenomic datasets held within the European Nucleotide Archive (ENA). The public-facing service is a React.js website backed by a Python/Django REST API, which serves metagenomics data and associated analyses via API endpoints and data files. There are also micro-services for specific tasks like sequence searches. In addition to the website, MGnify provides hosted Jupyter Notebooks to cover extra use cases and showcase how the MGnify API-provided data can be used in downstream data analysis tasks (using R and Python). Together, the website and notebooks include many data visualisation built using various technologies: Highcharts (Javascript) for website graphics like nucleotide distributions, specialised javascript components like the Integrative Genomics Viewer for genome annotations, and matplotlib and ggplot for graphics created in the Jupyter notebooks. As MGnify approach the release of our next-generation analysis pipeline, the aim is develop a reusable framework for managing these visualisations. Specifically, we aim to reuse components and libraries in as many places as possible, and to support FAIR (Findable, Accessible, Interoperable, Reusable) principles by enabling our users to easily build upon the visualisations we provide. An example could be: the MGnify website using a d3.js histogram to display protein annotation information, from where users can jump to an Observable JS Notebook with the required API fetching code and d3 visualisation code ready for them to modify to produce a graphic suitable for their own publication."

text_23 = "Understanding the impact of genetic variation on disease requires comprehensive gene annotation. Human genes are well characterised following more than two decades of work on their annotation, however, we know that this annotation is not complete and that new experimental methods are generating data to help us towards the goal of complete gene annotation. Long transcriptomic reads allow us to identify and annotate many new features, including the start and end of a transcript which can be combined to give information for genes. We would like to develop a pipeline to extract long transcriptomic data from the European Nucleotide Archive (ENA), map to the human reference genome and extract the terminal co-ordinates to create a growing collection of transcript start/end positions. This data will support improving the accuracy of gene annotation of individual transcripts and genes and give insight into any differences between transcript start and end sites across different tissues"

text_24 = "Understanding the impact of genetic variation on disease requires comprehensive gene annotation. Human genes are well characterised following more than two decades of work on their annotation, however, we know that this annotation is not complete and that new experimental methods are generating data to help us towards the goal of complete gene annotation. We have developed an automated workflow to use long transcriptomic data to add novel alternatively spliced transcripts to our gene annotation. Our method uses very strict thresholds to ensure that no poor-quality models are added to the gene annotation, although as a consequence we reject significant numbers of viable novel transcripts. We want to use machine learning to recover good quality but rejected transcripts and improve the setting of initial filters for new datasets."

text_25 = "Ensembl Metazoa plans every release by manually collecting a list of available species from INSDC resources a few months in advance, and then going over their available information (e.g. taxonomic clade, assembly quality, annotation availability/quality, RefSeq availability, etc.) to filter out and select about 20 species that will be processed and loaded into the next Ensembl release. As an example, taxonomic information is used to highlight species that cover new clades not present in Ensembl, as well as those that bring novel information to existing clades, e.g. new locust genomes in the well-known Neoptera clade. In our plans to expand our Ensembl Metazoa resources we would like to introduce automation in the process described above to check available new species/updates from INSDC resources, as well as create a system that allows us to rank them depending on different criteria. This system should collect the data on a regular basis, e.g. monthly, and provide all the required information to easily ingest it into our production loading system, e.g. GCA, species name, strain, common name, taxonomy,… Additionally, it would be desirable if the new system could rely on our JIRA tracking system to create and update this information, so we can feed this information programmatically into our processing and loading system."

text_26 = "The search engine of any website can be one of the most useful tools for users to help them easily retrieve the information they are looking for. Currently, Ensembl’s search tool works based on indexed fields of our databases, that mainly covers key information, e.g. genes, species, proteins, including many synonyms for every one of them. As we plan to move to our new beta website by the end of 2023, we want to make our search engine even better so our users can enjoy the experience of using Ensembl even more. We would like to expand our Ensembl beta’s search functionality to include and support searching based on taxonomic information. In particular, we are interested in providing users a list of close relatives when a given species is requested and it is not part of Ensembl (yet), return the list of species available given a taxonomic clade instead of a species name, or find a species even when a (homotypic) synonym is provided instead of its current scientific name. The objective of this project is to create a standalone Elasticsearch tool that can handle taxonomic-related requests."

text_27 = "Our TA grading interface is elaborate, highly-featured, and customizable. However, the interface is visually overwhelming to new graders. Some of our TA grading features are not adequately tested by automated unit and end-to-end regression testing. Finally, the performance of these webpages is problematic for large courses due to inefficient database queries. Expected Outcomes: The goal would be to expand the automated testing of the TA Grading pages, patch bugs uncovered by this improved testing, refactor the existing code and SQL queries to improve performance, and possibly propose and execute small user interface revisions."

text_28 = "Currently, instructors must write a configuration as a config.json (and any necessary additional files) and upload or store these files on the local file system. We would like to provide an alternate web GUI interface for creating basic or moderately complex autograding configurations. We have preliminary support for automated creation of expected output files (from and instructor solution – currently limited to Python) and randomized test case input. This project will involve multiple modules of Submitty including web UI development, integration, documentation, additional tutorial examples, and extending output generation to instructor solutions in compiled languages. Expected Outcomes: The goal would be to streamline the assignment configuration process for non-technical instructors, relevant for use in non-computer-science/non-programming courses."

text_29 = "Automated testing of student submitted software carries system and security risks from malicious code but also simply buggy or inefficient code. Upper level coursework on advanced topics in computer science including networking, operating systems, and kernel development are especially complex challenges. Submitty supports a variety of tools to securely test including both sandboxing and containerization (Docker). These tools must manage and limit system resources (time, CPU, processes, memory, files, system calls, sockets, etc.) The next step is to facilitate the creation of instructor-customized container images (with specific languages, packages, databases, etc.). Care must be taken to ensure small container size and efficient performance. Advanced project idea: We would like to use Submitty to automatically test and grade homework assignments that require modifications to the operating system kernel. Before doing so on a production machine, we need to do testing to ensure the right controls are in place. Expected Outcomes: Increased usage of containerized autograding in all levels of courses. Reduced size and improved performance of containerized autograding for our autograding tutorial examples and selected real-world use cases of autograding."

text_30 = "Each commit and pull request to github launches continuous integration testing of a portion of the Submitty code base. We would like to expand the code coverage of our unit and integration tests. Furthermore, some of our more complex end-to-end test case are not currently run automatically with each GitHub pull request, because the system setup is too time consuming and lengthy or unpredictable running times affect test stability. We would like to optimize our use of GitHub Actions and caching so we can run all of these test cases. Expected Outcomes: Increased code coverage and stability of the Submitty CI test suite, increased automation of CI testing, increased performance (decreased running time) for CI testing through GitHub Actions."

text_31 = "Submitty is responsible for securing confidential information. It is important that we regularly assess the security of this data. Once a potential vulnerability is found, the system must be promptly patched and documented to prevent future problems. Expected Outcomes: Security risk assessment, identification and repair of specific security vulnerabilities, expansion and creation of continuous integration tools to prevent introduction of new vulnerabilities."

text_32 = "Most print jobs are sent via the print dialog of a desktop application, like evince, Chrome, LibreOffice, DarkTable, … Print dialogs are usually, like “Open …” or “Save as …” dialogs, provided by the GUI toolkits, in most cases GTK or Qt, sometimes applications come also with their own creations, like LibreOffice or Chromium. Problem here is usually not the design of the dialog itself, most are actually easy to use, but the way how they connect to CUPS (and also to other print technologies) and how well this connection code is maintained and kept up-to-date. GUI toolkit projects are large projects, often with long release cycles and all with a certain inertia, and there are things which many people are eager to work on, and others, like print dialogs, which have to be there but no one is really motivated to push their development forward and do the needed maintenance work. An important part of the maintenance of a GUI toolkit is that it interfaces well and correctly with the underlying operating system, graphics, sound, storage, …, and printing! The OS is under continuous development, things are changing all the time, components get replaced by others, printing is CUPS for 23 years, but within CUPS we have also changes, and they need to be taken care of in the print dialogs. Several years back, CUPS started to create temporary queues for driverless IPP network printers (or remote CUPS printers, which are emulations of IPP printers), which are only physically available when they are accessed (capabilities are polled or job printed). Print dialogs used an old API which did not support this, the temporary queues did not appear in the dialog, a helper daemon, cups-browsed had to convert the temporary queues into physical queues as a workaround. The correct solution had been to change the print dialogs to a newer CUPS API which supports these queues, but no one at the GUI toolkit projects has felt responsible and taken the time for this update for many years. Only recently this got fixed. This made me introducing the Common Print Dialog Backends (CPDB) back in 2017, a de-coupling of the print technology (CUPS, print-to-file, that time also Google Cloud Print) from the GUI. The GUI projects have to adopt the CPDB support only once and then OpenPrinting (or any upcoming cloud printing projects) takes care of the CPDB backend for the print technologies to be up-to-date with any changes. This way print technology projects can react quickly and are not dependent any more on the GUI toolkit’s inertia. The print dialogs of the major GUI toolkits, GTK, Qt, got CPDB support added in GSoC 2022, but several applications come with their own creation of a print dialog. AFAIK these are Firefox/Thunderbird (Mozilla), Chromium/Chrome (Google), and LibreOffice. Also these dialogs need to get CPDB support to make CPDB universal. Then we are especially prepared for the switch to CUPS 3.x which does not support PPD files any more, as the CUPS backend of CPDB is already using only CUPS APIs not handling PPD files. And we are also prepared for IPP infrastructure/cloud printing for which we also want to create a CPDB backend (see below). The contributor's task is to get CPDB into the print dialogs upstream, the UI of them does not need to be changed. Dialogs to be treated are Mozilla for Firefox and Thunderbird, Chromium/Chrome, LibreOffice, and any other application-specific dialog. For LibreOffice there was already worked on CPDB support back in 2017, but in the meantime things have changed and the dialog needs to get updated, especially for the new features of CPDB 2.x (human-readable strings/translations, option groups, …)."

text_33 = "cups-browsed is a helper daemon for CUPS to automatically set up network printers. In the beginning it was to overcome that when CUPS from 1.6.x on used DNS-SD instead of its own browsing/broadcasting, it did not auto-setup client queues any more. With the time it got lots of more functionality: Legacy CUPS browsing/broadcasting for interoperability with CUPS 1.5.x and older (often in long-term support enterprise distros), clustering, manually and automatically, also for clusters of printers of completely different types, user has one “universal” print queue and by their option settings job goes to the correct printer. Also filtering lists of many printers is supported, and everything can be configured/fine-tuned by the user or admin. With CUPS already having its temporary queue functionality for network printers without need of explicit manual setup, and the Common Print Dialog Backends getting into the print dialogs and talking to CUPS with modern interfaces, we do not need automatic queue creation for network printers any more, but the other functionality of cups-browsed is still very useful. So we do not want to discontinue cups-browsed, but take it into the New Architecture of printing, giving it the appropriate modern interface. Currently cups-browsed discovers printers via DNS-SD, and then creates (or not creates) local print queues pointing to them according to the rules in its configuration file. But currently it creates classic CUPS queues, with PPD files generated according to the printer's IPP attributes. What we need is make it working with CUPS 3.x, which drops PPD files and classic printer drivers. For this we want tom turn it into a Printer Application, the new printer driver format, emulating a driverless IPP printer. This way CUPS can access the printers created by cups-browsed and create temporary queues for them on-demand. Internally we define with configuration file what these queues should do: Clusters, retro-fit to old CUPS, … The contributor's task is to implement this transition, using PAPPL for all standard elements of a Printer Application, like daemon, IPP parser, web admin interface, … They will make cups-browsed create a queue in the Printer Application if appropriate destination printers get discovered, and remove it when these printers disappear (turned off, user leaves network, …). CUPS will simply pick up on the emulated IPP printers then. And there will be a web interface to be created, for the configuration of the clusters, filter rules, …. one does not need to edit cups-browsed.conf manually any more."

text_34 = "Gutenprint is a high-quality printer driver for a wide range of inkjets, especially Epson and Canon, dye-sublimation printers and even monochrome PCL laser printers. It does not only cover many printers to give them support under Linux and free software operating systems at all, but also is optimized for highest possible print quality, so that at least on some printers and with the right settings you can even get better print quality than with the original (Windows/Mac) drivers. Gutenprint is usually used as classic CUPS driver with a CUPS filter and a PPD file generator. As, as mentioned above, CUPS will not support PPD files any more from version 3.x on and when using the CUPS Snap one cannot install PPD-based drivers already now. So a Printer Application of Gutenprint is needed. There is already one, but it is a retro-fit of the classic CUPS driver. The Printer Application simply calls the PPD generator and the filter at the right places to do its job. As Gutenprint contains all its printer support and printer capability info in libgutenprint or in files which are read by libgutenprint, the PPD generator and the filter only containing calls of functions in libgutenprint, it should be easy to create a PAPPL-based, native Printer Application for Gutenprint. Here on an incoming get-printer-attributes IPP request we call the same functions which the PPD generator calls, but instead of translating the responses into a PPD file we translate it into the IPP answer for the get-printer-attributes request. And when we have a job to print, we call the library functions which the filter calls, but directly. This does not only save us from resource-consuming calls of external executables but we are also no harnessed by the PPD file syntax and so have more flexibility in the UI representations of the (often more than 100) printer-specific options. Also, generally we should completely do away with the PPDs. Retro-fitting is only an ugly interim solution or for drivers which are not actively maintained anymore and for printers we do not have at hand and so cannot test the drivers."

# The below texts are Case studies used in a research papper to test the accuracy of similler USG tool:

text_35 = "The Payroll Administrator maintains employee information. The Payroll Administrator is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

text_36 = "First, I want the website to be a place for the local scene. Somewhere the kids can come and check out upcoming events—surf competitions, lessons, things like that. Second, I need a place to sell merchandise. Boards, wet suits, clothes, videos, and things like that. But it’s gotta be easy to use and look really good. Third, I’ve always wanted a webcam pointing at the beach. This way, you don’t have to come down to check out the conditions. You can just open your laptop, go to the website, and see whether it’s worth getting up. This also means the website has to be fast."

text_37 = "The user of the application is able to track his/her performance when running or riding his/her bike via the GPS. His/her performance can be saved to his/her account and shared with other friends from his/her social networks. The user cannot delete any entries, once they are saved to the account. The user has the ability to create a report with all the activities by date range, or by type (running or biking)."

text_38 = "The bakery system will maintain data from customers, products, service supplier, employees as well as generate administrative reports. The maintenance feature includes adding, deleting, and updating all customer, users and employees information that will be: name, address, date of birth and for employees payment classification too. And for the users of the bakery system, it will be saved the personal information, an id and password too. The products maintenance feature will be about to create product in the system with information such as name, value for sale, date of sale, value of purchase, quantity, value off when applicable. This feature will include, deleting, and updating the product information as well. The administrative reports will generate data from sales, crossing information with the date and value of sales. It will also include an alert function when items and products are bellow the minimum amount defined by administrative users. Finally, the system also provides a cash flow that will be the feature that provides information for administrative reports." 

### Coreference Resolution Using StanfordCoreNLP

In [87]:
# A StanfordCoreNLP object is instantiated, which starts the CoreNLP server. The path to the CoreNLP directory is passed as an argument to the constructor.
# Start the Stanford CoreNLP server
stanford_nlp = StanfordCoreNLP(os.path.join(os.getcwd(), 'stanford-corenlp-full-2018-10-05'))

# Define the input text
text = text_35


# The CoreNLP pipeline is set up using a Python dictionary. The coref annotator is used for coreference resolution, and the pipelineLanguage parameter is set to en for English. The outputFormat parameter is set to json so that the output can be easily parsed.
# Set up the CoreNLP pipeline
props = {
    'annotators': 'coref',
    'pipelineLanguage': 'en',
    'outputFormat': 'json'
}

# The StanfordCoreNLP.annotate() method is used to perform coreference resolution on the input text. The text and the pipeline configuration are passed as arguments. The output is returned as a JSON-formatted string.
# The output from the CoreNLP server is a JSON-formatted string, which is parsed into a Python dictionary using the json.loads() method. The resulting dictionary contains information about the input text and its coreferences.
# Perform coreference resolution
result = stanford_nlp.annotate(text, properties=props)
result = json.loads(result)

# To replace each pronoun with its corresponding entity, the script loops over the corefs key in the resulting dictionary. For each coreference cluster, the representative mention is determined and all mentions in the cluster are replaced with it.
# Replace pronouns with their corresponding entities
# The first line of the code initializes a loop over all the coreference chains returned by Stanford CoreNLP
for coref in result['corefs']:
    # For each coreference chain, the code sorts the mentions in the chain by their start index
    # The idea here is to replace each pronoun with the noun phrase that occurs earlier in the text. Sorting the mentions by their start index ensures that we always replace a pronoun with the closest noun phrase that it refers to.
    mentions = sorted(result['corefs'][coref], key=lambda x: x['startIndex'])
    # The code then initializes a variable called replace_with to the text of the first mention in the sorted list
    # This variable will hold the final noun phrase that the pronoun will be replaced with.
    replace_with = mentions[0]['text']
    # The code then loops over the mentions in the chain and checks if the current mention is the representative mention
    # The representative mention is the mention that represents the entire chain. For example, in the sentence "John saw Mary in the park. He waved at her.", the representative mention of the pronoun "her" would be "Mary". The code updates the replace_with variable to hold the text of the representative mention if the current mention is the representative mention.
    for mention in mentions:
        if mention['isRepresentativeMention']:
            replace_with = mention['text']
            break
    # Finally, the code loops over all the tokens in the input text and checks if the current token is a pronoun that refers to the current coreference chain
    for sentence in result['sentences']:
        for token in sentence['tokens']:
            # If the current token is a pronoun that refers to the current coreference chain, the code updates the token's word attribute to hold the text of the noun phrase that the pronoun refers to.
            if token['originalText'] in mentions[-1]['text'] and token['pos'] == 'PRP':
                token['word'] = replace_with

# Finally, the updated text is assembled by looping over the sentences key in the resulting dictionary, joining the tokens back into text, and adding the appropriate punctuation.
# Join the tokens back into text
new_text = ''
for sentence in result['sentences']:
    for token in sentence['tokens']:
        new_text += token['word'] + ' '
    new_text = new_text[:-1]

# The end result is a modified version of the input text where all the pronouns have been replaced with the corresponding noun phrases that they refer to.
# Print the updated text
print(new_text)

# Stop the Stanford CoreNLP server
stanford_nlp.close()

The Payroll Administrator maintains employee information .The Payroll Administrator is responsible for adding new employees , deleting employees and changing all employee information such as title , address , and payment classification -LRB- hourly , salaried , commissioned -RRB- , as well as running administrative reports .


In [88]:
final_text = new_text.replace(" .", ". ").replace(" ,", ",").replace("-LRB- ", "(").replace(" -RRB-", ")")
print(final_text)

The Payroll Administrator maintains employee information. The Payroll Administrator is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports. 


### Rule Based User Stories Generation Using Spacy 

In [89]:
nlp = spacy.load("en_core_web_lg")


# put final_text if you want to use Coreference Resolution or if you want to skip using Coreference Resolution, you could put any of the (text_1 ---> text_38) instead.
doc = nlp(final_text)


sentences_list = list(doc.sents)

for sentence in sentences_list:
  print("Sentence: {}".format(sentence))

  subject_list = []
  verb_list = []
  object_list = []
  subject_verb_object_list = []
  user_stories_components_list = []
  user_stories_list = []
  
  for token in sentence:
    get_subject_verb_object_from_sentence(token, subject_list, verb_list, object_list)


  create_subject_verb_object_list(doc, sentence, subject_list, verb_list, object_list, subject_verb_object_list)

  validate_subject_verb_object_list(doc, sentence, subject_list, verb_list, object_list, subject_verb_object_list, user_stories_components_list)

  generate_user_stories_from_sentence(doc, sentence, subject_verb_object_list, user_stories_components_list, user_stories_list)

  print("______________________________________________________________________________________________")

Sentence: The Payroll Administrator maintains employee information.
subject_verb_object_list: [{'subject': Administrator, 'verb': maintains, 'object': information, 'isNegative_sentence': False}]
user_stories_components_list: [{'subject': 'the payroll administrator', 'pronoun': 'I', 'verb': 'maintain', 'verb_in_title': 'maintain', 'object': 'employee information', 'object_in_title': 'employee information', 'isNegative': False}]
user_stories_list: [{'userStoryTitle': 'Maintain Employee Information', 'userStoryDescription': 'As the payroll administrator, I want to be able to maintain employee information.', 'userStoryID': UUID('9cf208ca-057a-445d-a39e-34e35edf5910')}]
______________________________________________________________________________________________
Sentence: The Payroll Administrator is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as r

# Experminting 

In [None]:
#text = "The Payroll Administrator maintains employee information. The Payroll Administrator is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

#text = "The product catalog should be easy to navigate and allow users to filter products by different criteria such as price range, product type, and brand. The shopping cart should display the products added by the user and the total price. The checkout process should be simple and secure, and users should be able to provide their name, shipping address, and payment method easily. We also identified the need for a payment gateway to ensure secure payments. Lastly, users should be able to view their order history, change their details, and manage their account from a user profile page. To capture the needs of the users, we created user stories that reflect the requirements. These stories include browsing and filtering products, adding products to the shopping cart, checking out easily and securely, receiving email confirmation of orders, and viewing order history from a user profile page. We will use these user stories to guide the development of the website and ensure that all the requirements are met. Overall, we want to create a website that meets the needs of our users and provides them with a seamless shopping experience. We look forward to working together to bring this project to life."

#text = "Overall, we want to create a website that meets the needs of our users and provides them with a seamless shopping experience."

#text = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution. Repeats in DNA can be broken into a number of different major classes such as LINEs, SINEs and LTRs. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, that will all need repeat annotation. Currently we have two potential approach to annotating repeats. The first is building a repeat library for a species (using RepeatModeler) and then annotating the repeats on the genome (using RepeatMasker). This method both finds and classifies the repeats and finds lineage specific repeats, however building a repeat library is computationally costly. The second approach is to use an extremely fast k-mer approach (REpeatDetector, aka Red), to mask the genome in a fraction of this time. The downside is that this approach does not classify repeats and so is not very informative for researchers studying repeat evolution. In this project we want to explore Deep Learning in order to help classify repeats. We have large existing training sets across hundreds of species, spanning billions of classified repeats. As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label it according to the class of repeats it belongs. You will explore the most efficient approach in terms of both preparing the training data and constructing the network. If the training is successful, we will then test the resulting model from a perspective of compute efficiency, i.e. does the model produce similar results to our existing method of classification (i.e. building a repeat library for the species and then using it to find and classify repeats) and what is the relative compute cost in each approach. Depending on the success and progress related to the above, there may also be the opportunity to take the project a step further, in terms of generative repeat library construction, i.e. given a fast k-mer derived set of repeat sequences and their coordinates on the genome, is it possible to generate a repeat library. This would be highly experimental and only considered after fast and excellent progress on the core project."

#text = "The Payroll Administrator maintains employee information. He is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

#text = "The Payroll Administrator maintains employee information for example such as not impinfo and not imp. The Payroll Administrator is responsible for adding new employees such as addnewmp and addoldemp, deleting employees such as delnewmp and deloldemp, and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports such as imprepo and not impreports."

#text = "The Payroll Administrator is not responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

#text = "The Payroll Administrator maintains employee information for example such as not impinfo and not imp. The Payroll Administrator is responsible for adding new employees such as addnewmp and addoldemp, deleting employees such as delnewmp and deloldemp, and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports such as imprepo and not impreports."

#text = "The product catalog should be easy to navigate and allow users to filter products by different criteria such as price range, product type, and brand."

#text = "The product catalog should be easy to navigate and allow users to filter products by different criteria such as price range, product type, and brand. The shopping cart should display the products added by the user and the total price. The checkout process should be simple and secure, and users should be able to provide their name, shipping address, and payment method easily. We also identified the need for a payment gateway to ensure secure payments. Lastly, users should be able to view their order history, change their details, and manage their account from a user profile page. To capture the needs of the users, we created user stories that reflect the requirements. These stories include browsing and filtering products, adding products to the shopping cart, checking out easily and securely, receiving email confirmation of orders, and viewing order history from a user profile page. We will use these user stories to guide the development of the website and ensure that all the requirements are met. Overall, we want to create a website that meets the needs of our users and provides them with a seamless shopping experience. We look forward to working together to bring this project to life."

#text = "The employee information is maintained by the Payroll Administrator."

#text = "Repeats in DNA can be broken into a number of different major classes such as LINEs, SINEs and LTRs."

#Sentence = "This method is responsible for both finding and classifing the repeats and finding lineage specific repeats, however building a repeat library is computationally costly."

#Sentence = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution."

#Sentence = "The downside is not very informative for researchers studying repeat evolution."

#text = "The Payroll Administrator is responsible for adding new employees such as addnewmp and addoldemp, deleting employees such as delnewmp and deloldemp, and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports such as imprepo and not impreports."

#Sentence = " As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label a neural network according to the class of repeats a neural network belongs."

In [None]:
doc = nlp("Hello, (world)! []{}<>")

for token in doc:
    print(token.text, token.is_left_punct)

Hello False
, False
( True
world False
) False
! False
[ True
] False
{ True
} False
< True
> False


In [None]:
Sentence = " As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label a neural network according to the class of repeats a neural network belongs."

doc = nlp(Sentence)


print('{:<12}{:<10}{:<10}{:<10}'.format('token_text', 'token_pos', 'token_dep', 'token_lemm'))

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_lemm = token.lemma_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}{:<10}'.format(token_text, token_pos, token_dep, token_lemm))

token_text  token_pos token_dep token_lemm
            SPACE     dep                 
As          ADP       prep      as        
part        NOUN      pobj      part      
of          ADP       prep      of        
this        DET       det       this      
projects    NOUN      pobj      project   
you         PRON      nsubj     you       
would       AUX       aux       would     
train       VERB      ROOT      train     
a           DET       det       a         
neural      ADJ       amod      neural    
network     NOUN      dobj      network   
to          PART      aux       to        
take        VERB      xcomp     take      
as          ADP       prep      as        
input       NOUN      pobj      input     
an          DET       det       an        
unclassifiedADJ       amod      unclassified
repeat      NOUN      compound  repeat    
sequence    NOUN      dobj      sequence  
and         CCONJ     cc        and       
label       VERB      conj      label     
a        

In [None]:
Sentence = " As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label a neural network according to the class of repeats a neural network belongs."

doc = nlp(Sentence)

for token in doc:
  if(token.pos_ == "ADP"):
    print(token)
    # print(list(token.ancestors))
    # print(list(token.subtree))
    print(list(token.sent))

As
[ , As, part, of, this, projects, you, would, train, a, neural, network, to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs, .]
of
[ , As, part, of, this, projects, you, would, train, a, neural, network, to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs, .]
as
[ , As, part, of, this, projects, you, would, train, a, neural, network, to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs, .]
to
[ , As, part, of, this, projects, you, would, train, a, neural, network, to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs, .]
of
[ , As, part, of, this, projects, you, would, tra

In [None]:
Sentence = " As part of this projects you would train a neural network to take as input an unclassified repeat sequence and label a neural network according to the class of repeats a neural network belongs."

doc = nlp(Sentence)

for token in doc:
  if(token.pos_ == "VERB"):
    print(token)
    print(list(token.subtree))

train
[ , As, part, of, this, projects, you, would, train, a, neural, network, to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs, .]
take
[to, take, as, input, an, unclassified, repeat, sequence, and, label, a, neural, network, according, to, the, class, of, repeats, a, neural, network, belongs]
label
[label, a, neural, network, according, to, the, class, of, repeats]
according
[according, to, the, class, of, repeats]
belongs
[a, neural, network, belongs]


In [None]:
my_list = [1, 2, 3, 4, 3, 5, 3]
item = 3

indices = [index for index in range(len(my_list)) if my_list[index] == item]
last_index = indices[-1]

print(last_index)

6


In [None]:
text = "i love (rozy"

print(text[-1])

print(text + ")")

y
i love (rozy)


In [None]:
my_list = [1,2,3,4,5,6,7]
index_of_4 = my_list.index(4)

for i in my_list[index_of_4:]:
  print(i)

print("_____________________")

for i in my_list[:index_of_4+1][::-1]:
  print(i)


4
5
6
7
_____________________
4
3
2
1


In [None]:
my_list = [1, 2, 3, 4, 5, 6]

my_list_reversed = my_list[::-1]

print(my_list[3:])
print(my_list_reversed[:3])

# print(my_list[0:3])
# print(my_list[::-1])
# print(my_list[2::-1])

[4, 5, 6]
[6, 5, 4]


In [None]:
text =  "The book was written by the author"

doc = nlp(text)


print('{:<12}{:<10}{:<10}{:<10}'.format('token_text', 'token_pos', 'token_dep', 'token_lemm'))

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_lemm = token.lemma_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}{:<10}'.format(token_text, token_pos, token_dep, token_lemm))

token_text  token_pos token_dep token_lemm
The         DET       det       the       
book        NOUN      nsubjpass book      
was         AUX       auxpass   be        
written     VERB      ROOT      write     
by          ADP       agent     by        
the         DET       det       the       
author      NOUN      pobj      author    


In [None]:
text = "The Payroll Administrator maintains employee information for example such as not impinfo and not imp."

doc = nlp(text)


print('{:<12}{:<10}{:<10}{:<10}'.format('token_text', 'token_pos', 'token_dep', 'token_lemm'))

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_lemm = token.lemma_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}{:<10}'.format(token_text, token_pos, token_dep, token_lemm))

token_text  token_pos token_dep token_lemm
The         DET       det       the       
Payroll     PROPN     compound  Payroll   
AdministratorPROPN     nsubj     Administrator
maintains   VERB      ROOT      maintain  
employee    NOUN      compound  employee  
information NOUN      dobj      information
for         ADP       prep      for       
example     NOUN      pobj      example   
such        ADJ       amod      such      
as          ADP       prep      as        
not         PART      neg       not       
impinfo     NOUN      pobj      impinfo   
and         CCONJ     cc        and       
not         PART      neg       not       
imp         ADJ       conj      imp       
.           PUNCT     punct     .         


In [None]:
text = "The Payroll Administrator maintains employee information for example such as not impinfo and not imp."

doc = nlp(text)

for token in doc:
  if(token.pos_ == "ADP"):
    print(token)
    print(list(token.subtree))

for
[for, example, such, as, not, impinfo, and, not, imp]
as
[such, as, not, impinfo, and, not, imp]


In [None]:
text = "The Payroll Administrator is not responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

doc = nlp(text)

for token in doc:
  if(token.lemma_ == "be"):
    print(token)
    print(list(token.children))

is
[Administrator, not, responsible, .]


In [None]:
Sentence = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution."

doc = nlp(Sentence)

for token in doc:
  if(token.pos_ == "VERB"):
    print(token)
    print(list(token.conjuncts))

Finding
[classifying]
classifying
[Finding]
drive
[]


In [None]:
Sentence = text_1

doc = nlp(Sentence)

for token in doc:
  if(token.pos_ == "VERB"):
    print(token)
    print(list(token.subtree))

Finding
[Finding, and, classifying, repetitive, DNA, sequence, in, eukaryotic, genomes]
classifying
[classifying, repetitive, DNA, sequence, in, eukaryotic, genomes]
drive
[both, an, important, first, step, ahead, of, further, genome, annotation, ,, and, also, interesting, in, its, own, right, as, repeats, frequently, drive, genome, evolution]
broken
[Repeats, in, DNA, can, be, broken, into, a, number, of, different, major, classes, such, as, LINEs, ,, SINEs, and, LTRs, .]
producing
[Global, biodiversity, efforts, such, as, Darwin, Tree, of, Life, ,, the, European, Reference, Genome, Atlas, and, the, Earth, BioGenome, Project, are, producing, hundreds, and, soon, thousands, of, high, -, quality, reference, genomes]
need
[Global, biodiversity, efforts, such, as, Darwin, Tree, of, Life, ,, the, European, Reference, Genome, Atlas, and, the, Earth, BioGenome, Project, are, producing, hundreds, and, soon, thousands, of, high, -, quality, reference, genomes, ,, that, will, all, need, repeat,

In [None]:
# Sentence = "This method both finds and classifies the repeats and finds lineage specific repeats, however building a repeat library is computationally costly."
text = "The Payroll Administrator maintains employee information. The Payroll Administrator is responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."
Sentence = "The downside is that this approach does not classify repeats and so that is not very informative for researchers studying repeat evolution."
doc = nlp(Sentence)

for token in doc:
  if(token.dep_ == "nsubj"):
    print(token)
    print(list(token.subtree))

downside
[The, downside]
approach
[this, approach]
that
[that]


In [None]:
Sentence = "This method both finds and classifies the repeats and finds lineage specific repeats, however building a repeat library is computationally costly."
doc = nlp(Sentence)

token_list = [t.text.lower() for t in doc[0:6]]

token_list

['this', 'method', 'both', 'finds', 'and', 'classifies']

In [None]:
print(doc)

The first is building a repeat library for a species (using RepeatModeler) and then annotating the repeats on the genome (using RepeatMasker).


In [None]:
Sentence = "Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution."

doc = nlp(Sentence)

print(doc[0].i)

0


In [None]:
test_text = "The product catalog should be easy to navigate and allow users to filter products by different criteria such as price range, product type, and brand."

doc = nlp(test_text)


print('{:<12}{:<10}{:<10}{:<10}'.format('token_text', 'token_pos', 'token_dep', 'token_lemm'))

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_lemm = token.lemma_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}{:<10}'.format(token_text, token_pos, token_dep, token_lemm))

token_text  token_pos token_dep token_lemm
The         DET       det       the       
product     NOUN      compound  product   
catalog     NOUN      nsubj     catalog   
should      AUX       aux       should    
be          AUX       ROOT      be        
easy        ADJ       acomp     easy      
to          PART      aux       to        
navigate    VERB      xcomp     navigate  
and         CCONJ     cc        and       
allow       VERB      conj      allow     
users       NOUN      nsubj     user      
to          PART      aux       to        
filter      VERB      ccomp     filter    
products    NOUN      dobj      product   
by          ADP       prep      by        
different   ADJ       amod      different 
criteria    NOUN      pobj      criterion 
such        ADJ       amod      such      
as          ADP       prep      as        
price       NOUN      compound  price     
range       NOUN      pobj      range     
,           PUNCT     punct     ,         
product    

In [None]:
my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

for item1, item2 in zip(my_list[:-1], my_list[1:]):
    print(type(item1),item2)

<class 'int'> 2
<class 'int'> 3
<class 'int'> 4
<class 'int'> 5
<class 'int'> 6
<class 'int'> 7
<class 'int'> 8
<class 'int'> 9


In [None]:
text = "The Payroll Administrator is not responsible for adding new employees, deleting employees and changing all employee information such as title, address, and payment classification (hourly, salaried, commissioned), as well as running administrative reports."

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "neg"):
      print(token, list(token.ancestors))
  print("____________________________________________________")

not [is]
____________________________________________________


In [None]:
text = "The employee information is maintained by the Payroll Administrator."

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "pobj"):
      print(token, list(token.ancestors))
  print("____________________________________________________")

Administrator [by, maintained]
____________________________________________________


In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "nsubjpass"):
      print(token, list(token.ancestors))
  print("____________________________________________________")

information [maintained]
____________________________________________________


In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  verb_number = check_verb_number_in_sentence(sentence)
  print(sentence)
  print(verb_number)

NameError: ignored

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.pos_ == "VERB"):
      print("Verb: {}, POS: {}".format(token, token.pos_))
      print("neighboring token: {}".format(token.nbor()))
      print("is_ancestor: {}".format(token.is_ancestor()))
      print(list(token.subtree))
      raw_tokens = []
      for item in token.subtree:
        raw_tokens.append(item.text)
      raw_text = " ".join(raw_tokens)
      # print(raw_tokens)
      print(raw_text)
      print("_________________________________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)


for sentence in sentences_list:
  
  print("sentence: {}".format(sentence))

  subjects_list = []
  objects_list = []
  verbs_list = []

  for token in sentence:
    if(token.dep_ == "nsubj"):
      subjects_list.append(token)
    if(token.pos_ == "VERB"):
      verbs_list.append(token)
    if(token.dep_ == "dobj"):
      objects_list.append(token) 

  for subject in subjects_list:
    for verb in verbs_list:
      subject_is_ancestor_verb = verb.is_ancestor(subject)
      if(subject_is_ancestor_verb):
        print("verb: {}, subject: {}".format(verb,subject))

  for object in objects_list:
    for verb in verbs_list:
      object_is_ancestor_verb = verb.is_ancestor(object)
      if(object_is_ancestor_verb):
        print("verb: {}, object: {}".format(verb,object))

  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "nsubj"):
      print(token, list(token.ancestors))
  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "nsubj"):
      print(token, list(token.subtree))
  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "dobj"):
      print(token, list(token.ancestors))
  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.dep_ == "dobj"):
      print(token, list(token.subtree))
  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.pos_ == "VERB"):
      print(token, list(token.children))
  print("____________________________________________________")

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

doc = nlp(final_text)

sentences_list = list(doc.sents)

for sentence in sentences_list:
  for token in sentence:
    if(token.pos_ == "VERB"):
      print(token, list(token.subtree))
  print("____________________________________________________")