<a href="https://colab.research.google.com/github/RajivDalal/RajivDalal/blob/main/notebooks/TestCaseNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy gensim
!pip install tabulate

In [9]:
!git clone https://github.com/Nidhig19/NLP.git

Cloning into 'NLP'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 84 (delta 17), reused 61 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (84/84), 101.36 KiB | 12.67 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [32]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [33]:
import spacy, gensim
from tabulate import tabulate
from spacy import displacy
nlp = spacy.load('en_core_web_md')



# Data Preprocessing

In [10]:
#Import and read file
with open('NLP/data/sample.txt') as file:
    sample = file.read()
text = nlp(sample)
sentence_spans = list(text.sents)

In [11]:
for sentence in sentence_spans:
    print(sentence)

As a UI designer, I want to report to the Agencies about user testing, so that they are aware of their contributions to making Broker a better UX.

As a Researcher, I want an app that create proxy Data Packages for well know and reliable data, sources, so that I can load high quality data using Data Package tooling. 

As a participant, I want to change my estimate as long as the draw has not been completed, so that I can change my mind.

As a depositor, I want to have metadata automatically filled from other University systems and remembered from previous deposits, so that I don't have to waste time reentering the same information.



In [12]:
#Removing punctuations, stop words, whitespaces
sentence_tokens=[]
for i,sentence in enumerate(sentence_spans):
    filtered_text=[token for token in sentence if not token.is_punct and not token.is_stop and not token.is_space]
    sentence_tokens.append(filtered_text)
    print(i+1,filtered_text)

1 [UI, designer, want, report, Agencies, user, testing, aware, contributions, making, Broker, better, UX]
2 [Researcher, want, app, create, proxy, Data, Packages, know, reliable, data, sources, load, high, quality, data, Data, Package, tooling]
3 [participant, want, change, estimate, long, draw, completed, change, mind]
4 [depositor, want, metadata, automatically, filled, University, systems, remembered, previous, deposits, waste, time, reentering, information]


# Input, Action and Condition Words


In [35]:
from spacy.tokens import Doc

In [62]:
def custom_dep_tree(span, main_verb_index):
  words = [token.text for token in span]
  new_doc = Doc(span.vocab, words=words)

  for i, token in enumerate(span):
    new_doc[i].pos_ = token.pos_
    new_doc[i].tag_ = token.tag_
    new_doc[i].dep_ = token.dep_
    head_index = min(token.head.i - span[0].i, len(new_doc) - 1)
    new_doc[i].head = new_doc[head_index]

  if main_verb_index is not None:
    # Set the main verb as root
    new_doc[main_verb_index].dep_ = "ROOT"
    new_doc[main_verb_index].head = new_doc[main_verb_index]

  # Adjust dependencies for other tokens
  for token in new_doc:
      if token.i != main_verb_index and token.dep_ == "ROOT":
          token.dep_ = "dep"
          token.head = new_doc[main_verb_index]

  return new_doc
res = []
for id,tokens in enumerate(sentence_tokens):
  main_verb_index = None
  results = {}
  for i, token in enumerate(tokens):

    if token.lemma_ == "want":
      for j, next_token in enumerate(tokens[i:], start=i):
        if (next_token.pos_ == "VERB" and
          next_token.lemma_ not in ["want", "be", "have"] and
          not next_token.dep_ == "aux"):
          main_verb_index = j
          break

      if main_verb_index:
        break

  if main_verb_index is not None:

    main_verb_text = tokens[main_verb_index].text
    orig_main_verb_index = next(
        (i for i, token in enumerate(sentence_spans[id])
         if token.text == main_verb_text),
        None
    )

    modified_doc = custom_dep_tree(sentence_spans[id], orig_main_verb_index)
    main_verb = modified_doc[orig_main_verb_index]
    # print(main_verb)

    role = None
    for token in tokens:
      if token.text.lower() == "as" and token.i + 2 < len(tokens):
        role = tokens[token.i + 2].text
        break

  results = {
          'main_action': main_verb.text,
          'role': role or "Unknown",
          'original_text': sentence_spans[id],
          'filtered_text':tokens,
          'main_verb_index': main_verb_index,
          'dependencies': [(token.text, token.dep_, token.head.text)
                          for token in modified_doc if token.i != main_verb_index]
      }
  res.append(results)
  print(results)

{'main_action': 'report', 'role': 'Unknown', 'original_text': As a UI designer, I want to report to the Agencies about user testing, so that they are aware of their contributions to making Broker a better UX.
, 'filtered_text': [UI, designer, want, report, Agencies, user, testing, aware, contributions, making, Broker, better, UX], 'main_verb_index': 3, 'dependencies': [('As', 'prep', 'want'), ('a', 'det', 'designer'), ('UI', 'compound', 'designer'), (',', 'punct', 'want'), ('I', 'nsubj', 'want'), ('want', 'dep', 'report'), ('to', 'aux', 'report'), ('report', 'ROOT', 'report'), ('to', 'prep', 'report'), ('the', 'det', 'Agencies'), ('Agencies', 'pobj', 'to'), ('about', 'prep', 'report'), ('user', 'compound', 'testing'), ('testing', 'pobj', 'about'), (',', 'punct', 'want'), ('so', 'mark', 'are'), ('that', 'mark', 'are'), ('they', 'nsubj', 'are'), ('are', 'advcl', 'want'), ('aware', 'acomp', 'are'), ('of', 'prep', 'aware'), ('their', 'poss', 'contributions'), ('contributions', 'pobj', 'of'

# Dependency Parsing

In [None]:
for i,tokens in enumerate(sentence_tokens):
    data = []
    #displacy.render(sentence_spans[i], style="dep")
    for token in tokens:
        data.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_])

    # Print the table
    headers = ["Text", "Lemma", "POS", "Tag", "Dependency"]
    print(tabulate(data, headers=headers, tablefmt="fancy_grid"))

In [None]:
import spacy

def extract_main_actions(user_stories):
    nlp = spacy.load("en_core_web_sm")
    results = {}

    for story in user_stories:
        doc = nlp(story)

        # Find the "want to" pattern and get the main verb that follows
        main_verb = None
        for i, token in enumerate(doc):
            # Look for the pattern "want to [VERB]" or "want [VERB]"
            if token.lemma_ == "want":
                # Look ahead for the main verb
                for next_token in doc[i:]:
                    if (next_token.pos_ == "VERB" and
                        next_token.lemma_ not in ["want", "be", "have"] and
                        not next_token.dep_ == "aux"):
                        main_verb = next_token
                        break

        if main_verb:
            # Find the role (usually after "As a")
            role = None
            for token in doc:
                if token.text.lower() == "as" and token.i + 2 < len(doc):
                    role = doc[token.i + 2].text
                    break

            # Get the surrounding context
            context = {
                'main_action': main_verb.lemma_,
                'original_text': story,
                'role': role or "Unknown",
                'object': [token.text for token in main_verb.children if token.dep_ in ["dobj", "pobj"]],
                'sentence_structure': [(token.text, token.dep_, token.pos_) for token in doc]
            }
            results[story] = context

    return results

# Example usage
user_stories = [
    "As a UI designer, I want to report to the Agencies about user testing, so that they are aware of their contributions to making Broker a better UX.",
    "As a Researcher, I want an app that create proxy Data Packages for well know and reliable data, sources, so that I can load high quality data using Data Package tooling.",
    "As a participant, I want to change my estimate as long as the draw has not been completed, so that I can change my mind.",
    "As a depositor, I want to have metadata automatically filled from other University systems and remembered from previous deposits, so that I don't have to waste time reentering the same information."
]

# Extract and print results
results = extract_main_actions(user_stories)

# Print formatted results
for story, info in results.items():
    print("\nUser Story Analysis:")
    print(f"Role: {info['role']}")
    print(f"Main Action: {info['main_action']}")
    print(f"Object: {', '.join(info['object']) if info['object'] else 'None'}")
    print("-" * 50)

In [None]:
uid = 1
for tokens in sentence_tokens:
    print(tokens)
    action = ""
    inputs = []
    condns = []
    for token in tokens:
        if token.dep_ in ["xcomp","relcl"] and token.head.dep_ == "ROOT":
            action = token.text
            break

        if token.dep_ in ["dobj","pobj","attr"]:
            inputs.append(token.text)
        if token.dep_ == "compound" and token.head.dep_ in ["dobj","pobj","attr"]:
            inputs.append(token.text+" "+token.head.text)

        if token.dep_ in ["advcl","ccomp"]:
            condns.append(" ".join([child.text for child in token.subtree]))

    result = {
        "user_story_id": uid,
        "action": action,
        "inputs": inputs,
        "conditions": condns,
    }
    print(result,"\n")
    uid+=1

[UI, designer, want, report, Agencies, user, testing, aware, contributions, making, Broker, better, UX]
{'user_story_id': 1, 'action': 'report', 'inputs': ['UI designer', 'designer'], 'conditions': []} 

[Researcher, want, app, create, proxy, Data, Packages, know, reliable, data, sources, load, high, quality, data, Data, Package, tooling]
{'user_story_id': 2, 'action': '', 'inputs': ['Researcher', 'app', 'proxy Packages', 'Data Packages', 'Packages', 'know', 'data', 'quality data', 'data', 'Package tooling', 'tooling'], 'conditions': ['so that I can load high quality data using Data Package tooling']} 

[participant, want, change, estimate, long, draw, completed, change, mind]
{'user_story_id': 3, 'action': 'change', 'inputs': ['participant'], 'conditions': []} 

[depositor, want, metadata, automatically, filled, University, systems, remembered, previous, deposits, waste, time, reentering, information]
{'user_story_id': 4, 'action': '', 'inputs': ['depositor', 'metadata', 'University s

# Test Case Generation using Llama

In [None]:
!pip install accelerate bitsandbytes transformers
!pip install torch
!pip install python-dotenv



In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

In [None]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM, BitsAndBytesConfig, pipeline
