In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [3]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(openai_api_key = OPENAI_API_KEY, model = "gpt-3.5-turbo")


In [4]:
model.invoke("Hello, how are you?")

AIMessage(content="Hello! I'm just a computer program, so I don't have feelings, but I'm here and ready to assist you. How can I help you today?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 13, 'total_tokens': 46, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-d10185ca-cc60-4328-9062-c7b6a36365a6-0', usage_metadata={'input_tokens': 13, 'output_tokens': 33, 'total_tokens': 46, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [5]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser

chain.invoke("Hello, how are you?")

"Hello! I'm just a computer program, so I don't have feelings, but I'm here to help you. How can I assist you today?"

In [6]:
from langchain.prompts import ChatPromptTemplate

template = """Classify the requirement text into one of the following categories by answering only with the category code (F, A, FT, L, LF, MN, O, PE, PO, SC, SE, or US):
- **F (Functional)**: Requirements detailing specific system functionalities or actions.
- **A (Availability)**: Requirements related to system uptime, accessibility, or continuous operation.
- **FT (Fault Tolerance)**: Requirements ensuring the system can handle errors or unexpected failures.
- **L (Legal)**: Requirements concerning compliance with laws, regulations, or industry standards.
- **LF (Look & Feel)**: Requirements about the appearance, design, color schemes, or visual style.
- **MN (Maintainability)**: Requirements on ease of updates, maintenance, or adjustments.
- **O (Operational)**: Requirements on system operations, such as supported platforms or environments.
- **PE (Performance)**: Requirements focused on system speed, response time, or resource efficiency.
- **PO (Portability)**: Requirements related to compatibility across various devices or platforms.
- **SC (Scalability)**: Requirements on the system's ability to handle growth in users or workload.
- **SE (Security)**: Requirements ensuring data security, authorization, or protection from threats.
- **US (Usability)**: Requirements focused on user-friendliness, ease of use, or intuitive design.

Here are some examples of each category:

{examples}

Now classify this new requirement:{requirment}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt.format(requirment="The user interface shall have standard menus  buttons for navigation", examples="LF:The application shall match the color of the schema set forth by Department of Homeland Security")

"Human: Classify the requirement text into one of the following categories by answering only with the category code (F, A, FT, L, LF, MN, O, PE, PO, SC, SE, or US):\n- **F (Functional)**: Requirements detailing specific system functionalities or actions.\n- **A (Availability)**: Requirements related to system uptime, accessibility, or continuous operation.\n- **FT (Fault Tolerance)**: Requirements ensuring the system can handle errors or unexpected failures.\n- **L (Legal)**: Requirements concerning compliance with laws, regulations, or industry standards.\n- **LF (Look & Feel)**: Requirements about the appearance, design, color schemes, or visual style.\n- **MN (Maintainability)**: Requirements on ease of updates, maintenance, or adjustments.\n- **O (Operational)**: Requirements on system operations, such as supported platforms or environments.\n- **PE (Performance)**: Requirements focused on system speed, response time, or resource efficiency.\n- **PO (Portability)**: Requirements re

In [7]:
chain = prompt | model | parser
chain.invoke({
    "requirment": "The user interface shall have standard menus  buttons for navigation",
    "examples": "LF:The application shall match the color of the schema set forth by Department of Homeland Security"
}
)

'US'

In [8]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./data/nfr.txt")
text_document = loader.load()
text_document


[Document(metadata={'source': './data/nfr.txt'}, page_content='PE:The system shall refresh the display every 60 seconds.\nLF:The application shall match the color of the schema set forth by Department of Homeland Security\nUS: If projected  the data must be readable.  On a 10x10 projection screen  90% of viewers must be able to read Event / Activity data from a viewing distance of 30\nA: The product shall be available during normal business hours. As long as the user has access to the client PC  the system will be available 99% of the time during the first six months of operation.\nUS: If projected  the data must be understandable. On a 10x10 projection screen  90% of viewers must be able to determine that Events or Activities are occuring in current time from a viewing distance of 100\nSE:The product shall ensure that it can only be accessed by authorized users.  The product will be able to distinguish between authorized and unauthorized users in all access attempts\nUS:The product sh

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)
documents = text_splitter.split_documents(text_document)

In [10]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
# embedded_query = embeddings.embed_query

In [11]:
len(embeddings.embed_query("Who is Mary's sister?"))

1536

In [12]:
len(documents), documents[0]

(158,
 Document(metadata={'source': './data/nfr.txt'}, page_content='PE:The system shall refresh the display every 60 seconds.\nLF:The application shall match the color of the schema set forth by Department of Homeland Security\nUS: If projected  the data must be readable.  On a 10x10 projection screen  90% of viewers must be able to read Event / Activity data from a viewing distance of 30'))

In [13]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore = DocArrayInMemorySearch.from_documents(documents, embeddings)



In [14]:
from langchain_pinecone import PineconeVectorStore

index_name = "re-index"

pinecone = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)

  from tqdm.autonotebook import tqdm


In [15]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(
    examples=pinecone.as_retriever(), requirment=RunnablePassthrough()
)
chain = (
    setup
    |prompt
    |model
    |parser
)

output = chain.invoke("The user interface shall have standard menus  buttons for navigation")

In [25]:
import pandas as pd

# Read the file
file_path = './data/test.txt'
with open(file_path, 'r') as file:
    lines = file.readlines()

# Initialize lists to store separated data
classes = []
texts = []

# Process each line
for line in lines:
    # Split the line into class and text parts
    if ':' in line:
        class_part, text_part = line.split(":",1)
        classes.append(class_part.strip())
        texts.append(text_part.strip())

# Create a DataFrame from the lists
import pandas as pd

test = pd.DataFrame({
    'class': classes,
    'text': texts,
})

# Display the first few rows to verify
print(test)

   class                                               text
0      F     The system shall display Events or Activities.
1      F  The display shall have two regions:  left 2/3 ...
2      F  The data displayed in both the nodes within th...
3      F  The table side of the display shall be split i...
4      F  The top 1/4 of the table will hold events that...
..   ...                                                ...
65    SE  The Disputes application shall manipulate and ...
66    SE  All updates to data files or database must be ...
67     L  The Disputes application must conform to the l...
68     L  All business rules specified in the Disputes S...
69     L  The Disputes application must conform to the l...

[70 rows x 2 columns]


In [22]:
test.iloc[0]['Text']

'The system shall display Events or Activities.'

In [26]:
import time

output_of_prompt = []

i = 0
while i != len(test):
    try:
        output = chain.invoke(f"{test.iloc[i]['text']}")
        output_of_prompt.append({'i':i,
                                'text':test.iloc[i]['text'],
                                'labels':test.iloc[i]['class'],
                                'prompt_output': output})

        time.sleep(2)
        i += 1
        print(f'{i}/{len(test)} done :)')
    except Exception as e:
        print(f'\n sleep time by 10 seconds for {i}')
        print("Error: ",e)
        time.sleep(10)

1/70 done :)
2/70 done :)
3/70 done :)
4/70 done :)
5/70 done :)
6/70 done :)
7/70 done :)
8/70 done :)
9/70 done :)
10/70 done :)
11/70 done :)
12/70 done :)
13/70 done :)
14/70 done :)
15/70 done :)
16/70 done :)
17/70 done :)
18/70 done :)
19/70 done :)
20/70 done :)
21/70 done :)
22/70 done :)
23/70 done :)
24/70 done :)
25/70 done :)
26/70 done :)
27/70 done :)
28/70 done :)
29/70 done :)
30/70 done :)
31/70 done :)
32/70 done :)
33/70 done :)
34/70 done :)
35/70 done :)
36/70 done :)
37/70 done :)
38/70 done :)
39/70 done :)
40/70 done :)
41/70 done :)
42/70 done :)
43/70 done :)
44/70 done :)
45/70 done :)
46/70 done :)
47/70 done :)
48/70 done :)
49/70 done :)
50/70 done :)
51/70 done :)
52/70 done :)
53/70 done :)
54/70 done :)
55/70 done :)
56/70 done :)
57/70 done :)
58/70 done :)
59/70 done :)
60/70 done :)
61/70 done :)
62/70 done :)
63/70 done :)
64/70 done :)
65/70 done :)
66/70 done :)
67/70 done :)
68/70 done :)
69/70 done :)
70/70 done :)


In [50]:
i = 0
while i != len(output_of_prompt):
    try: 
        if output_of_prompt[i]['prompt_output'][1] == ' ':
            # print(f"old: {output_of_prompt[i]['prompt_output'][0]}")
            output_of_prompt[i]['prompt_output'] = output_of_prompt[i]['prompt_output'][0]
            # print(f"new: {output_of_prompt[i]['prompt_output'][0]}")
        else:
            output_of_prompt[i]['prompt_output'] = output_of_prompt[i]['prompt_output'][0:2]
        i += 1
    except Exception as e:
        i += 1
    

In [51]:
output_of_prompt

[{'i': 0,
  'text': 'The system shall display Events or Activities.',
  'labels': 'F',
  'prompt_output': 'F'},
 {'i': 1,
  'text': 'The display shall have two regions:  left 2/3 of the display is graphical  right 1/3 of the display is a data table',
  'labels': 'F',
  'prompt_output': 'F'},
 {'i': 2,
  'text': 'The data displayed in both the nodes within the graph and the rows in the table are MSEL Summary data',
  'labels': 'F',
  'prompt_output': 'LF'},
 {'i': 3,
  'text': 'The table side of the display shall be split into 2 regions: sequential and temporal.',
  'labels': 'F',
  'prompt_output': 'F'},
 {'i': 4,
  'text': 'The top 1/4 of the table will hold events that are to occur sequentially.',
  'labels': 'F',
  'prompt_output': 'LF'},
 {'i': 5,
  'text': 'The bottom 3/4 of the table will hold events that occur according to its relevance to current time.',
  'labels': 'F',
  'prompt_output': 'F'},
 {'i': 6,
  'text': 'The system shall color code events according to their variance

In [68]:
pred = []
label = []
data  = output_of_prompt
for d in data:
    pred.append(d['prompt_output'])
    label.append(d['labels'])

In [74]:
pred

['F',
 'F',
 'LF',
 'F',
 'LF',
 'F',
 'F',
 'F',
 'LF',
 'LF',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'LF',
 'LF',
 'LF',
 'US',
 'F',
 'F',
 'O',
 'O',
 'F',
 'A',
 'F',
 'F',
 'SE',
 'F',
 'F',
 'F',
 'SE',
 'SE',
 'F',
 'F',
 'L',
 'L',
 'L']

In [75]:
import numpy as np
from collections import defaultdict

def perf_measure(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError("The lengths of y_true and y_pred must match.")

    # Get the unique classes
    classes = np.unique(np.concatenate((y_true, y_pred)))

    # Initialize dictionaries to store results
    metrics = {cls: {"TP": 0, "FP": 0, "TN": 0, "FN": 0} for cls in classes}

    for cls in classes:
        metrics[cls]["TP"] = np.sum((y_true == cls) & (y_pred == cls))
        metrics[cls]["FP"] = np.sum((y_true != cls) & (y_pred == cls))
        metrics[cls]["TN"] = np.sum((y_true != cls) & (y_pred != cls))
        metrics[cls]["FN"] = np.sum((y_true == cls) & (y_pred != cls))

    TP = sum(values["TP"] for values in metrics.values())
    FP = sum(values["FP"] for values in metrics.values())
    TN = sum(values["TN"] for values in metrics.values())
    FN = sum(values["FN"] for values in metrics.values())
    return TP, FP, TN, FN

TP, FP, TN, FN = perf_measure(label, pred)

accuracy = (TP + TN) / (TP + FP + TN + FN)
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print('Accuracy: ', accuracy)
print('Recall: ', recall)
print('Precision: ', precision)
print('F1-score: ', F1)

Accuracy:  0.9428571428571428
Recall:  0.8
Precision:  0.8
F1-score:  0.8000000000000002
