## Loading the API KEYs

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

## Load LLM model

Note: Just pick one of them

---

In [9]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model = "gpt-3.5-turbo")

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(model = 'gemini-pro', temperature = 0.5)

In [2]:
from langchain_mistralai import ChatMistralAI

model = ChatMistralAI(model="mistral-large-latest", temperature=0.1)

In [21]:
# !aws configure
from langchain_aws import ChatBedrock

model = ChatBedrock(model="amazon.titan-text-express-v1" ,
    beta_use_converse_api=True)

In [37]:
from langchain_fireworks import ChatFireworks

model = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-70b-instruct")

---

Test the model

In [38]:
model

ChatFireworks(client=<fireworks.client.chat_completion.ChatCompletionV2 object at 0x7e81d6c74b50>, async_client=<fireworks.client.chat_completion.ChatCompletionV2 object at 0x7e81d75cba90>, model_name='accounts/fireworks/models/llama-v3p1-70b-instruct', model_kwargs={}, fireworks_api_key=SecretStr('**********'))

In [39]:
model.invoke("Hello, how are you?")

AIMessage(content="Hello! I'm just a computer program, so I don't have feelings, but thanks for asking! How can I assist you today?", additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 21, 'total_tokens': 50, 'completion_tokens': 29}, 'model_name': 'accounts/fireworks/models/llama-v3p1-70b-instruct', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None}, id='run-263eff07-2746-4110-8e2a-72762a71af4e-0', usage_metadata={'input_tokens': 21, 'output_tokens': 29, 'total_tokens': 50})

## Prompt template

In [40]:
from langchain.prompts import ChatPromptTemplate

template = """Classify the requirement text into one of the following categories by answering only with the category code (F, A, FT, L, LF, MN, O, PE, PO, SC, SE, or US):
- **F (Functional)**: Requirements detailing specific system functionalities or actions.
- **A (Availability)**: Requirements related to system uptime, accessibility, or continuous operation.
- **FT (Fault Tolerance)**: Requirements ensuring the system can handle errors or unexpected failures.
- **L (Legal)**: Requirements concerning compliance with laws, regulations, or industry standards.
- **LF (Look & Feel)**: Requirements about the appearance, design, color schemes, or visual style.
- **MN (Maintainability)**: Requirements on ease of updates, maintenance, or adjustments.
- **O (Operational)**: Requirements on system operations, such as supported platforms or environments.
- **PE (Performance)**: Requirements focused on system speed, response time, or resource efficiency.
- **PO (Portability)**: Requirements related to compatibility across various devices or platforms.
- **SC (Scalability)**: Requirements on the system's ability to handle growth in users or workload.
- **SE (Security)**: Requirements ensuring data security, authorization, or protection from threats.
- **US (Usability)**: Requirements focused on user-friendliness, ease of use, or intuitive design.

Here are some examples of each category:

{examples}

Now classify this new requirement:{requirment}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt.format(requirment="The user interface shall have standard menus  buttons for navigation", examples="LF:The application shall match the color of the schema set forth by Department of Homeland Security")

"Human: Classify the requirement text into one of the following categories by answering only with the category code (F, A, FT, L, LF, MN, O, PE, PO, SC, SE, or US):\n- **F (Functional)**: Requirements detailing specific system functionalities or actions.\n- **A (Availability)**: Requirements related to system uptime, accessibility, or continuous operation.\n- **FT (Fault Tolerance)**: Requirements ensuring the system can handle errors or unexpected failures.\n- **L (Legal)**: Requirements concerning compliance with laws, regulations, or industry standards.\n- **LF (Look & Feel)**: Requirements about the appearance, design, color schemes, or visual style.\n- **MN (Maintainability)**: Requirements on ease of updates, maintenance, or adjustments.\n- **O (Operational)**: Requirements on system operations, such as supported platforms or environments.\n- **PE (Performance)**: Requirements focused on system speed, response time, or resource efficiency.\n- **PO (Portability)**: Requirements re

Test prompt

In [41]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()
chain = prompt | model | parser
chain.invoke({
    "requirment": "The user interface shall have standard menus  buttons for navigation",
    "examples": "LF:The application shall match the color of the schema set forth by Department of Homeland Security"
}
)

'LF'

## Database

### load the data and format it for database

In [16]:
from langchain.document_loaders import TextLoader

# Load the file
loader = TextLoader("../data/train_data.txt")
text_documents = loader.load()  # This returns a list of Document objects

# Process the documents to reformat
formatted_text = []
for doc in text_documents:  # Iterate through the documents
    # Access the page_content attribute to get the main text
    content = doc.page_content
    for line in content.splitlines():  # Process each line in the document
        # Split the line by commas (assuming it’s CSV-like)
        parts = line.split(',', maxsplit=2)  # Assuming format: number, 'text', category
        if len(parts) == 3:
            category = parts[2].strip()  # Last part is the category
            text = parts[1].strip().strip("'")  # Middle part is the requirement text
            formatted_text.append(f"{category}:{text}")

# Join back into a single formatted string
formatted_output = "\n".join(formatted_text)

# Print or save the output
print(formatted_output)

with open("../data/text_split_format_train_data.txt", "w") as output_file:
    output_file.write(formatted_output)



A:Defect reports will be available to technical units on a 24x7 basis.
A:The system shall achieve 95% up time.
A:The website shall achieve 99.5% up time.
A:The WCS system shall be available at all times as long as IBM\92s internal intranet W3 is accessible. The WCS system will be available 95% of the time IBM\92s internal intranet W3 is available.
A:The MTBF (if any) should not be less than 2 months.
A:Programmers and application developers will have access to source code to address bugs or system enhancements as deemed necessary. Network Administrator and DBA support is also required to maintain a 24x7 system uptime.
A:The product shall adhere to the corporate online availability schedule.  The application is brought down only within 98% of the scheduled outages per the availability schedule.
A:The system shall have high availability every day of the year.The system must be available for use between 12:00AM and 6:00PM all days of the year.
A:The system shall be available 99.5% of the 

In [17]:
loader = TextLoader("../data/text_split_format_train_data.txt")
text_document = loader.load()

chunk the train data for dataset

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)
documents = text_splitter.split_documents(text_document)

### Load Test data

In [13]:
from langchain.document_loaders import TextLoader

# Load the file
loader = TextLoader("../data/test_data.txt")
text_documents = loader.load()  # This returns a list of Document objects

# Process the documents to reformat
formatted_text = []
for doc in text_documents:  # Iterate through the documents
    # Access the page_content attribute to get the main text
    content = doc.page_content
    for line in content.splitlines():  # Process each line in the document
        # Split the line by commas (assuming it’s CSV-like)
        parts = line.split(',', maxsplit=2)  # Assuming format: number, 'text', category
        if len(parts) == 3:
            category = parts[2].strip()  # Last part is the category
            text = parts[1].strip().strip("'")  # Middle part is the requirement text
            formatted_text.append(f"{category}:{text}")

# Join back into a single formatted string
formatted_output = "\n".join(formatted_text)

# Print or save the output
print(formatted_output)

with open("../data/text_split_format_test_data.txt", "w") as output_file:
    output_file.write(formatted_output)

A:The product shall be available 99% of the time. Rationale: To avoid service interruption during busiest customer service response periods. The product shall be available 99.99% of the time for regular business days.
A:The software is available for use from the supermarket opening time to the closing time.
A:The website shall be available for use 24 hours per day  365 days per year.
A:All movies shall be streamed on demand  at any time of the day.
A:The system shall be available for use between the hours of 8am and 6pm.
A:The RFS system should be available 24/7  especially during the budgeting period. The RFS system shall be available 90% of the time all year  and 98% during the budgeting period. 2% of the time  the system will become available within 1 hour of the time that the situation is reported.
A:Aside from server failure  the software product shall achieve 99.99% up time.
F:The system should keep the customer email.
F:The estimator shall not apply recycled parts to the collisi

#### Format Test data

In [13]:
import pandas as pd

# Read the file
file_path = '../data/text_split_format_test_data.txt'
with open(file_path, 'r') as file:
    lines = file.readlines()

# Initialize lists to store separated data
classes = []
texts = []

# Process each line
for line in lines:
    # Split the line into class and text parts
    if ':' in line:
        class_part, text_part = line.split(":",1)
        classes.append(class_part.strip())
        texts.append(text_part.strip())

# Create a DataFrame from the lists
import pandas as pd

test = pd.DataFrame({
    'class': classes,
    'text': texts,
})

# Display the first few rows to verify
print(test)
test.iloc[0]['text']

    class                                               text
0       A  The product shall be available 99% of the time...
1       A  The software is available for use from the sup...
2       A  The website shall be available for use 24 hour...
3       A  All movies shall be streamed on demand  at any...
4       A  The system shall be available for use between ...
..    ...                                                ...
193    US  The system shall be used by realtors with no t...
194    US  The system shall have a help system that offer...
195    US  The system must be intuitive and simple in the...
196    US  The product shall be easy for a realtor to learn.
197    US  The product shall be installed by an untrained...

[198 rows x 2 columns]


'The product shall be available 99% of the time. Rationale: To avoid service interruption during busiest customer service response periods. The product shall be available 99.99% of the time for regular business days.'

### Embedding train data

In [19]:
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = DocArrayInMemorySearch.from_documents(documents, embeddings)



## Save the train data to database(pinecone)

In [27]:
from langchain_pinecone import PineconeVectorStore

index_name = "re-index"

pinecone = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)

### connect to pinecone

In [10]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
index_name = "re-index"
pinecone =PineconeVectorStore(index_name=index_name, embedding=embeddings)

## Chain

In [42]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(
    examples=pinecone.as_retriever(), requirment=RunnablePassthrough()
)
chain = (
    setup
    |prompt
    |model
    |parser
)

output = chain.invoke("The user interface shall have standard menus  buttons for navigation")
output

'US'

In [43]:
import time

output_of_prompt = []

i = 0
while i != len(test):
    try:
        output = chain.invoke(f"{test.iloc[i]['text']}")
        output_of_prompt.append({'i':i,
                                'text':test.iloc[i]['text'],
                                'labels':test.iloc[i]['class'],
                                'prompt_output': output})

        time.sleep(2)
        i += 1
        print(f'{i}/{len(test)} done :)')
    except Exception as e:
        print(f'\n sleep time by 10 seconds for {i}')
        print("Error: ",e)
        time.sleep(10)

1/198 done :)
2/198 done :)
3/198 done :)
4/198 done :)
5/198 done :)
6/198 done :)
7/198 done :)
8/198 done :)
9/198 done :)
10/198 done :)
11/198 done :)
12/198 done :)
13/198 done :)
14/198 done :)
15/198 done :)
16/198 done :)
17/198 done :)
18/198 done :)
19/198 done :)
20/198 done :)
21/198 done :)
22/198 done :)
23/198 done :)
24/198 done :)
25/198 done :)
26/198 done :)
27/198 done :)
28/198 done :)
29/198 done :)
30/198 done :)
31/198 done :)
32/198 done :)
33/198 done :)
34/198 done :)
35/198 done :)
36/198 done :)
37/198 done :)
38/198 done :)
39/198 done :)
40/198 done :)
41/198 done :)
42/198 done :)
43/198 done :)
44/198 done :)
45/198 done :)
46/198 done :)
47/198 done :)
48/198 done :)
49/198 done :)
50/198 done :)
51/198 done :)
52/198 done :)
53/198 done :)
54/198 done :)
55/198 done :)
56/198 done :)
57/198 done :)
58/198 done :)
59/198 done :)
60/198 done :)
61/198 done :)
62/198 done :)
63/198 done :)
64/198 done :)
65/198 done :)
66/198 done :)
67/198 done :)
68/1

In [44]:
i = 0
while i != len(output_of_prompt):
    try: 
        if output_of_prompt[i]['prompt_output'][1] == ' ':
            # print(f"old: {output_of_prompt[i]['prompt_output'][0]}")
            output_of_prompt[i]['prompt_output'] = output_of_prompt[i]['prompt_output'][0]
            # print(f"new: {output_of_prompt[i]['prompt_output'][0]}")
        else:
            output_of_prompt[i]['prompt_output'] = output_of_prompt[i]['prompt_output'][0:2]
        i += 1
    except Exception as e:
        i += 1


In [45]:
output_of_prompt

[{'i': 0,
  'text': 'The product shall be available 99% of the time. Rationale: To avoid service interruption during busiest customer service response periods. The product shall be available 99.99% of the time for regular business days.',
  'labels': 'A',
  'prompt_output': 'A'},
 {'i': 1,
  'text': 'The software is available for use from the supermarket opening time to the closing time.',
  'labels': 'A',
  'prompt_output': 'A'},
 {'i': 2,
  'text': 'The website shall be available for use 24 hours per day  365 days per year.',
  'labels': 'A',
  'prompt_output': 'A'},
 {'i': 3,
  'text': 'All movies shall be streamed on demand  at any time of the day.',
  'labels': 'A',
  'prompt_output': 'A'},
 {'i': 4,
  'text': 'The system shall be available for use between the hours of 8am and 6pm.',
  'labels': 'A',
  'prompt_output': 'A'},
 {'i': 5,
  'text': 'The RFS system should be available 24/7  especially during the budgeting period. The RFS system shall be available 90% of the time all ye

In [46]:
pred_bad = []
label = []
data  = output_of_prompt
for d in data:
    pred_bad.append(d['prompt_output'])
    label.append(d['labels'])

## Post processing

get just category code from response

In [47]:
import re

pred = []

# Process each line
for line in pred_bad:
    # Remove \n and content inside parentheses using regular expressions
    cleaned_line = re.sub(r"\n", "", line)  # Remove newline characters
    cleaned_line = re.sub(r"\s*\(.*?\)", "", cleaned_line)  # Remove content in parentheses
    pred.append(cleaned_line.strip())  # Strip leading and trailing spaces

def extract_categories(pred, categories):
    """
    Extract valid categories from a list of strings.

    :param pred: List of strings containing text and categories
    :param categories: Set of valid categories
    :return: List of extracted valid categories, with 'Invalid' for unmatched items
    """
    filtered_data = []
    for item in pred:
        # Match any valid category as a standalone word
        match = re.findall(r'\b(' + '|'.join(categories) + r')\b', item)
        if match:
            # Add the first matched category (or all matches if needed)
            filtered_data.append(match[0])
        else:
            filtered_data.append("Invalid")  # Add 'Invalid' if no category is found
    return filtered_data


# List of valid categories
categories = {'F', 'A', 'FT', 'L', 'LF', 'MN', 'O', 'PE', 'PO', 'SC', 'SE', 'US'}

# Extract valid categories
filtered_data = extract_categories(pred, categories)

# Verify the lengths
print(f"Length of pred: {len(pred)}")
print(f"Length of filtered_data: {len(filtered_data)}")

# Output the result
pred = filtered_data


Length of pred: 198
Length of filtered_data: 198


In [48]:
pred_bad

['A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'SE',
 'F',
 'F',
 'F',
 'F',
 'LF',
 'PE',
 'F',
 'F',
 'F',
 'F',
 'F',
 'FT',
 'F',
 'F',
 'F',
 'F',
 'F',
 'SE',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'SE',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'SE',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'SE',
 'L',
 'F',
 'SE',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'F',
 'US',
 'F',
 'FT',
 'FT',
 'FT',
 'FT',
 'L',
 'L',
 'L',
 'LF',
 'LF',
 'F',
 'F',
 'F',
 'US',
 'LF',
 'LF',
 'LF',
 'US',
 'PO',
 'US',
 'MN',
 'MN',
 'MN',
 'O',
 'L',
 'O',
 'O',
 'F',
 'O',
 'O',
 'O',
 'PO',
 'LF',
 'A',
 'F',
 'F',
 'O',
 'O',
 'O',
 'PE',
 'PE',
 'PE',
 'PE',
 'SC',
 'PE',
 'O',
 'PE',
 'PE',
 'PE',
 'PE',
 'PE',
 'PE',
 'SC',
 'PO',
 'O',
 'PO',
 'PO',
 'SC',
 'SC',
 'SC',
 'SC',
 'SE',
 'SE',
 '

## Result

In [49]:
import numpy as np
from collections import defaultdict

def perf_measure(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError("The lengths of y_true and y_pred must match.")

    # Get the unique classes
    classes = np.unique(np.concatenate((y_true, y_pred)))

    # Initialize dictionaries to store results
    metrics = {cls: {"TP": 0, "FP": 0, "TN": 0, "FN": 0} for cls in classes}

    for cls in classes:
        metrics[cls]["TP"] = np.sum((y_true == cls) & (y_pred == cls))
        metrics[cls]["FP"] = np.sum((y_true != cls) & (y_pred == cls))
        metrics[cls]["TN"] = np.sum((y_true != cls) & (y_pred != cls))
        metrics[cls]["FN"] = np.sum((y_true == cls) & (y_pred != cls))

    TP = sum(values["TP"] for values in metrics.values())
    FP = sum(values["FP"] for values in metrics.values())
    TN = sum(values["TN"] for values in metrics.values())
    FN = sum(values["FN"] for values in metrics.values())
    return TP, FP, TN, FN

TP, FP, TN, FN = perf_measure(label, pred)

accuracy = (TP + TN) / (TP + FP + TN + FN)
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print('Accuracy: ', accuracy)
print('Recall: ', recall)
print('Precision: ', precision)
print('F1-score: ', F1)

Accuracy:  0.9713804713804713
Recall:  0.8282828282828283
Precision:  0.8282828282828283
F1-score:  0.8282828282828283
