# Analysing the redundancies of User Stories - Just the User Stories

In [35]:
import locale
import os
from itertools import combinations

import pandas as pd
import tiktoken
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Alignment, Font
from openpyxl.utils import get_column_letter as utils_get_column_letter
#from openpyxl.worksheet.worksheet import Worksheet

from support_functions.excel_helper import save_to_excel
from support_functions.load_data import load_datasets_with_out_annotations as loading
from support_functions.request_handler import process_user_stories, process_user_stories_parallel
from support_functions.time_recorder import TimeRecorder

In [36]:
# German number formatting is used for the grouped token count printed later.
# Locale names are platform dependent, so an unavailable locale only degrades
# the formatting instead of aborting the whole notebook.
try:
    locale.setlocale(locale.LC_ALL, 'de_DE')
except locale.Error:
    print("Locale 'de_DE' is not available; keeping the current locale.")

# Model identifier taken from the environment; None when MODEL_VERSION is unset.
MODEL_VERSION_NAME = os.getenv('MODEL_VERSION')
# THREADING is an integer flag ("0" / "1"); an unset or empty variable means off.
THREADING = bool(int(os.getenv('THREADING') or '0'))
print(f"Threading is used: {THREADING}")
print(MODEL_VERSION_NAME)

Threading is used: False
gpt-3.5-turbo


In [37]:
# Timers of all processed datasets, keyed by the dataset id (e.g. "#G02#").
time_recorders: dict[str, TimeRecorder] = {}
# Timer of the dataset currently being processed; rebound before every run.
time_recorder: TimeRecorder = None

### Load all User Stories from the different packages

In [38]:
# Load all datasets through the project loader; the cells below look them up by ids such as "#G02#".
datasets: dict[str, list] = loading()

### Data transformation to pandas DataFrames and NumPy arrays.

In [39]:
def transform_data_id_text(data: list):
    """Build a DataFrame with the columns 'ID' and 'User Story'.

    Every element of ``data`` is read through ``entry['id']`` and
    ``entry['text']``; the row order follows the order of ``data``.
    """
    return pd.DataFrame(
        [{'ID': entry['id'], 'User Story': entry['text']} for entry in data]
    )

In [40]:
# n(n-1) / 2
def transform_pairwise(df: pd.DataFrame):
    """Pair every row of ``df`` with every later row.

    Columns are read by position (0 = id, 1 = story text). For n input rows
    the result has n * (n - 1) / 2 rows with the columns 'First ID',
    'First User Story', 'Second ID' and 'Second User Story'.
    """
    positions = range(len(df.index))
    return pd.DataFrame([
        {
            'First ID': df.iat[first, 0],
            'First User Story': df.iat[first, 1],
            'Second ID': df.iat[second, 0],
            'Second User Story': df.iat[second, 1],
        }
        for first, second in combinations(positions, 2)
    ])

In [41]:
def template_request_two_user_stories(
    current_message: list[dict],
    idx: int,
    pairs: pd.DataFrame,
):
    """Append the user request for the story pair in row ``idx``.

    ``pairs`` is read by position: columns 0 and 2 hold the ids, columns 1
    and 3 the story texts. ``current_message`` is extended in place and
    nothing is returned.
    """
    first_id, first_text, second_id, second_text = (
        pairs.iat[idx, column] for column in range(4)
    )
    content = (
        "Yes. Please, process the following pair of user stories:\n"
        f"id: {str(first_id)}, describtion:  {first_text}\n"
        f"id: {str(second_id)}, describtion: {second_text}"
    )
    current_message.append({"role": "user", "content": content})

In [42]:
def sort_threaded_results(to_sort: dict) -> None:
    """Sort every result list of ``to_sort`` in place.

    Each value of ``to_sort`` is a list of result dicts whose
    'relatedStories' entry holds the two story ids. The lists are ordered
    numerically by the first id and then by the second id. ``list.sort`` is
    used because the function returns None: the caller only sees the
    reordering through the lists it passed in.
    """
    for results in to_sort.values():
        results.sort(
            key=lambda result: (
                int(result['relatedStories'][0]),
                int(result['relatedStories'][1]),
            )
        )

## Request definition for User Stories

In [43]:
# Natural-language definitions that are interpolated into the prompt messages built further below.
# They are runtime text sent to the model: any wording change alters the prompt.
definition_main_part: str = "The main part of a User Story describes the core action that a persona wishes to accomplish. For example, the main part of the user story 'As a UI designer, I want to begin user testing, so that I can validate stakeholder UI improvement requests' is 'As a UI designer, I want to begin user testing'. This part specifies the main functionality or feature the story aims to deliver."
definition_benefit: str = "A benefit in a User Story refers to the positive outcome or advantage that a user or stakeholder gains from the functionality described in the User Story. It directly addresses user needs and enhances the value of the software product, providing a clear motive for the user action, like easing a task or improving efficiency. We use the example from before which was 'As a UI designer, I want to begin user testing, so that I can validate stakeholder UI improvement requests'. Here we determine the benefit part, which is 'so that I can validate stakeholder UI improvement request'. Hence, a benefit starts with the phrase 'so that'."
definition_general_redundancy: str = "Redundancy in a User Story refers to the unnecessary repetition or duplication of information within the story's description. Such redundancies can cloud the core purpose or value of the feature, leading to potential confusion or misinterpretation by the development team. It's crucial to remove redundancies to maintain clarity and focus in development."
definition_main_part_redundancy: str = "A main part redundancy in User Stories refers to unnecessary duplication in the description of the action or functionality that the persona aims to achieve, excluding persona repetitions and benefits. This type of redundancy occurs when similar actions or functionalities are described in multiple stories without adding distinct value or clarity, potentially leading to confusion or inefficiencies in the development process. Thus, a main part redundancy occurs when a pair of two User Stories have a redundant main part."
definition_benefit_redundancy: str = "Benefit redundancy in User Stories occurs when multiple stories repeatedly emphasize the same positive outcomes of similar software features. This duplication can lead to inefficiencies as the same advantages are stated multiple times, potentially diluting the impact and clarity of user benefits in the product documentation. Thus, a benefit redundancy occurs when a pair of two User Stories have a redundant benefit part."

In [44]:
### Maybe add a field for explanation
### Define that the first entry of the referenceToOriginalText has to be from the first user story and the second from the second
### Add a description why it is redundant

# Example of the answer format that is embedded into the prompt.
# Apart from the two `//` hint lines (which tell the model where further
# entries may follow) the text is valid JSON: quotes inside a description
# are written as single quotes so that no string value is terminated early.
definition_json_format_pair_user_stories: str = '''
{
    "relatedStories": [1, 3],
    "redundantMainPart": true, 
    "mainPartRedundancies": [
        {
            "reasonDescription": "The redundancy arises from the repetition of the concept of user verification expressed in slightly different ways for emphasis or to cater to different audiences.",
            "referenceToOriginalText": ["As a User, I want to verify myself", "As an application User, I want to verify that I am who I am"]
        },
        {
            "reasonDescription": "The redundancy here stems from the repetition of the testing action and the tested entity, expressed in slightly different ways, likely for emphasis or to accommodate different perspectives",
            "referenceToOriginalText": ["As a software tester, I want to begin user testing", "As a tester, I want to begin testing the software"]
        }
        // Additional main part redundancies can be added here
    ],
    "redundantBenefit": true, 
    "benefitRedundancies": [
        {
            "reasonDescription": "The redundancy in these sentences lies in the repetition of the action ('login') and the destination ('webpage' and 'website'), which convey the same idea using slightly different wording.",
            "referenceToOriginalText": ["I can login into the webpage.", "I could login into the website"]
        },
        {
            "reasonDescription": "The redundancy occurs due to the repetition of the action ('print') expressed in slightly different ways, conveying the same meaning of initiating the printing process.",
            "referenceToOriginalText": ["I can print a document", "I can give the order to print"]
        }
        // Additional benefit redundancies can be added here
    ]
}
'''

### Request definition to ChatGPT

In [45]:
# Conversation preamble that is passed to every processing call below.
# It is kept as a list so that it could later serve as context for a context-aware system.
message: list[dict] = []

# Initial user request: sets the task of the model.
user_inital_message: dict = {
  "role": "user",
  "content": "Act as a Requirements Engineer focused on identifying redundancies. Please review pairs of two User Stories and pinpoint any unnecessary duplications that obscure clarity or add no distinct value."
}
message.append(user_inital_message)

# Reply text that was generated by ChatGPT and is replayed as part of the preamble.
# NOTE(review): the original comment says this is marked as assistant, but the role
# actually sent is "system" - confirm which role is intended before changing it.
system_is_r_eng: str = ("As a requirements engineer in agile development, it is my responsibility to review user stories for redundancies. My goal is to identify and report any overlapping or duplicate requirements." 
" By carefully analysing the user stories in depth, I ensure that each requirement is necessary and contributes uniquely, increasing the coherence of the product.")
assistent_initial_message: dict = {"role": "system", "content": system_is_r_eng}
message.append(assistent_initial_message)

# User turn that introduces the definitions of the main part and of the benefit.
# NOTE(review): the first two literals are joined without punctuation, so the prompt
# reads "... two given User Stories Note that ..." - a sentence break seems to be missing.
user_defining_benefit: dict = {"role": "user", "content": "Please, analyse redundancies in the main part and benefit of a pair of two given User Stories"
    " Note that a User Story may include multiple redundancies in the main part as well as the benefit. The redundancies of the main part and benefit are disjoint sets."
    " Hence, a main part can be redundant while a benefit is not and vice versa. However, in some cases the main part and the benefit can be at the same time redundant, but they do not depend on each other and therefore they are independent redundant." 
    " The definition of the main part is: "
    f"\"{definition_main_part}\" and the definition of a benefit is as follows: \"{definition_benefit}\""}
message.append(user_defining_benefit)

# Reply text generated by ChatGPT, replayed with the role "system".
# NOTE(review): the original comment calls this an assistant message - confirm the intended role.
system_is_r_eng_defintion_aware: str = ("I will analyse redundancies in the main parts and benefits of two User Stories. Each story might include multiple redundancies."
" The main part typically describes the desired functionality by the persona, while the benefit details the positive outcomes from the functionality.")
message.append({"role":"system","content":system_is_r_eng_defintion_aware})

# User turn that supplies the general, main part and benefit redundancy definitions.
user_defining_redundancy: dict = {"role": "user", "content":f"Please analyse the User Stories considering the general definition of redundancy: {definition_general_redundancy} "
    f"Additionally, take into account the specific redundancy in the main part: \"{definition_main_part_redundancy}\" "
    f"and the redundancy in benefits: \"{definition_benefit_redundancy}\" Can you do that?"}
message.append(user_defining_redundancy)

# Reply text generated by ChatGPT, replayed with the role "system" (see the review note above).
system_is_r_eng_defintion_aware_redundancy: str = "I'll review the User Stories for redundancies in main parts and benefits, using the specified definitions. Ready to proceed?"
message.append({"role":"system", "content": system_is_r_eng_defintion_aware_redundancy})

# User turn that explains the expected output format.
# The field names in the explanation are exactly the keys of the JSON example in
# `definition_json_format_pair_user_stories` ('reasonDescription',
# 'redundantBenefit', ...), so that the example and its description cannot
# contradict each other. The numbered items start on their own line, and the
# benefit items (5.x) talk about benefit redundancies only.
user_defining_return_json_format: dict = {
    "role": "user",
    "content": (
        f"Before we proceed, consider the following JSON output format: {definition_json_format_pair_user_stories}."
        " This JSON structure organizes information about redundancies in User Stories, focusing on both the main parts and the benefits."
        " Each section includes descriptions of the redundancies and specific text references to illustrate where these redundancies occur within the stories. \n"
        "1.) The array 'relatedStories' contains two integer values. The values are the ids of the two User Stories and this field is mandatory. \n"
        "2.) The field 'redundantMainPart' can be true or false. true indicates that a pair of User Stories has at least one main part redundancy and otherwise false. It is a mandatory field. \n"
        "3.) The array 'mainPartRedundancies' contains no, one or more objects, each representing a specific redundancy found within the main parts of User Stories."
        " When no main part redundancy is found, it contains no objects and thus it is empty."
        " In the case only one main part redundancy is found, it contains one entry."
        " The last case is that multiple main part redundancies are found. Then it contains more than one entry."
        " This array is mandatory to be filled when the field 'redundantMainPart' is true. \n"
        "3.1.) The field 'reasonDescription' should contain a brief description, typically a sentence or two, explaining the reason for the main part redundancy. \n"
        "3.2.) The array 'referenceToOriginalText' holds the exact text from the User Story pairs that demonstrates the redundancy.  Thus, the array has two entries."
        " The first entry is from the first user story and the second entry is from the second user story. \n"
        "4.) The field 'redundantBenefit' can be true or false. true indicates that a pair of User Stories has at least one benefit redundancy and otherwise false. It is a mandatory field. \n"
        "5.) The array 'benefitRedundancies' is similar to 'mainPartRedundancies' but does not contain the same content."
        " The main part and benefit redundancies are disjoint, therefore a benefit redundancy can exist while a main part redundancy does not exist or exists."
        " This array details redundancies found in the benefits described by the User Story pairs."
        " When no benefit redundancy is found, it contains no objects and thus it is empty."
        " In the case only one benefit redundancy is found, it contains one entry."
        " The last case is that multiple benefit redundancies are found. Then it contains more than one entry."
        " This array is mandatory to be filled when the field 'redundantBenefit' is true. \n"
        "5.1.) The field 'reasonDescription' should contain a brief description, typically a sentence or two, explaining the reason for the benefit redundancy. \n"
        "5.2.) The array 'referenceToOriginalText' lists the text from the stories that highlight the redundant benefits, helping to pinpoint where exactly these redundancies occur."
        " Thus, the array has two entries. The first entry is from the first user story and the second entry is from the second user story. \n"
        "Are you ready to analyse redundancies in User Stories, focusing on main parts and benefits, and return the findings in JSON format?"
    ),
}
message.append(user_defining_return_json_format)

# Scripted confirmation that closes the preamble; it is sent with the role "system".
# NOTE(review): if this is meant to be a model reply, the role "assistant" may be intended - confirm.
message.append({"role":"system", "content": "I've noted the JSON output format specified. I am ready to analyse redundancies in User Stories, focusing on the main parts and benefits. Shall we begin?"})


In [46]:
# Print the conversation preamble and report how many tokens it occupies.
for entry in message:
    print(entry["content"])
message_text: str = "".join(entry["content"] for entry in message)

enc = tiktoken.get_encoding('cl100k_base')
# encode() returns the list of token ids; the size of the prompt is the
# LENGTH of that list. Adding up the ids themselves yields a meaningless
# number, which is why the figure never matched the online tokenizer.
token_ids = enc.encode(message_text)
token_size: int = len(token_ids)
print("-" * 3 + "Token" + "-" * 3)
print(token_ids)
print("-" * 3 + "Sum of Tokens" + "-" * 3)
# NOTE(review): only the message contents are counted; any per-message overhead
# added by the chat API is not included - confirm if an exact figure is needed.
print("The total token sum is: " + locale.format_string("%d", token_size, grouping=True))
    

Act as a Requirements Engineer focused on identifying redundancies. Please review pairs of two User Stories and pinpoint any unnecessary duplications that obscure clarity or add no distinct value.
As a requirements engineer in agile development, it is my responsibility to review user stories for redundancies. My goal is to identify and report any overlapping or duplicate requirements. By carefully analysing the user stories in depth, I ensure that each requirement is necessary and contributes uniquely, increasing the coherence of the product.
Please, analyse redundancies in the main part and benefit of a pair of two given User Stories Note that a User Story may include multiple redundancies in the main part as well as the benefit. The redundancies of the main part and benefit are disjoint sets. Hence, a main part can be redundant while a benefit is not and vice versa. However, in some cases the main part and the benefit can be at the same time redundant, but they do not depend on each 

## Data processing for: G02 federal funding

In [47]:
# G02: select the dataset by its id, tabulate the stories and build all story pairs.
ID_G02: str = "#G02#"
data_g02: list = datasets[ID_G02]
df_g02: pd.DataFrame = transform_data_id_text(data_g02)
df_g02_pairs: pd.DataFrame = transform_pairwise(df_g02)

In [48]:
# Show the G02 story table as cell output.
# `display` is not imported in this file - presumably the built-in provided by IPython/Jupyter.
display(df_g02)

Unnamed: 0,ID,User Story
0,215,"As a Data user, I want to have the 12-19-2017 ..."
1,216,"As a UI designer, I want to redesign the Resou..."
2,217,"As a UI designer, I want to report to the Agen..."
3,218,"As a UI designer, I want to move on to round 2..."
4,220,"As a UI designer, I want to move on to round 3..."
...,...,...
90,308,"As a FABS user, I want to have my validations ..."
91,309,"As a FABS user, I want to see correct status l..."
92,310,"As an agency user, I want to know when the sub..."
93,311,"As an agency user, I want a landing page to na..."


In [49]:
# Show the pairwise G02 table as cell output.
display(df_g02_pairs)

Unnamed: 0,First ID,First User Story,Second ID,Second User Story
0,215,"As a Data user, I want to have the 12-19-2017 ...",216,"As a UI designer, I want to redesign the Resou..."
1,215,"As a Data user, I want to have the 12-19-2017 ...",217,"As a UI designer, I want to report to the Agen..."
2,215,"As a Data user, I want to have the 12-19-2017 ...",218,"As a UI designer, I want to move on to round 2..."
3,215,"As a Data user, I want to have the 12-19-2017 ...",220,"As a UI designer, I want to move on to round 3..."
4,215,"As a Data user, I want to have the 12-19-2017 ...",221,"As a Developer , I want to be able to log bett..."
...,...,...,...,...
4460,309,"As a FABS user, I want to see correct status l...",311,"As an agency user, I want a landing page to na..."
4461,309,"As a FABS user, I want to see correct status l...",312,"As an agency user, I want to submit my data el..."
4462,310,"As an agency user, I want to know when the sub...",311,"As an agency user, I want a landing page to na..."
4463,310,"As an agency user, I want to know when the sub...",312,"As an agency user, I want to submit my data el..."


### Process and Store the User Stories persistently

In [50]:
# Run the redundancy analysis for G02 with a fresh timer that is also
# registered under the dataset id for the timing summary.
time_recorders[ID_G02] = time_recorder = TimeRecorder()
run_g02 = process_user_stories_parallel if THREADING else process_user_stories
run_g02(message, df_g02_pairs, ID_G02, MODEL_VERSION_NAME, time_recorder=time_recorder)
if THREADING:
    # Only the threaded run reports its duration directly.
    print(time_recorder.milliseconds)

## Data processing for: G03 loudoun

In [51]:
# G03: select the dataset by its id, tabulate the stories and build all story pairs.
ID_G03: str = "#G03#"
data_g03: list = datasets[ID_G03]
df_g03: pd.DataFrame = transform_data_id_text(data_g03)
df_g03_pairs: pd.DataFrame = transform_pairwise(df_g03)

In [52]:
# Preview the first rows of the G03 story table.
df_g03.head()

Unnamed: 0,ID,User Story
0,315,"As a Public User, I want to Search for Informa..."
1,316,"As a ProspectiveApplicant, I want to research ..."
2,317,"As an Applicant, I want to Request PreApplicat..."
3,318,"As a Customer, I want to Create a Customer Por..."
4,319,"As an Applicant, I want to Submit Application,..."


In [53]:
# Show the pairwise G03 table as cell output.
display(df_g03_pairs)

Unnamed: 0,First ID,First User Story,Second ID,Second User Story
0,315,"As a Public User, I want to Search for Informa...",316,"As a ProspectiveApplicant, I want to research ..."
1,315,"As a Public User, I want to Search for Informa...",317,"As an Applicant, I want to Request PreApplicat..."
2,315,"As a Public User, I want to Search for Informa...",318,"As a Customer, I want to Create a Customer Por..."
3,315,"As a Public User, I want to Search for Informa...",319,"As an Applicant, I want to Submit Application,..."
4,315,"As a Public User, I want to Search for Informa...",320,"As an Applicant, I want to Submit Supporting D..."
...,...,...,...,...
1591,369,"As a County Staff Member, I want to create a c...",371,"As a Technical Staff member, I want to update ..."
1592,369,"As a County Staff Member, I want to create a c...",372,"As a Staff member, I want to Send or Post Cita..."
1593,370,"As a Staff member, I want to create each condo...",371,"As a Technical Staff member, I want to update ..."
1594,370,"As a Staff member, I want to create each condo...",372,"As a Staff member, I want to Send or Post Cita..."


### Process and Store the User Stories persistently

In [54]:
# Run the redundancy analysis for G03 with a fresh timer that is also
# registered under the dataset id for the timing summary.
time_recorders[ID_G03] = time_recorder = TimeRecorder()
run_g03 = process_user_stories_parallel if THREADING else process_user_stories
run_g03(message, df_g03_pairs, ID_G03, MODEL_VERSION_NAME, time_recorder=time_recorder)

## Data processing for: G04 recycling

In [55]:
# G04: select the dataset by its id, tabulate the stories and build all story pairs.
ID_G04: str = "#G04#"
data_g04: list = datasets[ID_G04]
df_g04: pd.DataFrame = transform_data_id_text(data_g04)
df_g04_pairs: pd.DataFrame = transform_pairwise(df_g04)

In [56]:
# Preview the first rows of the G04 story table.
df_g04.head()

Unnamed: 0,ID,User Story
0,164,"As a user, I want to click on the address, so ..."
1,165,"As a user, I want to be able to anonymously vi..."
2,166,"As a user, I want to be able to enter my zip c..."
3,167,"As a user, I want to be able to get the hours ..."
4,168,"As a user, I want to have a flexible pick up t..."


In [57]:
# Show the pairwise G04 table as cell output.
display(df_g04_pairs)

Unnamed: 0,First ID,First User Story,Second ID,Second User Story
0,164,"As a user, I want to click on the address, so ...",165,"As a user, I want to be able to anonymously vi..."
1,164,"As a user, I want to click on the address, so ...",166,"As a user, I want to be able to enter my zip c..."
2,164,"As a user, I want to click on the address, so ...",167,"As a user, I want to be able to get the hours ..."
3,164,"As a user, I want to click on the address, so ...",168,"As a user, I want to have a flexible pick up t..."
4,164,"As a user, I want to click on the address, so ...",169,"As a user, I want to be able to select differe..."
...,...,...,...,...
1270,211,"As a user, I want to be able to browse through...",213,"As a recyclingfacility representative, I want ..."
1271,211,"As a user, I want to be able to browse through...",214,"As a recyclingfacility, I want to be able to c..."
1272,212,"As a recyclingfacility representative, I want ...",213,"As a recyclingfacility representative, I want ..."
1273,212,"As a recyclingfacility representative, I want ...",214,"As a recyclingfacility, I want to be able to c..."


### Process and Store the User Stories persistently

In [58]:
# Run the redundancy analysis for G04 with a fresh timer that is also
# registered under the dataset id for the timing summary.
time_recorders[ID_G04] = time_recorder = TimeRecorder()
run_g04 = process_user_stories_parallel if THREADING else process_user_stories
run_g04(message, df_g04_pairs, ID_G04, MODEL_VERSION_NAME, time_recorder=time_recorder)

## Save total Speeds

In [59]:
# Name of the worksheet the timing table is read from and written to.
SHEET_NAME: str = 'Time Consup. Anlys.S.'

In [60]:
# Directory in which the result files "<NN>_<dataset>.json" of this model are looked up.
base_path: str = os.getcwd()
path_to_file = os.path.join(base_path, "results", "redundancy-model-" + MODEL_VERSION_NAME)

# The run count of a dataset is the first index NN for which no result file exists.
count_runs_per_data_set: dict[str, int] = {}
for key in time_recorders:
    idx = 0
    while os.path.exists(os.path.join(path_to_file, f"{idx:02d}_{key}.json")):
        idx += 1
    count_runs_per_data_set[key] = idx

columns = ['Dataset', 'Run Count', 'Model Version', 'Threading Enabled',
           'Nanoseconds', 'Milliseconds', 'Seconds', 'Minutes']

# One row per dataset timed in this session; the fields follow the order of `columns`.
entries: list[tuple] = [
    (key, count_runs_per_data_set[key], MODEL_VERSION_NAME, str(THREADING).lower(),
     value.nanoseconds, value.milliseconds, value.seconds, value.minutes)
    for key, value in time_recorders.items()
]
time_consumption_data: pd.DataFrame = pd.DataFrame(entries, columns=columns)

_file_path: str = os.path.join(base_path, os.getenv('OUTPUT_EXCEL_NAME_WITHOUT_ANNOTATIONS'))

# Timings stored by earlier executions, if the workbook and the sheet can be read.
old_time_consuption_data: pd.DataFrame = None
if os.path.exists(_file_path):
    try:
        old_time_consuption_data = pd.read_excel(_file_path, sheet_name=SHEET_NAME)
    except ValueError:
        # Treated as "no history", e.g. when the sheet does not exist yet.
        pass
check = old_time_consuption_data is not None

if check and not old_time_consuption_data.empty:
    # A (dataset, run count) combination that is already stored wins over the new
    # measurement, so executing this cell twice does not duplicate rows.
    old_time_consuption_data = old_time_consuption_data.dropna()
    stored_runs = zip(old_time_consuption_data.iloc[:, 0], old_time_consuption_data.iloc[:, 1])
    for dataset, run_count in stored_runs:
        already_stored = (
            (time_consumption_data['Dataset'] == dataset)
            & (time_consumption_data['Run Count'] == run_count)
        )
        time_consumption_data = time_consumption_data[~already_stored]
    time_consumption_data = pd.concat([old_time_consuption_data, time_consumption_data]).reset_index(drop=True)

In [61]:
def formatter_time(wb: Workbook, sheet_name: str):
    """Apply the layout of the timing table to the sheet ``wb[sheet_name]``.

    The header row gets a bold 14 pt font and a left/centre alignment, every
    column is sized from the length of its header text, an auto-filter is put
    on the header, the header row is frozen and all rows from row 2 on are
    wrapped and aligned top/left. The workbook is modified in place; nothing
    is returned.
    """
    ws = wb[sheet_name]

    header_font = Font(size=14, bold=True)
    header_alignment = Alignment(vertical='center', horizontal='left')
    for cell in ws["1:1"]:
        cell.font = header_font
        cell.alignment = header_alignment

    # Column width: header length plus 2 characters of padding, scaled by 1.5.
    for column in ws.iter_cols(min_row=1, max_row=1):
        for cell in column:
            width = (len(str(cell.value)) + 2) * 1.5
            ws.column_dimensions[utils_get_column_letter(cell.column)].width = width

    ws.auto_filter.ref = f"A1:{utils_get_column_letter(ws.max_column)}1"
    # Freezing at A2 keeps the header row visible while scrolling.
    ws.freeze_panes = ws['A2']

    wrap_alignment = Alignment(wrap_text=True, vertical='top', horizontal='left')
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = wrap_alignment

In [62]:
# Write the merged timing table to the workbook named by OUTPUT_EXCEL_NAME_WITHOUT_ANNOTATIONS.
# NOTE(review): the recorded run ended with a PermissionError on that file - presumably the
# workbook was open in another program; confirm before re-running.
save_to_excel(time_consumption_data, formatter_time, SHEET_NAME, os.getenv('OUTPUT_EXCEL_NAME_WITHOUT_ANNOTATIONS'))

PermissionError: [Errno 13] Permission denied: 'output_detections_highlighting_without_annotations.xlsx'