### Install dependencies

In [332]:
%pip install scikit-learn
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Load files


In [333]:
import pandas as pd

def load_and_process(req_path, test_path):
    req_df = pd.read_csv(req_path, sep=',', on_bad_lines='skip')
    test_df = pd.read_csv(test_path, sep=',', on_bad_lines='skip')
    
    # Some Purpose columns are intentionally left blank for now; populate them with empty strings
    test_df['Purpose'] = test_df['Purpose'].fillna('')

    req_text_fields = ['Feature', 'Description']
    test_text_fields = ['Purpose', 'Test steps']

    # Combine text columns for similarity matching
    req_df['full_text'] = req_df[req_text_fields].astype(str).agg(' '.join, axis=1)
    test_df['full_text'] = test_df[test_text_fields].astype(str).agg(' '.join, axis=1)
    
    # Convert to lists
    req_texts = req_df['full_text'].tolist()
    test_texts = test_df['full_text'].tolist()

    return req_df, test_df, req_texts, test_texts



### Cosine Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine(req_df, test_df, req_texts, test_texts):
    vectorizer = TfidfVectorizer()
    documents = req_texts + test_texts
    tfidf_matrix = vectorizer.fit_transform(documents)

    similarity_matrix = cosine_similarity(tfidf_matrix[:len(req_texts)], tfidf_matrix[len(req_texts):])

    # TODO: tweak and see if it can perform better
    threshold = 0.38

    # Find matches for each requirement
    matches = {}
    for i, req in enumerate(req_df['ID']):
        matched_test_cases = []
        for j, similarity in enumerate(similarity_matrix[i]):
            if similarity >= threshold:
                matched_test_cases.append(test_df.iloc[j]['ID'])

        matches[req] = matched_test_cases

    # Results
    return sorted(matches.items())


### Calculate accuracy

In [335]:
def calculate_accuracy(predicted_matches, ground_truth_path):
    
    # Load and clean ground truth
    gt_df = pd.read_csv(ground_truth_path).dropna(subset=['Req ID'])
    
    # Convert ground truth data into a dictionary
    gt_dict = {
        row['Req ID']: list(map(str, str(row['Test ID']).split(','))) if pd.notna(row['Test ID']) else []
        for _, row in gt_df.iterrows()
    }

    # Populate predicted dict
    pred_dict = {}
    for req_text, test_texts in predicted_matches:
        req_id = req_text.split()[0]
        if req_id not in pred_dict:
            pred_dict[req_id] = []
        pred_dict[req_id].extend(str(test_text).split()[0] for test_text in test_texts)
    
    correct = 0
    total = 0

    # Compare predicted and ground truth
    for req_id in set(pred_dict.keys()) | set(gt_dict.keys()):
        predicted = set(pred_dict.get(req_id, []))
        actual = set(gt_dict.get(req_id, []))
        total += 1
        if predicted == actual:
            correct += 1
    # Calculate accuracy
    accuracy = (correct / total) * 100 if total > 0 else 0

    print("*********************************************")
    print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
    print("*********************************************")
    return accuracy


### Display predicted and Ground Truth results side by side

In [339]:
def print_side_by_side(matches, mapping_path):
    
    # Convert to dict
    pred_dict = {req_id: test_ids for req_id, test_ids in matches}
    # Ground Truth Dataset
    gt_df = pd.read_csv(mapping_path)

    # Convert to dict
    gt_dict = {
        row['Req ID']: list(map(str, str(row['Test ID']).split(','))) if pd.notna(row['Test ID']) else []
        for _, row in gt_df.iterrows()
    }

    # Get the union of all requirement IDs
    all_req_ids = sorted(set(pred_dict.keys()) | set(gt_dict.keys()))

    print(f"\n{'Req ID':<10} | {'Predicted':<20} | {'Ground Truth'}")
    print("-" * 55)

    for req_id in all_req_ids:
        pred_ids = ', '.join(map(str, pred_dict.get(req_id, [])))
        gt_ids = ', '.join(gt_dict.get(req_id, []))
        print(f"{req_id:<10} | {pred_ids:<20} | {gt_ids}")


In [342]:
# Datasets folders 
datasets = ['BTHS', 'ENCO', 'SnakeGame', 'Mozilla', 'HealthWatcher', 'SourceTracker']
#Initialize summary table for visualization of results
summary_table = pd.DataFrame(columns=["Dataset", "Accuracy"])

# For every dataset find the following:
for dataset in datasets:
    req_path = f'../data/{dataset}/RE.csv'
    test_path = f'../data/{dataset}/ST.csv'
    ground_truth_mapping_path = f'../data/{dataset}/mapping.csv'

    print(f"Dataset: {dataset}")
    # Load and process data
    req_df, test_df, req_texts, test_texts = load_and_process(req_path, test_path)
    # Find cosine similarity
    predicted = compute_cosine(req_df, test_df, req_texts, test_texts)
    # Print results side by side 
    print_side_by_side(predicted, ground_truth_mapping_path)
    # Compute accuracy
    accuracy = calculate_accuracy(predicted, ground_truth_mapping_path)

    summary_table = pd.concat([
        summary_table,
        pd.DataFrame([{"Dataset": dataset, "Accuracy": accuracy}])
    ], ignore_index=True)

    print("\n")


# Print Results Table
print("+------------------+---------------+")
print("| Dataset          | Accuracy (%)  |")
print("+------------------+---------------+")
for _, row in summary_table.iterrows():
    accuracy_str = f"{row['Accuracy']:.2f}%"
    print(f"| {row['Dataset']:<16} | {accuracy_str:<13.7} |") 
print("+------------------+---------------+")


Dataset: BTHS
['Audio Gateway Initiated ACL Connection Establishment Upon an internal or user generated event, the AG will initiate connection establishment. There are then two options as described in the  4.2.1 and  4.2.2. The SCO link establishment can take place anytime after the ACL connection establishment.', 'Using In-Band Ringing An in-band ring tone is an audible alert, such as a tone, melody, short music clip, that is transmitted by the AG, to the HS, to alert the user of an event; typically an incoming call. The AG may generate an in-band ring tone using the SCO connection to the HS. The AG decides how to use this SCO connection. When using an in-band ring tone, the AG shall not send the RING unsolicited result code to the HS3..', 'Using the RING message The AG will repeatedly send the RING unsolicited result code to the HS for a time period decided by the AG. The RING may be repeated for as long as the connection establishment is pending.', 'Headset Initiated ACL Connection 

  summary_table = pd.concat([
