# Imports

In [70]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# Mock ground truth and LLM responses

In [71]:
# Seed NumPy's legacy global RNG so the np.random.* draws in the cells below are repeatable.
np.random.seed(42)

# Sizes used by the mock-data cells: how many NFRs and how many participants to generate.
NUM_NFRS, NUM_PARTICIPANTS = 20, 2

In [72]:
# The five labels an assessment can take. 'NA' is a literal string label here, not a missing value.
# NOTE(review): the saved tables in this notebook show empty cells where 'NA' is expected --
# presumably an export/NA-parsing artifact; verify.
satisfaction_levels = ['Satisfied', 'Weakly Satisfied', 'Weakly Denied', 'Denied', 'NA']

# Mock ground truth: one entry per NFR with a randomly drawn label, a fixed reasoning
# sentence and two code locations.
ground_truth_data = {
    'nfr_id': [f'NFR_{i+1}' for i in range(NUM_NFRS)],
    'satisfaction_level': np.random.choice(satisfaction_levels, NUM_NFRS),
    'reasoning_text': [f'Reasoning for NFR_{i+1}: The system uses encryption and proper authentication.' for i in range(NUM_NFRS)],
    'code_locations': [[f'file_{i}.py:lines_10-20', f'file_{i}.py:lines_45-50'] for i in range(NUM_NFRS)],
}

# Mock LLM answers, derived from the ground truth with some random disagreement (filled below).
llm_response_data = {
    'nfr_id': [f'NFR_{i+1}' for i in range(NUM_NFRS)],
    'satisfaction_level': [],
    'reasoning_text': [],
    'code_locations': [],
}

# The order of the np.random calls inside this loop determines the generated data for a
# given seed, so it must not be rearranged.
for i in range(NUM_NFRS):
    gt_level = ground_truth_data['satisfaction_level'][i]

    # Label: kept with probability 0.8, otherwise replaced by a different label.
    if np.random.random() < 0.8:
        llm_response_data['satisfaction_level'].append(gt_level)
    else:
        other_levels = [level for level in satisfaction_levels if level != gt_level]
        llm_response_data['satisfaction_level'].append(np.random.choice(other_levels))

    # Reasoning: kept with probability 0.85, otherwise an alternative sentence.
    if np.random.random() < 0.85:
        llm_response_data['reasoning_text'].append(ground_truth_data['reasoning_text'][i])
    else:
        llm_response_data['reasoning_text'].append(f'Reasoning for NFR_{i+1}: Alternative explanation with different focus.')

    # Code locations: kept with probability 0.75, otherwise the first location plus a wrong one.
    gt_locations = ground_truth_data['code_locations'][i]
    if np.random.random() < 0.75:
        # Store a copy so the two frames never share the same list object; otherwise an
        # in-place edit of one frame's cell would silently change the other.
        llm_response_data['code_locations'].append(list(gt_locations))
    else:
        partial = gt_locations[:1] + [f'file_{i}_wrong.py:lines_30-35']
        llm_response_data['code_locations'].append(partial)

df_ground_truth = pd.DataFrame(ground_truth_data)
df_llm_response = pd.DataFrame(llm_response_data)


In [73]:
# Display the mock ground-truth table (one row per NFR).
df_ground_truth

Unnamed: 0,nfr_id,satisfaction_level,reasoning_text,code_locations
0,NFR_1,Denied,Reasoning for NFR_1: The system uses encryptio...,"[file_0.py:lines_10-20, file_0.py:lines_45-50]"
1,NFR_2,,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]"
2,NFR_3,Weakly Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2.py:lines_45-50]"
3,NFR_4,,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]"
4,NFR_5,,Reasoning for NFR_5: The system uses encryptio...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]"
5,NFR_6,Weakly Satisfied,Reasoning for NFR_6: The system uses encryptio...,"[file_5.py:lines_10-20, file_5.py:lines_45-50]"
6,NFR_7,Weakly Denied,Reasoning for NFR_7: The system uses encryptio...,"[file_6.py:lines_10-20, file_6.py:lines_45-50]"
7,NFR_8,Weakly Denied,Reasoning for NFR_8: The system uses encryptio...,"[file_7.py:lines_10-20, file_7.py:lines_45-50]"
8,NFR_9,Weakly Denied,Reasoning for NFR_9: The system uses encryptio...,"[file_8.py:lines_10-20, file_8.py:lines_45-50]"
9,NFR_10,,Reasoning for NFR_10: The system uses encrypti...,"[file_9.py:lines_10-20, file_9.py:lines_45-50]"


In [74]:
# Display the mock LLM-response table (one row per NFR).
df_llm_response

Unnamed: 0,nfr_id,satisfaction_level,reasoning_text,code_locations
0,NFR_1,Denied,Reasoning for NFR_1: The system uses encryptio...,"[file_0.py:lines_10-20, file_0.py:lines_45-50]"
1,NFR_2,,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]"
2,NFR_3,Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2_wrong.py:lines_..."
3,NFR_4,,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]"
4,NFR_5,,Reasoning for NFR_5: The system uses encryptio...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]"
5,NFR_6,Weakly Satisfied,Reasoning for NFR_6: Alternative explanation w...,"[file_5.py:lines_10-20, file_5_wrong.py:lines_..."
6,NFR_7,Satisfied,Reasoning for NFR_7: The system uses encryptio...,"[file_6.py:lines_10-20, file_6.py:lines_45-50]"
7,NFR_8,Weakly Denied,Reasoning for NFR_8: The system uses encryptio...,"[file_7.py:lines_10-20, file_7.py:lines_45-50]"
8,NFR_9,Denied,Reasoning for NFR_9: Alternative explanation w...,"[file_8.py:lines_10-20, file_8.py:lines_45-50]"
9,NFR_10,,Reasoning for NFR_10: The system uses encrypti...,"[file_9.py:lines_10-20, file_9.py:lines_45-50]"


# Mock participant data (one row per participant–NFR pair)

In [82]:
# Answer options for the three agreement questions. This list never changes, so it is
# built once instead of on every inner-loop iteration.
agreement_options = ['Agree', 'Partially agree', 'Partially disagree', 'Disagree']

# Index the LLM responses by NFR id once, so each lookup below is a label access rather
# than a boolean-mask scan of the whole frame. drop_duplicates keeps the first row per id,
# which matches the previous "first matching row" behaviour.
llm_by_nfr = df_llm_response.drop_duplicates('nfr_id').set_index('nfr_id')

# One record per (participant, NFR) pair. The order of the np.random calls determines the
# generated data for a given seed, so it must not be rearranged.
participant_nfr_data = []
for participant_id in range(1, NUM_PARTICIPANTS + 1):
    for nfr_idx in range(1, NUM_NFRS + 1):
        nfr_id = f'NFR_{nfr_idx}'
        llm_row = llm_by_nfr.loc[nfr_id]

        # Participant's own label: same as the LLM with probability 0.8, otherwise another label.
        if np.random.random() < 0.8:
            participant_satisfaction_level = llm_row['satisfaction_level']
        else:
            alt_levels = [lvl for lvl in satisfaction_levels if lvl != llm_row['satisfaction_level']]
            participant_satisfaction_level = np.random.choice(alt_levels)

        # Participant's reasoning: same as the LLM with probability 0.8, otherwise an alternative text.
        if np.random.random() < 0.8:
            participant_reasoning_text = llm_row['reasoning_text']
        else:
            participant_reasoning_text = (
                f"Participant {participant_id} alternate reasoning for {nfr_id}: Different focus on usability and performance aspects.")

        # Participant's code locations: same as the LLM with probability 0.8, otherwise the
        # first LLM location plus a participant-specific one.
        if np.random.random() < 0.8:
            # Copy so this frame does not share list objects with df_llm_response.
            participant_code_locations = list(llm_row['code_locations'])
        else:
            # NOTE(review): this file name uses the 1-based nfr_idx, while the ground-truth and
            # LLM locations use a 0-based index (e.g. NFR_1 -> file_0.py but file_1_participant.py).
            # Left unchanged; confirm whether that is intended.
            participant_code_locations = llm_row['code_locations'][:1] + [f"file_{nfr_idx}_participant.py:lines_60-70"]

        participant_nfr_data.append({
            'participant_id': participant_id,
            'nfr_id': nfr_id,
            # Agreement answers are drawn independently of the fields above, with fixed weights.
            'q1_satisfaction_agreement': np.random.choice(agreement_options, p=[0.5, 0.2, 0.2, 0.1]),
            'q2_reasoning_agreement': np.random.choice(agreement_options, p=[0.5, 0.25, 0.15, 0.1]),
            'q3_code_location_agreement': np.random.choice(agreement_options, p=[0.45, 0.25, 0.2, 0.1]),
            'participant_satisfaction_level': participant_satisfaction_level,
            'participant_reasoning_text': participant_reasoning_text,
            'participant_code_locations': participant_code_locations,
        })

df_participant_nfr = pd.DataFrame(participant_nfr_data)
df_participant_nfr

Unnamed: 0,participant_id,nfr_id,q1_satisfaction_agreement,q2_reasoning_agreement,q3_code_location_agreement,participant_satisfaction_level,participant_reasoning_text,participant_code_locations
0,1,NFR_1,Partially agree,Partially agree,Agree,Denied,Reasoning for NFR_1: The system uses encryptio...,"[file_0.py:lines_10-20, file_0.py:lines_45-50]"
1,1,NFR_2,Partially disagree,Partially agree,Agree,,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]"
2,1,NFR_3,Partially agree,Disagree,Partially agree,Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2_wrong.py:lines_..."
3,1,NFR_4,Partially agree,Disagree,Agree,,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]"
4,1,NFR_5,Agree,Agree,Agree,Weakly Satisfied,Reasoning for NFR_5: The system uses encryptio...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]"


In [76]:
# Display the participant table again.
# NOTE(review): the saved output below comes from an earlier execution (In [76]) than the
# generation cell above (In [82]) and does not match its output; re-run to refresh.
df_participant_nfr

Unnamed: 0,participant_id,nfr_id,q1_satisfaction_agreement,q2_reasoning_agreement,q3_code_location_agreement,participant_satisfaction_level,participant_reasoning_text,participant_code_locations
0,1,NFR_1,Agree,Agree,Agree,Denied,Participant 1 alternate reasoning for NFR_1: D...,"[file_0.py:lines_10-20, file_1_participant.py:..."
1,1,NFR_2,Agree,Partially agree,Agree,,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]"
2,1,NFR_3,Partially disagree,Agree,Partially agree,Weakly Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2_wrong.py:lines_..."
3,1,NFR_4,Agree,Partially agree,Agree,,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]"
4,1,NFR_5,Agree,Partially disagree,Agree,,Participant 1 alternate reasoning for NFR_5: D...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]"
5,1,NFR_6,Disagree,Partially disagree,Partially agree,Weakly Satisfied,Reasoning for NFR_6: Alternative explanation w...,"[file_5.py:lines_10-20, file_5_wrong.py:lines_..."
6,1,NFR_7,Partially disagree,Agree,Agree,,Reasoning for NFR_7: The system uses encryptio...,"[file_6.py:lines_10-20, file_6.py:lines_45-50]"
7,1,NFR_8,Partially agree,Agree,Agree,Weakly Denied,Participant 1 alternate reasoning for NFR_8: D...,"[file_7.py:lines_10-20, file_7.py:lines_45-50]"
8,1,NFR_9,Agree,Partially agree,Agree,Denied,Reasoning for NFR_9: Alternative explanation w...,"[file_8.py:lines_10-20, file_8.py:lines_45-50]"
9,1,NFR_10,Partially agree,Agree,Agree,,Reasoning for NFR_10: The system uses encrypti...,"[file_9.py:lines_10-20, file_9.py:lines_45-50]"


# RQ1 ANALYSIS: Assessment Accuracy

In [77]:
def calculate_code_location_metrics(gt_locations, llm_locations):
    """
    Score predicted code locations against the ground-truth ones.

    Both arguments are iterables of hashable location identifiers. They are
    compared as sets, so duplicates and ordering are ignored and a location
    only counts as a hit on an exact match.

    Returns:
        (precision, recall, f1) as floats. Precision is 0.0 when there are no
        predicted locations, recall is 0.0 when there are no ground-truth
        locations, and F1 is 0.0 when precision and recall are both zero.
    """
    expected = set(gt_locations)
    predicted = set(llm_locations)
    hits = len(expected & predicted)

    precision = hits / len(predicted) if predicted else 0.0
    recall = hits / len(expected) if expected else 0.0

    # Harmonic mean is undefined when both terms are zero; report 0.0 instead.
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * (precision * recall) / (precision + recall)

# Section banner: an 80-character rule above and below the title.
print("=" * 80, "RQ1: ASSESSMENT ACCURACY ANALYSIS", "=" * 80, sep="\n")

# Put each NFR's ground-truth and LLM columns side by side. Columns that share a name
# get the '_gt' / '_llm' suffix; rows are matched on nfr_id (default inner join).
df_comparison = pd.merge(
    df_ground_truth,
    df_llm_response,
    on='nfr_id',
    suffixes=('_gt', '_llm'),
)

RQ1: ASSESSMENT ACCURACY ANALYSIS


In [78]:
# Display the merged ground-truth vs. LLM comparison table.
df_comparison

Unnamed: 0,nfr_id,satisfaction_level_gt,reasoning_text_gt,code_locations_gt,satisfaction_level_llm,reasoning_text_llm,code_locations_llm
0,NFR_1,Denied,Reasoning for NFR_1: The system uses encryptio...,"[file_0.py:lines_10-20, file_0.py:lines_45-50]",Denied,Reasoning for NFR_1: The system uses encryptio...,"[file_0.py:lines_10-20, file_0.py:lines_45-50]"
1,NFR_2,,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]",,Reasoning for NFR_2: The system uses encryptio...,"[file_1.py:lines_10-20, file_1.py:lines_45-50]"
2,NFR_3,Weakly Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2.py:lines_45-50]",Denied,Reasoning for NFR_3: The system uses encryptio...,"[file_2.py:lines_10-20, file_2_wrong.py:lines_..."
3,NFR_4,,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]",,Reasoning for NFR_4: The system uses encryptio...,"[file_3.py:lines_10-20, file_3.py:lines_45-50]"
4,NFR_5,,Reasoning for NFR_5: The system uses encryptio...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]",,Reasoning for NFR_5: The system uses encryptio...,"[file_4.py:lines_10-20, file_4.py:lines_45-50]"
5,NFR_6,Weakly Satisfied,Reasoning for NFR_6: The system uses encryptio...,"[file_5.py:lines_10-20, file_5.py:lines_45-50]",Weakly Satisfied,Reasoning for NFR_6: Alternative explanation w...,"[file_5.py:lines_10-20, file_5_wrong.py:lines_..."
6,NFR_7,Weakly Denied,Reasoning for NFR_7: The system uses encryptio...,"[file_6.py:lines_10-20, file_6.py:lines_45-50]",Satisfied,Reasoning for NFR_7: The system uses encryptio...,"[file_6.py:lines_10-20, file_6.py:lines_45-50]"
7,NFR_8,Weakly Denied,Reasoning for NFR_8: The system uses encryptio...,"[file_7.py:lines_10-20, file_7.py:lines_45-50]",Weakly Denied,Reasoning for NFR_8: The system uses encryptio...,"[file_7.py:lines_10-20, file_7.py:lines_45-50]"
8,NFR_9,Weakly Denied,Reasoning for NFR_9: The system uses encryptio...,"[file_8.py:lines_10-20, file_8.py:lines_45-50]",Denied,Reasoning for NFR_9: Alternative explanation w...,"[file_8.py:lines_10-20, file_8.py:lines_45-50]"
9,NFR_10,,Reasoning for NFR_10: The system uses encrypti...,"[file_9.py:lines_10-20, file_9.py:lines_45-50]",,Reasoning for NFR_10: The system uses encrypti...,"[file_9.py:lines_10-20, file_9.py:lines_45-50]"


References on macro-, micro- and weighted-averaged F1 scores for multi-class classification:

- https://www.baeldung.com/cs/multi-class-f1-score
- https://towardsdatascience.com/micro-macro-weighted-averages-of-f1-score-clearly-explained-b603420b292f/

In [79]:
print("SATISFACTION LEVEL ASSESSMENT")

# Ground-truth and LLM labels for the same NFR sit on the same row of df_comparison.
gt_levels = df_comparison['satisfaction_level_gt']
llm_levels = df_comparison['satisfaction_level_llm']

# Accuracy: share of NFRs where the LLM label equals the ground-truth label.
satisfaction_matches = (gt_levels == llm_levels).sum()
satisfaction_accuracy = satisfaction_matches / len(df_comparison)

print(f"\nAccuracy: {satisfaction_accuracy:.1%} ({satisfaction_matches}/{len(df_comparison)})")

# Macro-F1: unweighted mean of the per-label F1 scores.
# f1_score is imported in the notebook's top import cell rather than mid-notebook.
macro_f1_satisfaction = f1_score(
    gt_levels,
    llm_levels,
    average='macro'
)

print(f"Macro-F1 Score: {macro_f1_satisfaction:.3f}")

# Cohen's kappa between the ground-truth and LLM labels.
kappa_satisfaction = cohen_kappa_score(
    gt_levels,
    llm_levels
)

print(f"Cohen's Kappa: {kappa_satisfaction:.3f}")


SATISFACTION LEVEL ASSESSMENT

Accuracy: 80.0% (16/20)
Macro-F1 Score: 0.783
Cohen's Kappa: 0.741


In [80]:
# Q1 answer options mapped to the numeric score used in the summary below. Keeping options
# and weights in one place guarantees the distribution and the score use the same list.
# NOTE(review): the scale is 5/4/2/1 with no 3 -- presumably a skipped neutral midpoint; confirm.
agreement_weights = {'Agree': 5, 'Partially agree': 4, 'Partially disagree': 2, 'Disagree': 1}

satisfaction_agreement = df_participant_nfr['q1_satisfaction_agreement'].value_counts()
total_assessments = len(df_participant_nfr)

print("\nParticipant Agreement Distribution:")
for option in agreement_weights:
    count = satisfaction_agreement.get(option, 0)
    # Guard against an empty table so the cell does not fail with a division by zero.
    percentage = (count / total_assessments) * 100 if total_assessments else 0.0
    print(f"  {option:20s}: {count:3d} ({percentage:5.1f}%)")

# Despite the label, this is the mean weighted score per assessment (range 1-5),
# not a fraction between 0 and 1.
agree_count = sum(
    weight * satisfaction_agreement.get(option, 0)
    for option, weight in agreement_weights.items()
)
agreement_rate = agree_count / total_assessments if total_assessments else np.nan
print(f"\nOverall Agreement Rate: {agreement_rate}")

# Inter-rater reliability among participants: Cohen's kappa is pairwise, so only the
# first two participant ids are compared.
participants = df_participant_nfr['participant_id'].unique()[:2]
if len(participants) < 2:
    # With fewer than two raters there is no pair to compare; previously this raised IndexError.
    # The variable is named "alpha" but holds Cohen's kappa; name kept for compatibility.
    alpha_satisfaction = np.nan
    print("Cohen's Kappa: n/a (fewer than two participants)")
else:
    ratings_p1 = df_participant_nfr[df_participant_nfr['participant_id'] == participants[0]][['nfr_id', 'q1_satisfaction_agreement']]
    ratings_p2 = df_participant_nfr[df_participant_nfr['participant_id'] == participants[1]][['nfr_id', 'q1_satisfaction_agreement']]

    # Only NFRs rated by both participants can be compared.
    common = ratings_p1.merge(ratings_p2, on='nfr_id', suffixes=('_p1', '_p2'))

    alpha_satisfaction = cohen_kappa_score(
        common['q1_satisfaction_agreement_p1'],
        common['q1_satisfaction_agreement_p2']
    ) if not common.empty else np.nan
    print(f"Cohen's Kappa (participants {participants[0]} vs {participants[1]}): {alpha_satisfaction:.3f}")



Participant Agreement Distribution:
  Agree               :  22 ( 55.0%)
  Partially agree     :  10 ( 25.0%)
  Partially disagree  :   5 ( 12.5%)
  Disagree            :   3 (  7.5%)

Overall Agreement Rate: 4.075
Cohen's Kappa (participants 1 vs 2): -0.205


# RQ1 ANALYSIS: Reasoning Accuracy

In [81]:
# BLEU-like unigram precision for reasoning text
def simple_unigram_bleu(ref, hyp):
    """
    Clipped unigram precision of `hyp` against `ref` (a BLEU-1-like score).

    Both strings are lower-cased and split on whitespace; punctuation stays
    attached to its token. Each hypothesis token is credited at most as many
    times as it occurs in the reference. No brevity penalty is applied.

    Args:
        ref: Reference text.
        hyp: Hypothesis text to score.

    Returns:
        Matched hypothesis tokens divided by the number of hypothesis tokens,
        as a float in [0, 1]; 0.0 when the hypothesis has no tokens.
    """
    ref_tokens = ref.lower().split()
    hyp_tokens = hyp.lower().split()
    if not hyp_tokens:
        return 0.0
    # Counter intersection keeps min(count_in_ref, count_in_hyp) per token, which is the
    # clipped match count, in a single pass instead of repeated list.count scans.
    overlap = sum((Counter(ref_tokens) & Counter(hyp_tokens)).values())
    return overlap / len(hyp_tokens)

# Score each row's LLM reasoning against the ground-truth reasoning on the same row.
bleu_scores = df_comparison.apply(
    lambda nfr: simple_unigram_bleu(
        nfr['reasoning_text_gt'],
        nfr['reasoning_text_llm'],
    ),
    axis='columns',
)
print(f"Average BLEU-like score (reasoning): {bleu_scores.mean():.3f}")

# semantic similarity
def calculate_semantic_similarity(text1, text2):
    """
    Placeholder for a semantic-similarity score between two texts.

    No real similarity is computed: the result is a single uniform draw from
    NumPy's global RNG, taken from [0.90, 1.0) when the texts are identical
    and from [0.60, 0.85) otherwise. A real implementation would compare
    embeddings (e.g. BERT) instead.
    """
    low, high = (0.90, 1.0) if text1 == text2 else (0.60, 0.85)
    return np.random.uniform(low, high)

# One simulated similarity value per row, comparing ground-truth and LLM reasoning text.
reasoning_similarities = df_comparison.apply(
    lambda nfr: calculate_semantic_similarity(
        nfr['reasoning_text_gt'],
        nfr['reasoning_text_llm'],
    ),
    axis='columns',
)
print(f"Average semantic similarity (reasoning): {reasoning_similarities.mean():.3f}")

# TODO: F1 score (not implemented yet)


Average BLEU-like score (reasoning): 0.875
Average semantic similarity (reasoning): 0.893


# User Satisfaction Survey Data (RQ2)