In [1]:
# Lucidity Score (LS) Calculation
import pandas as pd
import numpy as np

def calculate_lucidity_score(model_data=None):
    """
    Calculate the Lucidity Score (LS) per model, print a report, and return
    the scored table.

    The score is a fixed weighted sum:

        LS = 0.25*TruthfulQA + 0.20*HHEM_normalized
             + 0.35*Domain_avg + 0.20*Lucidity_penalty

    Parameters
    ----------
    model_data : dict or pandas.DataFrame, optional
        Per-model inputs with the columns 'Model', 'TruthfulQA', 'HHEMRate',
        'Medical', 'Legal', 'Scientific' and 'Lucidity'. When omitted, the
        averaged figures hard-coded in this function are used, so existing
        zero-argument calls behave as before. The input is never modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus 'TruthfulQA_component', 'HHEM_normalized',
        'Domain_avg', 'Lucidity_penalty', 'LS' and 'LS_Rank' (1 = highest LS;
        ties share the lowest rank).

    Raises
    ------
    ValueError
        If `model_data` is missing any of the required columns.
    """
    required_columns = ['Model', 'TruthfulQA', 'HHEMRate',
                        'Medical', 'Legal', 'Scientific', 'Lucidity']

    if model_data is None:
        # Average data across your multiple CSV runs
        model_data = {
            'Model': ['deepseek-llm', 'mistral:7b', 'llama3:8b', 'gemma:7b', 'qwen2.5:3b'],
            'TruthfulQA': [53.3, 53.3, 56.0, 50.7, 52.7],
            'HHEMRate': [4.0, 5.7, 6.5, 2.5, 5.0],  # Your actual HHEM scores
            'Medical': [20.7, 28.3, 30.5, 24.8, 34.9],
            'Legal': [17.2, 29.2, 28.5, 13.2, 28.2],
            'Scientific': [15.0, 19.3, 16.3, 18.4, 17.7],
            'Lucidity': [2.6, 2.0, 0.1, 3.3, 0.0]  # Average from your CSV data
        }

    # Explicit copy: the derived columns added below must not leak into a
    # DataFrame passed in by the caller.
    df = pd.DataFrame(model_data).copy()

    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"model_data is missing required columns: {missing}")

    # Calculate components for LS formula
    # 1. TruthfulQA (already 0-100 scale)
    df['TruthfulQA_component'] = df['TruthfulQA']

    # 2. HHEM scaled relative to the largest rate in this table (max -> 100).
    #    The scale therefore depends on which models are included.
    #    NOTE(review): this rewards a higher HHEMRate; if HHEMRate is a
    #    hallucination rate (lower = better) the term should be inverted - confirm.
    max_hhem = df['HHEMRate'].max()
    if max_hhem > 0:
        df['HHEM_normalized'] = (df['HHEMRate'] / max_hhem) * 100
    else:
        # All rates are zero (or the table is empty): avoid 0/0 -> NaN, which
        # would propagate into LS and break the integer rank formatting below.
        df['HHEM_normalized'] = 0.0

    # 3. Domain average (Medical + Legal + Scientific) / 3
    df['Domain_avg'] = (df['Medical'] + df['Legal'] + df['Scientific']) / 3

    # 4. Lucidity term: 100 - Lucidity. Despite the column name it is ADDED to
    #    LS, so a higher raw Lucidity value lowers the final score.
    df['Lucidity_penalty'] = 100 - df['Lucidity']

    # Calculate Lucidity Score (LS)
    # LS = 0.25*TruthfulQA + 0.20*HHEM_norm + 0.35*Domain_avg + 0.20*Lucidity_penalty
    df['LS'] = (0.25 * df['TruthfulQA_component'] +
                0.20 * df['HHEM_normalized'] +
                0.35 * df['Domain_avg'] +
                0.20 * df['Lucidity_penalty'])

    # Rank models by LS (higher = better); rank() returns floats, hence the
    # int() casts when printing.
    df['LS_Rank'] = df['LS'].rank(ascending=False, method='min')

    # Display results
    print("=== Lucidity Score (LS) Results ===\n")

    # Show component breakdown
    components_df = df[['Model', 'TruthfulQA_component', 'HHEM_normalized',
                        'Domain_avg', 'Lucidity_penalty', 'LS', 'LS_Rank']]

    print("Component Breakdown:")
    for _, row in components_df.iterrows():
        print(f"\n{row['Model']}:")
        print(f"  TruthfulQA: {row['TruthfulQA_component']:.1f}")
        print(f"  HHEM (normalized): {row['HHEM_normalized']:.1f}")
        print(f"  Domain Average: {row['Domain_avg']:.1f}")
        print(f"  Lucidity Penalty: {row['Lucidity_penalty']:.1f}")
        print(f"  LS: {row['LS']:.1f} (Rank: {int(row['LS_Rank'])})")

    print(f"\n{'='*50}")
    print("Final LS Rankings:")
    print(f"{'='*50}")

    ranked_df = components_df.sort_values('LS_Rank')
    for _, row in ranked_df.iterrows():
        print(f"{int(row['LS_Rank'])}. {row['Model']}: {row['LS']:.1f}")

    return df

# Run LS calculation: prints the component breakdown and final rankings, and
# binds the returned DataFrame (inputs plus derived LS columns) to `ls_results`.
ls_results = calculate_lucidity_score()

=== Lucidity Score (LS) Results ===

Component Breakdown:

deepseek-llm:
  TruthfulQA: 53.3
  HHEM (normalized): 61.5
  Domain Average: 17.6
  Lucidity Penalty: 97.4
  LS: 51.3 (Rank: 4)

mistral:7b:
  TruthfulQA: 53.3
  HHEM (normalized): 87.7
  Domain Average: 25.6
  Lucidity Penalty: 98.0
  LS: 59.4 (Rank: 2)

llama3:8b:
  TruthfulQA: 56.0
  HHEM (normalized): 100.0
  Domain Average: 25.1
  Lucidity Penalty: 99.9
  LS: 62.8 (Rank: 1)

gemma:7b:
  TruthfulQA: 50.7
  HHEM (normalized): 38.5
  Domain Average: 18.8
  Lucidity Penalty: 96.7
  LS: 46.3 (Rank: 5)

qwen2.5:3b:
  TruthfulQA: 52.7
  HHEM (normalized): 76.9
  Domain Average: 26.9
  Lucidity Penalty: 100.0
  LS: 58.0 (Rank: 3)

Final LS Rankings:
1. llama3:8b: 62.8
2. mistral:7b: 59.4
3. qwen2.5:3b: 58.0
4. deepseek-llm: 51.3
5. gemma:7b: 46.3


In [2]:
# Lucidity Score (LS) Calculation
import pandas as pd
import numpy as np

def calculate_lucidity_score(model_data=None):
    """
    Calculate the Lucidity Score (LS) per model, print a report, and return
    the scored table.

    The score is a fixed weighted sum:

        LS = 0.25*TruthfulQA + 0.20*HHEM_normalized
             + 0.35*Domain_avg + 0.20*Lucidity_penalty

    Parameters
    ----------
    model_data : dict or pandas.DataFrame, optional
        Per-model inputs with the columns 'Model', 'TruthfulQA', 'HHEMRate',
        'Medical', 'Legal', 'Scientific' and 'Lucidity'. When omitted, the
        averaged figures hard-coded in this function are used, so existing
        zero-argument calls behave as before. The input is never modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus 'TruthfulQA_component', 'HHEM_normalized',
        'Domain_avg', 'Lucidity_penalty', 'LS' and 'LS_Rank' (1 = highest LS;
        ties share the lowest rank).

    Raises
    ------
    ValueError
        If `model_data` is missing any of the required columns.
    """
    required_columns = ['Model', 'TruthfulQA', 'HHEMRate',
                        'Medical', 'Legal', 'Scientific', 'Lucidity']

    if model_data is None:
        # Average data across your multiple CSV runs
        model_data = {
            'Model': ['deepseek-llm', 'mistral:7b', 'llama3:8b', 'gemma:7b', 'qwen2.5:3b'],
            'TruthfulQA': [52.7, 53.3, 56.0, 50.0, 51.3],
            'HHEMRate': [5.0, 5.2, 7.0, 1.9, 4.3],  # Your actual HHEM scores
            'Medical': [19.2, 32.8, 26.4, 21.9, 32.1],
            'Legal': [21.3, 27.1, 26.4, 11.2, 31.6],
            'Scientific': [14.4, 18.0, 17.1, 18.4, 18.9],
            'Lucidity': [2.9, 1.8, 0.0, 2.7, 0.0]  # Average from your CSV data
        }

    # Explicit copy: the derived columns added below must not leak into a
    # DataFrame passed in by the caller.
    df = pd.DataFrame(model_data).copy()

    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"model_data is missing required columns: {missing}")

    # Calculate components for LS formula
    # 1. TruthfulQA (already 0-100 scale)
    df['TruthfulQA_component'] = df['TruthfulQA']

    # 2. HHEM scaled relative to the largest rate in this table (max -> 100).
    #    The scale therefore depends on which models are included.
    #    NOTE(review): this rewards a higher HHEMRate; if HHEMRate is a
    #    hallucination rate (lower = better) the term should be inverted - confirm.
    max_hhem = df['HHEMRate'].max()
    if max_hhem > 0:
        df['HHEM_normalized'] = (df['HHEMRate'] / max_hhem) * 100
    else:
        # All rates are zero (or the table is empty): avoid 0/0 -> NaN, which
        # would propagate into LS and break the integer rank formatting below.
        df['HHEM_normalized'] = 0.0

    # 3. Domain average (Medical + Legal + Scientific) / 3
    df['Domain_avg'] = (df['Medical'] + df['Legal'] + df['Scientific']) / 3

    # 4. Lucidity term: 100 - Lucidity. Despite the column name it is ADDED to
    #    LS, so a higher raw Lucidity value lowers the final score.
    df['Lucidity_penalty'] = 100 - df['Lucidity']

    # Calculate Lucidity Score (LS)
    # LS = 0.25*TruthfulQA + 0.20*HHEM_norm + 0.35*Domain_avg + 0.20*Lucidity_penalty
    df['LS'] = (0.25 * df['TruthfulQA_component'] +
                0.20 * df['HHEM_normalized'] +
                0.35 * df['Domain_avg'] +
                0.20 * df['Lucidity_penalty'])

    # Rank models by LS (higher = better); rank() returns floats, hence the
    # int() casts when printing.
    df['LS_Rank'] = df['LS'].rank(ascending=False, method='min')

    # Display results
    print("=== Lucidity Score (LS) Results ===\n")

    # Show component breakdown
    components_df = df[['Model', 'TruthfulQA_component', 'HHEM_normalized',
                        'Domain_avg', 'Lucidity_penalty', 'LS', 'LS_Rank']]

    print("Component Breakdown:")
    for _, row in components_df.iterrows():
        print(f"\n{row['Model']}:")
        print(f"  TruthfulQA: {row['TruthfulQA_component']:.1f}")
        print(f"  HHEM (normalized): {row['HHEM_normalized']:.1f}")
        print(f"  Domain Average: {row['Domain_avg']:.1f}")
        print(f"  Lucidity Penalty: {row['Lucidity_penalty']:.1f}")
        print(f"  LS: {row['LS']:.1f} (Rank: {int(row['LS_Rank'])})")

    print(f"\n{'='*50}")
    print("Final LS Rankings:")
    print(f"{'='*50}")

    ranked_df = components_df.sort_values('LS_Rank')
    for _, row in ranked_df.iterrows():
        print(f"{int(row['LS_Rank'])}. {row['Model']}: {row['LS']:.1f}")

    return df

# Run LS calculation: prints the component breakdown and final rankings.
# Note: this rebinds `ls_results`, replacing the DataFrame assigned by the
# In [1] cell.
ls_results = calculate_lucidity_score()

=== Lucidity Score (LS) Results ===

Component Breakdown:

deepseek-llm:
  TruthfulQA: 52.7
  HHEM (normalized): 71.4
  Domain Average: 18.3
  Lucidity Penalty: 97.1
  LS: 53.3 (Rank: 4)

mistral:7b:
  TruthfulQA: 53.3
  HHEM (normalized): 74.3
  Domain Average: 26.0
  Lucidity Penalty: 98.2
  LS: 56.9 (Rank: 2)

llama3:8b:
  TruthfulQA: 56.0
  HHEM (normalized): 100.0
  Domain Average: 23.3
  Lucidity Penalty: 100.0
  LS: 62.2 (Rank: 1)

gemma:7b:
  TruthfulQA: 50.0
  HHEM (normalized): 27.1
  Domain Average: 17.2
  Lucidity Penalty: 97.3
  LS: 43.4 (Rank: 5)

qwen2.5:3b:
  TruthfulQA: 51.3
  HHEM (normalized): 61.4
  Domain Average: 27.5
  Lucidity Penalty: 100.0
  LS: 54.7 (Rank: 3)

Final LS Rankings:
1. llama3:8b: 62.2
2. mistral:7b: 56.9
3. qwen2.5:3b: 54.7
4. deepseek-llm: 53.3
5. gemma:7b: 43.4
