In [2]:
import json
import re

# Path to your JSONL file
file_path = '/external1/datasets/manifest_nemo/vils/valid_withpredictions.jsonl'

# Define precise regex patterns for each tag type
patterns = {
    'AGE': re.compile(r'\b(AGE_[0-9]+_[0-9]+)\b'),
    'GER': re.compile(r'\b(GER_[A-Z]+)\b'),
    'EMOTION': re.compile(r'\b(EMOTION_[A-Z]+)\b'),
    'INTENT': re.compile(r'\b(INTENT_[A-Z]+)\b'),
}

# Initialize dictionaries to store tag extractions
original = {tag: [] for tag in patterns}
predicted = {tag: [] for tag in patterns}

# Extract tags row-wise
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        obj = json.loads(line)
        text = obj.get('text', '')
        pred = obj.get('predicted_text', '')
        for tag, pattern in patterns.items():
            orig_match = pattern.search(text)
            pred_match = pattern.search(pred)
            original[tag].append(orig_match.group(1) if orig_match else None)
            predicted[tag].append(pred_match.group(1) if pred_match else None)

# Display the first 5 entries for each tag type
for tag in patterns:
    print(f"{tag}_original (first 5): {original[tag][:5]}")
    print(f"{tag}_predicted (first 5): {predicted[tag][:5]}")
    print()


AGE_original (first 5): ['AGE_30_45', 'AGE_45_60', 'AGE_18_30', 'AGE_45_60', 'AGE_18_30']
AGE_predicted (first 5): ['AGE_30_45', 'AGE_30_45', 'AGE_30_45', 'AGE_18_30', 'AGE_30_45']

GER_original (first 5): ['GER_FEMALE', 'GER_MALE', 'GER_MALE', 'GER_MALE', 'GER_FEMALE']
GER_predicted (first 5): ['GER_MALE', 'GER_MALE', 'GER_MALE', 'GER_FEMALE', 'GER_FEMALE']

EMOTION_original (first 5): ['EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_SAD']
EMOTION_predicted (first 5): ['EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_SAD', 'EMOTION_HAP']

INTENT_original (first 5): ['INTENT_UNCLEAR', 'INTENT_ASSERT', 'INTENT_INFORM', 'INTENT_INFORM', 'INTENT_START']
INTENT_predicted (first 5): ['INTENT_GREET', 'INTENT_INFORM', 'INTENT_INFORM', 'INTENT_INFORM', 'INTENT_GREET']



In [3]:
import pandas as pd
from IPython.display import display

# Build and display confusion matrices for each tag
for tag in original:
    # Create DataFrame from the existing lists
    df = pd.DataFrame({
        'Actual': original[tag],
        'Predicted': predicted[tag]
    }).dropna()
    
    # Compute confusion matrix
    cm = pd.crosstab(df['Actual'], df['Predicted'])
    
    # Display
    print(f"### {tag} Confusion Matrix")
    display(cm)


### AGE Confusion Matrix


Predicted,AGE_18_18,AGE_18_30,AGE_18_45,AGE_30_30,AGE_30_45
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AGE_0_18,2,115,5,0,3
AGE_18_30,1,459,15,0,50
AGE_30_45,0,217,14,3,212
AGE_45_60,0,27,4,0,60


### GER Confusion Matrix


Predicted,GER_FEMALE,GER_MALE
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
GER_FEMALE,436,59
GER_MALE,41,659


### EMOTION Confusion Matrix


Predicted,EMOTION_ANG,EMOTION_HAP,EMOTION_NEU,EMOTION_SAD
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EMOTION_ANG,14,7,14,1
EMOTION_HAP,4,18,77,30
EMOTION_NEU,2,7,379,45
EMOTION_SAD,8,10,133,447


### INTENT Confusion Matrix


Predicted,INTENT_ACKNOWLEDGE,INTENT_ACTION,INTENT_EXPLAIN,INTENT_GREET,INTENT_INFORM,INTENT_INSTRUCT,INTENT_QUESTION,INTENT_REQUEST,INTENT_STATEMENT,INTENT_UNCLEAR
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
INTENT_ACCEPT,1,0,0,0,0,0,0,0,0,0
INTENT_ACKNOWLEDGE,1,0,0,10,2,0,0,0,0,0
INTENT_ACKNOWLEDGEMENT,1,0,0,3,1,0,0,0,0,0
INTENT_ACTION,0,1,0,3,8,0,0,0,0,2
INTENT_ADVISE,0,0,0,0,2,0,0,0,0,0
INTENT_AFFIRM,0,0,0,0,2,0,0,0,0,0
INTENT_AGREEMENT,0,0,0,0,1,0,0,0,0,0
INTENT_APOLOGIZE,0,0,0,1,0,0,0,0,0,0
INTENT_APOLOGY,0,0,0,1,0,0,0,0,0,0
INTENT_ASSERT,0,0,0,0,5,0,0,0,0,0


In [4]:
from sklearn.metrics import f1_score, classification_report
import pandas as pd
from IPython.display import display

# Calculate and display F1 scores and full classification reports for each tag
for tag in original:
    y_true = original[tag]
    y_pred = predicted[tag]
    # Filter out entries where actual or predicted is None
    filtered = [(a, p) for a, p in zip(y_true, y_pred) if a is not None and p is not None]
    if not filtered:
        continue
    y_true_f, y_pred_f = zip(*filtered)
    
    # Full classification report as DataFrame
    report_dict = classification_report(y_true_f, y_pred_f, output_dict=True)
    df_report = pd.DataFrame(report_dict).T
    
    # Print tag header and display report
    print(f"### {tag} Classification Report")
    display(df_report)
    
    # Compute macro and micro F1
    macro_f1 = f1_score(y_true_f, y_pred_f, average='macro')
    micro_f1 = f1_score(y_true_f, y_pred_f, average='micro')
    print(f"{tag} Macro F1-score: {macro_f1:.3f}")
    print(f"{tag} Micro F1-score: {micro_f1:.3f}\n")


### AGE Classification Report


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
AGE_0_18,0.0,0.0,0.0,125.0
AGE_18_18,0.0,0.0,0.0,0.0
AGE_18_30,0.561125,0.874286,0.683544,525.0
AGE_18_45,0.0,0.0,0.0,0.0
AGE_30_30,0.0,0.0,0.0,0.0
AGE_30_45,0.652308,0.475336,0.549935,446.0
AGE_45_60,0.0,0.0,0.0,91.0
accuracy,0.565291,0.565291,0.565291,0.565291
macro avg,0.173347,0.192803,0.176211,1187.0
weighted avg,0.493277,0.565291,0.508957,1187.0


AGE Macro F1-score: 0.176
AGE Micro F1-score: 0.565

### GER Classification Report


Unnamed: 0,precision,recall,f1-score,support
GER_FEMALE,0.914046,0.880808,0.897119,495.0
GER_MALE,0.917827,0.941429,0.929478,700.0
accuracy,0.916318,0.916318,0.916318,0.916318
macro avg,0.915937,0.911118,0.913299,1195.0
weighted avg,0.916261,0.916318,0.916074,1195.0


GER Macro F1-score: 0.913
GER Micro F1-score: 0.916

### EMOTION Classification Report


Unnamed: 0,precision,recall,f1-score,support
EMOTION_ANG,0.5,0.388889,0.4375,36.0
EMOTION_HAP,0.428571,0.139535,0.210526,129.0
EMOTION_NEU,0.628524,0.875289,0.73166,433.0
EMOTION_SAD,0.854685,0.747492,0.797502,598.0
accuracy,0.717391,0.717391,0.717391,0.717391
macro avg,0.602945,0.537801,0.544297,1196.0
weighted avg,0.716169,0.717391,0.699518,1196.0


EMOTION Macro F1-score: 0.544
EMOTION Micro F1-score: 0.717

### INTENT Classification Report


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
INTENT_ACCEPT,0.0,0.0,0.0,1.0
INTENT_ACKNOWLEDGE,0.066667,0.076923,0.071429,13.0
INTENT_ACKNOWLEDGEMENT,0.0,0.0,0.0,5.0
INTENT_ACTION,1.0,0.071429,0.133333,14.0
INTENT_ADVISE,0.0,0.0,0.0,2.0
INTENT_AFFIRM,0.0,0.0,0.0,2.0
INTENT_AGREEMENT,0.0,0.0,0.0,1.0
INTENT_APOLOGIZE,0.0,0.0,0.0,1.0
INTENT_APOLOGY,0.0,0.0,0.0,1.0
INTENT_ASSERT,0.0,0.0,0.0,5.0


INTENT Macro F1-score: 0.058
INTENT Micro F1-score: 0.659

