## Import Libraries

In [20]:
from pydantic import BaseModel
from openai import AzureOpenAI
import os
import re
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np
import csv
import json
from dotenv import load_dotenv
load_dotenv()

True

## Log Probabilities Class

In [25]:
class LogprobsHandler:
    """
    A utility class for handling log probabilities, extracting key names,
    and processing structured log probabilities data.

    The class keeps no instance state; every method works purely on its
    arguments. The intended call order is ``format_logprobs`` ->
    ``process_logprobs`` (which uses ``calculate_words_probas`` and
    ``extract_key_name`` internally).
    """

    # Suffixes of the raw token text that close a key/value segment.
    _RAW_TERMINATORS = (',\n', ']\n', '"}', "'}", ',"', ",'")
    # Suffixes that close a segment once surrounding whitespace is stripped.
    _STRIPPED_TERMINATORS = (',', '}')
    # Purely structural tokens that never become part of a segment.
    _BRACES = ('{', '}')

    def prob_to_logprob(self, prob: float) -> float:
        """
        Converts a probability to its logarithmic probability.

        Args:
            prob (float): The probability value.

        Returns:
            float: The natural logarithm of ``prob``.
        """
        return np.log(prob)

    def logprob_to_prob(self, logprob: float) -> float:
        """
        Converts a logarithmic probability back to a probability.

        Args:
            logprob (float): The logarithmic probability.

        Returns:
            float: The corresponding probability (``exp(logprob)``).
        """
        return np.exp(logprob)

    def extract_key_name(self, key: str):
        """
        Extracts the key name from a structured key-value string.

        The key name is the first run of non-quote characters that is
        immediately followed by a closing double quote and a colon,
        e.g. ``name`` in ``{"name":"Science Fair","``.

        Args:
            key (str): The string containing the key.

        Returns:
            str or None: The extracted key name, or None if not found.
        """
        match = re.search(r'([^"]+)"\s*:', key)
        return match.group(1) if match else None

    def calculate_words_probas(self, logprobs_formatted: List[Dict]) -> List[Tuple[str, float]]:
        """
        Computes the probabilities of word sequences based on log probabilities.

        Tokens are grouped into segments. A segment is closed by any token
        that ends with one of the terminator suffixes (a comma, a closing
        brace, ``]`` followed by a newline, ...). Whitespace-only tokens and
        bare ``{`` / ``}`` tokens are left out of the segment text and do not
        contribute to its probability. Tokens that follow the last
        terminator are not returned.

        Args:
            logprobs_formatted (List[Dict]): A list of dictionaries containing
                at least the keys ``'token'`` and ``'logprob'``.

        Returns:
            List[Tuple[str, float]]: One ``(segment_text, probability)`` tuple
            per segment, where the probability is ``exp`` of the summed token
            log probabilities.
        """
        segments = []
        segment_tokens = []
        segment_logprobs = []
        for entry in logprobs_formatted:
            text = str(entry['token'])
            stripped = text.strip()
            if stripped and stripped not in self._BRACES:
                segment_tokens.append(text)
                segment_logprobs.append(entry['logprob'])
            # The terminator token itself (when kept above) belongs to the
            # segment it closes, so this check runs after the append.
            if text.endswith(self._RAW_TERMINATORS) or stripped.endswith(self._STRIPPED_TERMINATORS):
                if segment_tokens:
                    segments.append((segment_tokens, segment_logprobs))
                segment_tokens, segment_logprobs = [], []

        # Summing in log space and exponentiating once equals multiplying the
        # individual token probabilities.
        return [
            (''.join(tokens), self.logprob_to_prob(sum(token_logprobs)))
            for tokens, token_logprobs in segments
        ]

    def format_logprobs(self, logprobs) -> List[Dict]:
        """
        Formats raw log probability data into a structured dictionary format.

        Args:
            logprobs: Iterable of objects exposing ``token``, ``logprob`` and
                ``top_logprobs`` attributes (each ``top_logprobs`` item in
                turn exposing ``token`` and ``logprob``). ``None`` is treated
                as an empty sequence.

        Returns:
            List[Dict]: One dictionary per token with the keys ``'token'``,
            ``'logprob'`` and ``'log_topprobs'``.
        """
        return [
            {
                'token': item.token,
                'logprob': item.logprob,
                'log_topprobs': [
                    {'token': alternative.token, 'logprob': alternative.logprob}
                    # Tolerate a missing alternatives list instead of failing.
                    for alternative in (item.top_logprobs or [])
                ],
            }
            for item in (logprobs or [])
        ]

    def process_logprobs(self, logprobs_formatted: List[Dict],
                         nested_keys_dct: Dict[str, List[str]] = None) -> Dict[str, float]:
        """
        Processes formatted log probabilities into structured confidence scores per key field.

        Args:
            logprobs_formatted (List[Dict]): A list of formatted log probability dictionaries.
            nested_keys_dct (Dict[str, List[str]], optional): Maps the name of an
                aggregate field to the field names it is built from. Each name is
                matched as a literal, case-insensitive substring of the extracted
                field names; the probabilities of all matching rows are multiplied
                and stored under the aggregate name. Entries with no (non-empty)
                names are ignored.

        Returns:
            Dict[str, float]: A dictionary of field names mapped to their aggregated
            probability scores, rounded to 4 decimals. Segments without a
            recognisable key name are dropped; if a field name occurs more than
            once, the last occurrence wins.
        """
        pair_probs = self.calculate_words_probas(logprobs_formatted)
        if not pair_probs:
            return {}

        pair_df = pd.DataFrame(pair_probs, columns=['key_value_pair', 'agg_tokens_proba'])
        pair_df['field_name'] = pair_df['key_value_pair'].apply(self.extract_key_name)
        # .copy() so the assignments below act on an independent frame rather
        # than on a filtered view of the original.
        pair_df = pair_df[pair_df['field_name'].notna()].copy()
        if pair_df.empty:
            return {}

        if nested_keys_dct is not None:
            for nested_key_name, nested_key_values in nested_keys_dct.items():
                literal_names = [name for name in (nested_key_values or []) if name]
                if not literal_names:
                    # An empty pattern would match every row and fabricate a
                    # meaningless aggregate.
                    continue
                # Escape the names so characters such as '.' or '(' are
                # matched literally instead of being read as regex syntax.
                nested_key_str = '|'.join(re.escape(name) for name in literal_names)
                nested_rows = pair_df[pair_df['field_name'].str.contains(nested_key_str, case=False)]
                if len(nested_rows) > 0:
                    # The aggregate row is appended to the same frame, so a
                    # later entry can also match a name added here.
                    new_row = pd.DataFrame({
                        'key_value_pair': [' '.join(nested_rows['key_value_pair'])],
                        'agg_tokens_proba': [nested_rows['agg_tokens_proba'].prod()],
                        'field_name': [nested_key_name]
                    })
                    pair_df = pd.concat([pair_df, new_row], axis=0, ignore_index=True)
        pair_df['agg_tokens_proba'] = pair_df['agg_tokens_proba'].round(4)

        return pair_df.set_index('field_name')['agg_tokens_proba'].to_dict()

In [30]:
logprobs_handler = LogprobsHandler()

## Step by Step: How does the log probabilities class work? 

In the steps below, we walk through the class one step at a time.

### Step 1: Format API Response

The first function to be called from the LogprobsHandler() class is format_logprobs. This will format the raw API response probability data into a structured dictionary format.

Note: the output looks something like this:

[{'token': '{"', 'logprob': -1.9361265e-07, 'log_topprobs': []}, {'token': 'name', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":"', 'logprob': 0.0, 'log_topprobs': []}, {'token': 'Science', 'logprob': -0.008698363, 'log_topprobs': []}, {'token': ' Fair', 'logprob': -6.392203e-06, 'log_topprobs': []}, {'token': '","', 'logprob': -0.034407083, 'log_topprobs': []}, {'token': 'date', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":"', 'logprob': 0.0, 'log_topprobs': []}, {'token': '202', 'logprob': -0.0017659782, 'log_topprobs': []}, {'token': '3', 'logprob': -0.0001307143, 'log_topprobs': []}, {'token': '-', 'logprob': 0.0, 'log_topprobs': []}, {'token': '10', 'logprob': -0.03874472, 'log_topprobs': []}, {'token': '-', 'logprob': 0.0, 'log_topprobs': []}, {'token': '06', 'logprob': -1.0780874, 'log_topprobs': []}, {'token': '","', 'logprob': -9.6629374e-05, 'log_topprobs': []}, {'token': 'participants', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":["', 'logprob': -9.0883464e-07, 'log_topprobs': []}, {'token': 'Alice', 'logprob': 0.0, 'log_topprobs': []}, {'token': '","', 'logprob': -3.1281633e-07, 'log_topprobs': []}, {'token': 'Bob', 'logprob': 0.0, 'log_topprobs': []}, {'token': '"]', 'logprob': -5.5122365e-07, 'log_topprobs': []}, {'token': '}', 'logprob': -1.9361265e-07, 'log_topprobs': []}]

All of the words are separated into 'tokens', and we will need to process the data to get the confidence scores for each of the fields.

In [32]:
# Azure OpenAI client; the endpoint, API key and deployment name are read
# from environment variables.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_ENDPOINT"),
    api_key=os.environ.get("OPENAI_API_KEY"),
    azure_deployment=os.environ.get("OPENAI_MODEL"),
    api_version="2024-10-21",
)


# Schema the model is asked to fill in (passed as response_format below).
class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]


# logprobs=True asks the API to return per-token log probabilities alongside
# the parsed response.
response_raw = client.beta.chat.completions.parse(
    messages=[{'role': 'user', 'content': 'Alice and Bob are going to a science fair on Friday.'}],
    model=os.environ.get("OPENAI_MODEL"),
    logprobs=True,
    response_format=CalendarEvent,
)

# hasattr() alone is not a sufficient guard: the logprobs field (and its
# content) can be present but None, which would raise on `.content` or when
# iterating. Fall back to an empty list in both cases.
choice_logprobs = getattr(response_raw.choices[0], 'logprobs', None)
response_logprobs = (choice_logprobs.content or []) if choice_logprobs is not None else []

logprobs_formatted = logprobs_handler.format_logprobs(response_logprobs)

print(logprobs_formatted)

[{'token': '{"', 'logprob': -1.9361265e-07, 'log_topprobs': []}, {'token': 'name', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":"', 'logprob': 0.0, 'log_topprobs': []}, {'token': 'Science', 'logprob': -0.008698363, 'log_topprobs': []}, {'token': ' Fair', 'logprob': -6.392203e-06, 'log_topprobs': []}, {'token': '","', 'logprob': -0.034407083, 'log_topprobs': []}, {'token': 'date', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":"', 'logprob': 0.0, 'log_topprobs': []}, {'token': '202', 'logprob': -0.0017659782, 'log_topprobs': []}, {'token': '3', 'logprob': -0.0001307143, 'log_topprobs': []}, {'token': '-', 'logprob': 0.0, 'log_topprobs': []}, {'token': '10', 'logprob': -0.03874472, 'log_topprobs': []}, {'token': '-', 'logprob': 0.0, 'log_topprobs': []}, {'token': '06', 'logprob': -1.0780874, 'log_topprobs': []}, {'token': '","', 'logprob': -9.6629374e-05, 'log_topprobs': []}, {'token': 'participants', 'logprob': 0.0, 'log_topprobs': []}, {'token': '":["', 'logprob': -9.0883464e-0

### Step 2: Process the log probs

To process the log probs we need to run a few functions:

-	calculate_words_probas: Computes the probabilities of word sequences based on log probabilities.

-	extract_key_name: Extracts the key name from a structured key-value string using a regex statement.

As you can see below, calling these two functions outputs the data into a nice data frame.

Note, the output will look something like this:

| key_value_pair                    | agg_tokens_proba | field_name   |
|------------------------------------|-----------------|-------------|
| {"name":"Science Fair","          | 0.957804        | name        |
| date":"2023-10-06","               | 0.326663        | date        |
| participants":["Alice","           | 0.999999        | participants |

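Pretty high confidence score for the name and participants fields, but not so much for the date. Note that the participants row only covers the tokens up to and including the first list element: the remaining `Bob"]` segment contains no key name, so it is filtered out of the table.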

In [36]:
# Group the tokens into key/value segments with one probability per segment.
pair_probs = logprobs_handler.calculate_words_probas(logprobs_formatted)

# Build the table in one chain: attach the key name found in each segment,
# then keep only the segments where a key name was found.
pair_df = (
    pd.DataFrame(pair_probs, columns=['key_value_pair', 'agg_tokens_proba'])
    .assign(field_name=lambda frame: frame['key_value_pair'].apply(logprobs_handler.extract_key_name))
    .dropna(subset=['field_name'])
)
display(pair_df)

Unnamed: 0,key_value_pair,agg_tokens_proba,field_name
0,"{""name"":""Science Fair"",""",0.957804,name
1,"date"":""2023-10-06"",""",0.326663,date
2,"participants"":[""Alice"",""",0.999999,participants


## Putting it all Together

The cell below runs the example end-to-end.

In [29]:
# Azure OpenAI client; the endpoint, API key and deployment name are read
# from environment variables.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_ENDPOINT"),
    api_key=os.environ.get("OPENAI_API_KEY"),
    azure_deployment=os.environ.get("OPENAI_MODEL"),
    api_version="2024-10-21",
)


# Schema the model is asked to fill in (passed as response_format below).
class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]


# logprobs=True asks the API to return per-token log probabilities alongside
# the parsed response.
response_raw = client.beta.chat.completions.parse(
    messages=[{'role': 'user', 'content': 'Alice and Bob are going to a science fair on Friday.'}],
    model=os.environ.get("OPENAI_MODEL"),
    logprobs=True,
    response_format=CalendarEvent,
)

print(response_raw)

print(response_raw.choices[0].message.content)

# hasattr() alone is not a sufficient guard: the logprobs field (and its
# content) can be present but None, which would raise on `.content` or when
# iterating. Fall back to an empty list in both cases.
choice_logprobs = getattr(response_raw.choices[0], 'logprobs', None)
response_logprobs = (choice_logprobs.content or []) if choice_logprobs is not None else []

logprobs_formatted = logprobs_handler.format_logprobs(response_logprobs)

# Per-field confidence: field name -> probability of the tokens in its segment.
confidence = logprobs_handler.process_logprobs(logprobs_formatted)

print(confidence)

data = json.loads(response_raw.choices[0].message.content)
csv_file = "output_data_with_confidence.csv"

# Explicit UTF-8 so non-ASCII field values do not depend on the platform's
# default encoding; newline="" is what the csv module expects for writers.
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    writer.writerow(["Field", "Value", "Confidence Score"])
    for field, value in data.items():
        # Fields without a computed confidence are reported as 0.00%.
        confidence_score = confidence.get(field, 0) * 100
        # Containers are written as JSON text so the cell stays parseable.
        cell_value = json.dumps(value) if isinstance(value, (list, dict)) else value
        writer.writerow([field, cell_value, f"{confidence_score:.2f}%"])

print(f"Extracted data with confidence scores saved to {csv_file}")

ParsedChatCompletion[CalendarEvent](id='chatcmpl-B0dQv4id9nFEIRxI6Uc88jqFUAHO6', choices=[ParsedChoice[CalendarEvent](finish_reason='stop', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='{"', bytes=[123, 34], logprob=-1.9361265e-07, top_logprobs=[]), ChatCompletionTokenLogprob(token='name', bytes=[110, 97, 109, 101], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='":"', bytes=[34, 58, 34], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='Science', bytes=[83, 99, 105, 101, 110, 99, 101], logprob=-0.010148544, top_logprobs=[]), ChatCompletionTokenLogprob(token=' Fair', bytes=[32, 70, 97, 105, 114], logprob=-6.392203e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='","', bytes=[34, 44, 34], logprob=-0.042743146, top_logprobs=[]), ChatCompletionTokenLogprob(token='date', bytes=[100, 97, 116, 101], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='":"', bytes=[34, 58, 34], logprob=0.0, top_logprobs=[]), Cha