In [19]:
from typing import Final

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import copy

from utils import join_names, insert_sentence_after_period
from utils.persistent import save_nl2sf, load_nl2sf, save_smcdel_sf, load_smcdel_sf
from models.openai_llm import OpenAILLM
from utils.prompt import get_prompt
from executors.smcdel import SMCDEL

In [20]:
model_name: Final[str] = "gpt-3.5-turbo"
random_state: Final[int] = 1
sample_size: Final[int] = 20

# Data

## 1. Load Data

In [21]:
dataset = load_dataset("sileod/mindgames", cache_dir='./data')
train = pd.DataFrame(dataset['train'])
val = pd.DataFrame(dataset['validation'])
test = pd.DataFrame(dataset['test'])

## 2. Data Exploration

data showcase

In [22]:
train.head(1)

Unnamed: 0,premise,smcdel_problem,n_announcements,pbcheck,hypothesis,setup,hypothesis_depth,n_agents,label,names,index,s-l,deberta_pred,deberta_confidence,difficulty
0,There are four persons. Everyone is visible to...,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",2,"VARS 0,1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 A...",Nichole can now know whether Aubrey can know w...,internal,1,4,entailment,"[Aubrey, Jeremy, William, Nichole]",61652,internal-1,1,0.998557,0.001443


check splitted data has same columns

In [23]:
print("Train, val, test should have same cols: ", list(train.columns) == list(val.columns) == list(test.columns))

Train, val, test should have same cols:  True


list all columns

In [24]:
list(train.columns)

['premise',
 'smcdel_problem',
 'n_announcements',
 'pbcheck',
 'hypothesis',
 'setup',
 'hypothesis_depth',
 'n_agents',
 'label',
 'names',
 'index',
 's-l',
 'deberta_pred',
 'deberta_confidence',
 'difficulty']

setup types/category

In [25]:
train.value_counts('setup')

setup
internal           2829
forehead           2800
explicit           2793
forehead_mirror    2752
Name: count, dtype: int64

## 3. Data Preprocessing

In [26]:
def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the data and only keep the necessary columns
    :param data 
    :return: 
    """

    data = copy.deepcopy(data)

    # Preprocess context and question
    data['formatted_names'] = data['names'].apply(lambda x: join_names(x))
    data['formatted_names'] = data['formatted_names'].apply(lambda x: f"Their names are {x}.")
    data['context'] = data[['premise', 'formatted_names']].apply(
        lambda x: insert_sentence_after_period(x['premise'], x['formatted_names']), axis=1)

    # Preprocess smcdel
    data['target_sf'] = data['smcdel_problem']

    # label
    data['target_label'] = data['label'].apply(lambda x: True if x == 'entailment' else False)

    # setup
    data['setup'] = data['setup'].astype('category')

    # Drop unnecessary columns
    keep_columns = ['setup', 'context', 'hypothesis', 'target_sf', 'target_label', 'n_announcements']
    data = data[keep_columns]

    return data


train = preprocess(train)
val = preprocess(val)
test = preprocess(test)

preprocessed data showcase

In [27]:
train.head(5)

Unnamed: 0,setup,context,hypothesis,target_sf,target_label,n_announcements
0,internal,There are four persons. Their names are Aubrey...,Nichole can now know whether Aubrey can know w...,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",True,2
1,internal,"There are four persons. Their names are Maria,...",Maria can now know that Ernestine can know whe...,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",False,2
2,internal,"There are two persons. Their names are Donna, ...",Gabriel can now know that someone is thirsty.,"VARS 1,2 LAW Top OBS Agenta:1 Agentb:2 VALID? ...",True,2
3,forehead_mirror,"There are three persons. Their names are Leon,...",Kevin can now know that Tracie can know that n...,"VARS 1,2,3 LAW Top OBS Agenta:1,2,3 Agentb:1,2...",False,1
4,forehead,"There are four persons. Their names are John, ...",Gregory can now know that Chasity can know tha...,"VARS 1,2,3,4 LAW Top OBS Agenta:2,3,4 Agentb:1...",False,1


## 4. Prompt Example preparation 

In [28]:
internal_sample = train[train['setup'] == 'internal'].sample(1, random_state=random_state).iloc[0]
forehead_sample = train[train['setup'] == 'forehead'].sample(1, random_state=random_state).iloc[0]
explicit_sample = train[train['setup'] == 'explicit'].sample(1, random_state=random_state).iloc[0]
forehead_mirror_sample = train[train['setup'] == 'forehead_mirror'].sample(1, random_state=random_state).iloc[0]


def choose_example(setup: str) -> pd.Series:
    """
    Choose an example from the train set based on the setup
    :param setup: 
    :return: 
    """
    if setup == 'internal':
        return internal_sample
    elif setup == 'forehead':
        return forehead_sample
    elif setup == 'explicit':
        return explicit_sample
    elif setup == 'forehead_mirror':
        return forehead_mirror_sample
    else:
        raise ValueError(f"Invalid setup: {setup}")


# ToM-LM Model

## 1. Model Initialization

In [29]:
model = OpenAILLM(model_name=model_name)

## 2. Test Samples Preparation

In [30]:
samples = test.sample(sample_size, random_state=random_state)

# Add columns for the predicted sf, predicated_label
samples['predicted_sf'] = None
samples['predicted_label'] = None

## 3. SF Prediction/Generation

In [31]:
def nl2sf_predict(row: pd.Series):
    setup = row['setup']
    example = choose_example(setup)
    context = row['context']
    hypothesis = row['hypothesis']
    prompt = get_prompt(
        example_context=example['context'],
        example_hypothesis=example['hypothesis'],
        example_sf=example['target_sf'],
        problem_context=context,
        problem_hypothesis=hypothesis
    )
    sf = model.complete(prompt)
    row['predicted_sf'] = sf
    return row


for i, (index, row) in enumerate(samples.iterrows()):
    print(f"Processing sample {i + 1}/{len(samples)}")
    samples.loc[index] = nl2sf_predict(row)
save_nl2sf(samples)

Processing sample 1/20
Processing sample 2/20
Processing sample 3/20
Processing sample 4/20
Processing sample 5/20
Processing sample 6/20
Processing sample 7/20
Processing sample 8/20
Processing sample 9/20
Processing sample 10/20
Processing sample 11/20
Processing sample 12/20
Processing sample 13/20
Processing sample 14/20
Processing sample 15/20
Processing sample 16/20
Processing sample 17/20
Processing sample 18/20
Processing sample 19/20
Processing sample 20/20


## 4. SMCDEL Prediction

In [32]:
samples = load_nl2sf()
samples.head(1)

Unnamed: 0,setup,context,hypothesis,target_sf,target_label,n_announcements,predicted_sf,predicted_label
3667,explicit,There are three persons. Their names are Dorot...,Dorothy can now know whether or not everyone p...,"VARS 1,2,3 LAW Top OBS Agenta:1,2 Agentb:2,3 A...",False,0,"VARS 1,2,3 LAW Top OBS Agenta:1,2,3 Agentb:1,2...",


In [33]:
def sf_predict(row: pd.Series):
    sf = row['predicted_sf']
    try:
        result = SMCDEL(text=sf)
    except ValueError as e:
        result = None
    row['predicted_label'] = result
    return row


samples = samples.apply(sf_predict, axis=1)
save_smcdel_sf(samples)

In [34]:
load_smcdel_sf()

Unnamed: 0,setup,context,hypothesis,target_sf,target_label,n_announcements,predicted_sf,predicted_label
3667,explicit,There are three persons. Their names are Dorot...,Dorothy can now know whether or not everyone p...,"VARS 1,2,3 LAW Top OBS Agenta:1,2 Agentb:2,3 A...",False,0,"VARS 1,2,3 LAW Top OBS Agenta:1,2,3 Agentb:1,2...",True
2824,explicit,There are four persons. Their names are Pamela...,Bettye can now know that Pamela can know wheth...,"VARS 1,2,3,4 LAW Top OBS Agenta:1,4 Agentb:3 A...",False,3,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",
2395,forehead_mirror,"There are four persons. Their names are Lewis,...",Lewis can now know that Lawrence can know whet...,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",True,0,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 VALID?...",
420,forehead_mirror,"There are four persons. Their names are Sarah,...",Alfred can now know that Boyd can know whether...,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",True,2,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",
2313,forehead,"There are two persons. Their names are Alan, a...",Scott can now know whether Alan's forehead is ...,"VARS 1,2 LAW Top OBS Agenta:2 Agentb:1 VALID? ...",True,2,"VARS 1,2 LAW Top OBS Agenta:1 Agentb:2 VALID? ...",
1828,internal,There are four persons. Their names are Benjam...,Luther can now know that Richard can know that...,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",True,2,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",False
364,forehead,There are four persons. Their names are Arthur...,John can now know whether Buffy can know wheth...,"VARS 1,2,3,4 LAW Top OBS Agenta:2,3,4 Agentb:1...",True,1,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",True
3186,forehead,There are four persons. Their names are Barbar...,Barbara can now know whether Carol can know wh...,"VARS 1,2,3,4 LAW Top OBS Agenta:2,3,4 Agentb:1...",False,1,"VARS 1,2,3,4 LAW Top OBS Agenta:1,2,3,4 Agentb...",True
3221,forehead,There are three persons. Their names are Rache...,Kenneth can now know whether Kenneth's forehea...,"VARS 1,2,3 LAW Top OBS Agenta:2,3 Agentb:1,3 A...",False,2,"VARS 1,2,3 LAW Top OBS Agenta:1,2,3 Agentb:1,2...",
2899,internal,There are four persons. Their names are Christ...,Michael can now know whether Michael is thirsty.,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",True,1,"VARS 1,2,3,4 LAW Top OBS Agenta:1 Agentb:2 Age...",True


# Evaluation

In [105]:
def execution_rate(df: pd.DataFrame) -> dict:
    """
    Calculate the execution rate
    :param df: 
    :return: 
    """
    drop_count = df['predicted_label'].isnull().any(axis="rows").sum()
    keep_count = len(df) - drop_count
    return {
        'execution_count': keep_count,
        'execution_rate': f"{(keep_count / len(df)) * 100:.2f}%",
        'drop_count': drop_count,
        'drop_rate': f"{(drop_count / len(df)) * 100:.2f}%"
    }


def metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the metrics
    :param df: 
    :return: 
    """
    # Remove the rows with unexpected results
    df = df.dropna(axis="rows", how="any").copy()
    # make it to bool
    df['predicted_label'] = df['predicted_label'].astype(bool, copy=True)
    # metrics group by setup
    accuracy = df.groupby('setup').apply(
        lambda x: accuracy_score(x['target_label'], x['predicted_label']),
        include_groups=False
    )
    precision = df.groupby('setup').apply(
        lambda x: precision_score(x['target_label'], x['predicted_label'], zero_division=0),
        include_groups=False
    )
    recall = df.groupby('setup').apply(
        lambda x: recall_score(x['target_label'], x['predicted_label'], zero_division=0),
        include_groups=False
    )
    f1 = df.groupby('setup').apply(
        lambda x: f1_score(x['target_label'], x['predicted_label'], zero_division=0),
        include_groups=False
    )
    m = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    mdf = pd.DataFrame(m)
    mdf.loc['overall'] = {
        'accuracy': accuracy_score(df['target_label'], df['predicted_label']),
        'precision': precision_score(df['target_label'], df['predicted_label'], zero_division=0),
        'recall': recall_score(df['target_label'], df['predicted_label'], zero_division=0),
        'f1': f1_score(df['target_label'], df['predicted_label'], zero_division=0)
    }
    return mdf

In [106]:
execution_rate(samples)

{'execution_count': 19,
 'execution_rate': '95.00%',
 'drop_count': 1,
 'drop_rate': '5.00%'}

In [107]:
metrics(samples)

Unnamed: 0_level_0,accuracy,precision,recall,f1
setup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
explicit,0.0,0.0,0.0,0.0
forehead,0.333333,0.333333,1.0,0.5
forehead_mirror,0.5,0.0,0.0,0.0
internal,0.666667,1.0,0.5,0.666667
overall,0.384615,0.375,0.5,0.428571
