# Extract sequence of information units from cookie theft task

In [22]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
import json
import numpy as np

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Pre-processing

In [3]:
# Read in dataframe
df = pd.read_csv('/content/ASR transcripts - Process-train_manual_vs_asr.csv') #replace with file path

In [4]:
df.head(2)

Unnamed: 0,file,asr,manual,wer,insertions,deletions,substitutions,inserted_words,deleted_words,substituted_words,Notes
0,Process-rec-057__CTD.txt,"Pat: and the girl laughing perhaps, a mother. ...","Pat: Er, two children on a stool trying to tak...",0.783951,0,123,4,[],"['er', 'two', 'children', 'on', 'a', 'stool', ...","[('woman', 'mother'), ('sandals', 'santos'), (...",
1,Process-rec-092__PFT.txt,"Pat: parcel, paper, perhaps, possibly, program...","Pat: (2 seconds) Parcel, paper, um (3 seconds)...",0.76,0,18,1,[],"['um', 'parcel', 'er', 'can', 'i', 'do', 'this...","[('programme', 'program')]",


In [5]:
# only keep transcripts for cookie theft task

df_CTD = df[df['file'].str.endswith('CTD.txt')]

In [8]:
# Preprocessing to only keep patient speech and remove diarisation markers (Pat: and Oth:)

def extract_patient_speech(text):
    # Keep only lines that start with Pat:
    patient_lines = re.findall(r'Pat:\s*(.*?)(?=Pat:|Oth:|$)', text, flags=re.DOTALL)
    # Join them into one cleaned string
    return ' '.join(line.strip() for line in patient_lines)

# Apply to the 'asr' transcripts
df_CTD['asr_cleaned'] = df_CTD['asr'].apply(extract_patient_speech)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_CTD['asr_cleaned'] = df_CTD['asr'].apply(extract_patient_speech)


## Get sequence of information units

In [18]:
# Reverse the synonyms dictionary for quick lookup
from nltk.tokenize import word_tokenize

# Dictionary of synonyms corresponding to the different CIUs

synonyms_dict = {
    "Boy": ["boy", "son", "brother", "male child"],
    "Girl": ["girl", "daughter", "sister", "female child"],
    "Woman": ["mom", "mother", "lady", "parent", "female", "adult", "grownup", "mum"],
    "Kitchen": ["kitchen", "room"],
    "Cookie": ["cookie", "biscuit", "cake", "treat"],
    "Jar": ["jar", "container", "crock", "pot"],
    "Stool": ["stool", "seat", "chair", "ladder"],
    "Sink": ["sink", "basin", "washbasin", "washbowl", "washstand", "tap"],
    "Plate": ["plate"],
    "Dishcloth": ["dishcloth", "dishrag", "rag", "cloth", "napkin", "towel"],
    "Water": ["water", "dishwater", "liquid"],
    "Window": ["window", "frame", "glass"],
    "Cupboard": ["cupboard", "closet", "shelf"],
    "Dishes": ["dish", "dishes", "cup", "cups", "counter"],
    "Curtains": ["curtain", "curtains", "drape", "drapes", "drapery", "blind", "blinds", "screen", "screens"],
    "Exterior": ["exterior", "outside", "garden", "yard", "outdoors", "backyard", "driveway", "path", "tree", "bush"],
    "Steal": ["take", "steal", "taking", "stealing"],
    "Fall": ["fall", "falling", "slip", "slipping"],
    "Wash": ["wash", "dry", "clean", "washing", "drying", "cleaning"],
    "Overflow": ["overflow", "spill", "overflowing", "spilling"]
}

# Flatten the dictionary to a reverse mapping: synonym -> information unit
synonym_to_unit = {}
for unit, synonyms in synonyms_dict.items():
    for synonym in synonyms:
        synonym_to_unit[synonym.lower()] = unit  # Convert to lowercase for case-insensitive matching

# Tokenize and extract information units from the transcript
def extract_information_units_with_synonyms(transcript):
    words = word_tokenize(transcript.lower())  # Tokenize and convert to lowercase
    words = [word.rstrip("'s") for word in words]  # Remove possessive 's

    units = []  # Store sequence of extracted information units
    for word in words:
        if word in synonym_to_unit:
            units.append(synonym_to_unit[word])  # Append the corresponding information unit

    return units

In [19]:
# Apply this function to extract IU sequences
df_CTD['sequence'] = df_CTD['asr_cleaned'].apply(extract_information_units_with_synonyms)

# Check the result
df_CTD[['asr_cleaned', 'sequence']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_CTD['sequence'] = df_CTD['asr_cleaned'].apply(extract_information_units_with_synonyms)


Unnamed: 0,asr_cleaned,sequence
0,"and the girl laughing perhaps, a mother. looks...","[Girl, Woman, Boy, Girl, Dishes, Plate]"
31,Trying to get cookies out of the cookie jar. G...,"[Cookie, Cookie, Jar, Girl, Woman, Wash, Sink,..."
68,Two kids going into a cupboard on the stool wh...,"[Cupboard, Stool, Woman, Sink, Water, Sink, Wa..."
87,uh and that's in the kitchen. that is the kitc...,"[Kitchen, Kitchen, Woman, Woman, Wash, Dishes,..."
106,It's children taking cookies from a cookie jar...,"[Steal, Cookie, Cookie, Jar, Woman, Stool, Fal..."


## Measure distance between consecutive CIUs

In [21]:
# Two dimensional coordinates of the CIUs based on the size of the original image (850 x 592)

ciu_coordinates = {
    "Boy": (295, 196),
    "Girl": (134, 354),
    "Woman": (501, 279),
    "Kitchen": (630, 300),
    "Cookie": (200, 70),
    "Jar": (200, 100),
    "Stool": (226, 438),
    "Sink": (610, 344),
    "Plate": (564, 228),
    "Dishcloth": (539, 284),
    "Water": (589, 461),
    "Window": (678, 190),
    "Cupboard": (204, 124),
    "Dishes": (765, 386),
    "Curtains": (679, 125),
    "Exterior": (694, 245),
    "Steal": (233, 105),
    "Fall": (263, 369),
    "Wash": (550, 255),
    "Overflow": (604, 376)
}

In [27]:

# euclidean distance between two adjacent words
def euclidean_distance(coord1, coord2):
    return np.linalg.norm(np.array(coord1) - np.array(coord2))

# store tuples of two connected edges (i.e., consecutive CIUs) and their distance in a dictionary
def build_ciu_edge_dict(ciu_sequence, ciu_coordinates=ciu_coordinates):
    edge_dict = {}

    for i in range(len(ciu_sequence) - 1):
        ciu1, ciu2 = ciu_sequence[i], ciu_sequence[i + 1]
        if ciu1 in ciu_coordinates and ciu2 in ciu_coordinates:
            dist = euclidean_distance(ciu_coordinates[ciu1], ciu_coordinates[ciu2])
            edge_dict[(ciu1, ciu2)] = dist
        else:
            print(f"Warning: One of the CIUs '{ciu1}' or '{ciu2}' is missing in coordinates")

    return edge_dict

In [31]:
#apply to df
df_CTD['CIU_dist'] = df_CTD['sequence'].apply(build_ciu_edge_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_CTD['CIU_dist'] = df_CTD['sequence'].apply(build_ciu_edge_dict)
