In [49]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re

# NLTK data packages fetched before any tokenizing, tagging or WordNet lookup.
# Both the older and the newer resource names are requested (e.g. 'punkt' and
# 'punkt_tab') -- presumably so the notebook runs on either NLTK generation;
# confirm before pruning the list.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
)

# A loop (rather than a trailing bare call) also keeps the cell from echoing
# the final download's return value as its output.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [50]:
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the POS constant used by WordNet.

    Only the leading letter of `treebank_tag` is inspected:
    'J' -> wn.ADJ, 'V' -> wn.VERB, 'N' -> wn.NOUN, 'R' -> wn.ADV.

    Returns:
        The matching WordNet POS constant, or None for any other tag.
    """
    # Attribute names are resolved on `wn` only once a prefix matches.
    prefix_to_attr = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    for prefix, attr_name in prefix_to_attr.items():
        if treebank_tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

In [51]:
def get_synset(word, pos):
    """Retrieve the first WordNet synset for `word` with the given POS tag.

    `pos` is translated with get_wordnet_pos before the lookup.

    Returns:
        The first synset WordNet lists for the word, or None when the tag has
        no WordNet counterpart or the lookup comes back empty.
    """
    wordnet_pos = get_wordnet_pos(pos)
    if not wordnet_pos:
        return None

    candidates = wn.synsets(word, pos=wordnet_pos)
    return candidates[0] if candidates else None

In [52]:
def get_similarity_edges(text, valid_tags=('N', 'PRP', 'WP')):
    """Score the WordNet similarity between consecutive selected words of `text`.

    The text is tokenized and POS-tagged, tokens whose tag starts with one of
    `valid_tags` are kept, and every pair of neighbouring kept tokens becomes
    an edge weighted by the Wu-Palmer similarity of the two words' synsets
    (as returned by get_synset).

    Args:
        text: Text to analyse. Anything that is not a string (e.g. None, or
            NaN from a missing DataFrame cell) is treated as having no words.
        valid_tags: POS-tag prefix, or iterable of prefixes, selecting which
            tokens to keep. Defaults to nouns and pronouns.

    Returns:
        dict mapping (word1, word2) tuples of lower-cased neighbouring words
        to a float similarity. The value is 0.0 when either word has no
        synset or when no similarity is available. A pair occurring several
        times keeps the value of its last occurrence. The dict is empty when
        fewer than two tokens pass the filter.

    Note:
        With the default filter, pronouns are kept as nodes, but
        get_wordnet_pos has no WordNet POS for their tags, so every edge
        touching a pronoun is 0.0.
    """
    if not isinstance(text, str):
        return {}

    # str.startswith accepts only a str or a tuple of str.
    prefixes = (valid_tags,) if isinstance(valid_tags, str) else tuple(valid_tags)

    tagged = pos_tag(word_tokenize(text))
    filtered = [(word, pos) for word, pos in tagged if pos.startswith(prefixes)]

    # Return empty dict if similarity cannot be computed (<2 words)
    if len(filtered) < 2:
        return {}

    # Look every synset up once: each interior word takes part in two edges.
    synsets = [get_synset(word, pos) for word, pos in filtered]
    words = [word.lower() for word, _ in filtered]

    edges = {}
    for i in range(len(filtered) - 1):
        syn1, syn2 = synsets[i], synsets[i + 1]
        sim = syn1.wup_similarity(syn2) if syn1 and syn2 else None
        edges[(words[i], words[i + 1])] = sim if sim is not None else 0.0

    return edges

In [53]:
# Load the transcripts table. Point TRANSCRIPTS_CSV at your own copy of the
# file before running; later cells read its 'asr' column.
TRANSCRIPTS_CSV = '/content/ASR transcripts - Process-train_manual_vs_asr.csv'
df = pd.read_csv(TRANSCRIPTS_CSV)

In [54]:
# Preprocessing to only keep patient speech and remove diarisation markers (Pat: and Oth:)

def extract_patient_speech(text):
    """Return only the patient's speech from a diarised transcript.

    Every segment introduced by a 'Pat:' marker is captured up to the next
    'Pat:' or 'Oth:' marker, or the end of the text. Segments may span
    several lines. Each segment is stripped of surrounding whitespace and the
    non-empty ones are joined with single spaces. Text before the first
    'Pat:' marker and everything in 'Oth:' segments is dropped.

    Args:
        text: Transcript string. Non-string values (None, or NaN from an
            empty DataFrame cell) are treated as empty transcripts instead of
            raising a TypeError inside the regex.

    Returns:
        The concatenated patient segments, or '' when there are none.
    """
    if not isinstance(text, str):
        return ''

    # DOTALL lets a single patient turn run across line breaks.
    segments = re.findall(r'Pat:\s*(.*?)(?=Pat:|Oth:|$)', text, flags=re.DOTALL)
    stripped = (segment.strip() for segment in segments)
    # Skip empty turns so the joined text never contains doubled spaces.
    return ' '.join(segment for segment in stripped if segment)

# Derive a patient-only version of each transcript in the 'asr' column
df['asr_cleaned'] = df['asr'].map(extract_patient_speech)

In [55]:
# One dict of word-pair similarity edges per cleaned transcript
df['similarities'] = df['asr_cleaned'].map(get_similarity_edges)