In [None]:
# One-time setup: download the "en_core_web_lg" spaCy model package that a later
# cell loads with spacy.load("en_core_web_lg"). As the command's own output notes,
# the kernel may need a restart after a fresh install before the package loads.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from csv import excel
import pandas as pd

from pathlib import Path

# Workbook to export; every sheet becomes "<sheet name>.csv" in the working directory.
excel_path = "template_Study18_HL_HM.xlsx"

# File name without directory or extension. Later cells prepend it to their
# output file names. Path.stem (rather than split('.')[0]) stays correct when
# the path contains other dots, e.g. "./data/study.v2.xlsx".
study_name = Path(excel_path).stem

# Export each sheet as a CSV. The context manager closes the workbook handle
# once all sheets have been written.
csv_paths = []
with pd.ExcelFile(excel_path) as excel_file:
    # Get all sheet names
    sheet_names = excel_file.sheet_names

    for sheet in sheet_names:
        df = excel_file.parse(sheet)
        # The sheet name is used verbatim, including its case and any trailing spaces.
        csv_path = f"{sheet}.csv"
        df.to_csv(csv_path, index=False)
        csv_paths.append(csv_path)

# Display the list of files that were written.
csv_paths


['Conversation.csv',
 'memory 1.csv',
 'memory 2.csv',
 'Conversation Idea Units .csv',
 'Memory 1 Idea Units .csv',
 'Memory 2 Idea Units.csv',
 'Memory 1 PT.csv',
 'Memory 2 PT.csv',
 'Memory Speaker 18-1 PT.csv',
 'Memory Speaker 18-2 PT.csv']

In [None]:
import spacy

# Load the spaCy model once at module level; the functions below call this
# shared `nlp` object rather than loading the model themselves.
nlp = spacy.load("en_core_web_lg")

# Filler words. They are stripped from the edges of a segment and, in a
# multi-unit turn, a unit made up of nothing else is discarded.
# NOTE(review): the lookups in this notebook lower-case one word/token at a time
# before testing membership, so the entries "I know" (two words) and "Um"
# (capitalised) look unreachable — confirm before relying on them.
FILLERS = {
    "oh", "ok", "okay",
    "um", "uh", "ah",
    "yeah", "yep", "mhm",
    "I know", "Um",
}

# Coordinating conjunctions that may open a new clause.
COORD_CONJS = {"and", "but", "or", "so"}

# Subordinating conjunctions that may open a new clause
# (both "because" and "when" are handled).
SUBORD_CONJS = {"because", "when"}

def is_filler_only(text):
    """
    Return True if `text` consists only of filler words.

    The text is tokenized with spaCy; punctuation and whitespace tokens are
    ignored and each remaining token is lower-cased and looked up in FILLERS.

    Args:
        text: The string to test.

    Returns:
        True when at least one word token remains and every one of them is in
        FILLERS; False otherwise (including for empty or punctuation-only text).
    """
    # Only tokenization and the lexical is_punct / is_space flags are needed,
    # so build the Doc with the tokenizer alone instead of running the whole
    # pipeline (tagger, parser, NER) on every candidate unit.
    words = [
        token.text.lower()
        for token in nlp.make_doc(text)
        if not (token.is_punct or token.is_space)
    ]
    return bool(words) and all(word in FILLERS for word in words)

def extract_idea_units(text, turn_number=1):
    """
    Extracts idea units from a speaker turn while preserving original wording.

    Args:
        text: Transcript of a single turn (str). Newlines are treated as spaces.
        turn_number: Identifier of the turn; echoed back in every returned tuple.

    Returns:
        A list of tuples: (turn_number, idea_unit_text), in order of appearance.

    Segmentation. For each sentence (as detected by spaCy), token offsets are
    used to determine split boundaries based on the following heuristics:
      • A new clause is started if a coordinating conjunction (and, but, so, or)
        appears (except at the very start).
        - For "but", we always split.
        - For "and"/"so"/"or", we only split if the following token starts a clause
          (i.e., has a subject dependency) or is a subordinating marker (dep = "mark"),
          or if the next token is in SUBORD_CONJS.
      • A new clause is also started if a subordinating conjunction such as "because"
        or "when" appears (except at the very start). If "because" appears anywhere
        in the sentence, "when" is ignored as a boundary.
      • A comma followed by a token with a subject dependency marks a boundary,
        except if the comma is preceded by "like" (to keep descriptive "like"
        phrases together).

    Clean-up of each segment:
      • Any occurrence of the notation "(inaudible)" is removed.
      • Filler words are stripped from the start and end of the segment. The match
        is made on whitespace-separated words, lower-cased, so a filler with
        punctuation attached (e.g. "Um,") is not stripped.

    Post-processing, in this order:
      1. If the turn contains multiple idea units, any that are comprised solely
         of filler words (e.g., "oh", "ok", "um") are removed. If the turn is only
         one unit, it is preserved even if it is just filler.
      2. A unit consisting solely of connector words ("so", "but", "yeah", "yes",
         "no", "anyways") is merged with the following unit, or with the previous
         one when nothing follows.
      3. A unit shorter than three words is merged with the unit after it.
      4. A unit (other than the first) with no subject (nsubj / nsubjpass) is
         merged into the preceding unit.
    """
    text = text.replace("\n", " ")
    doc = nlp(text)
    idea_units = []

    # Process sentence by sentence.
    for sent in doc.sents:
        sent_text = sent.text
        sent_start = sent.start_char  # starting char offset of sentence in doc
        tokens = list(sent)

        # Check if "because" is present anywhere in the sentence.
        has_because = any(token.text.lower() == "because" for token in tokens)

        # Collect split boundaries as character indices (relative to the sentence)
        boundaries = [0]  # Always start at the beginning

        for i, token in enumerate(tokens):
            # token.idx is an offset into the whole doc; make it sentence-relative.
            rel_idx = token.idx - sent_start
            token_lower = token.text.lower()

            # Heuristic: Subordinate conjunctions (e.g., "because" or "when")
            if token_lower in SUBORD_CONJS and i > 0:
                if token_lower == "because":
                    boundaries.append(rel_idx)
                # Only split on "when" if "because" is not present in the sentence.
                elif token_lower == "when" and not has_because:
                    boundaries.append(rel_idx)

            # Heuristic: Coordinating conjunctions
            if token_lower in COORD_CONJS and i > 0:
                # Always split on "but"
                if token_lower == "but":
                    boundaries.append(rel_idx)
                else:
                    # Split if the following token starts a clause (nsubj/nsubjpass),
                    # or if the following token is a subordinating marker (dep_ == "mark"),
                    # or if the next token is in the set of subordinating conjunctions.
                    if i + 1 < len(tokens):
                        next_tok = tokens[i + 1]
                        if (
                            next_tok.dep_ in {"nsubj", "nsubjpass", "mark"}
                            or next_tok.text.lower() in SUBORD_CONJS
                        ):
                            boundaries.append(rel_idx)

            # Heuristic: Comma boundaries
            if token.text == "," and i + 1 < len(tokens):
                if tokens[i + 1].dep_ in {"nsubj", "nsubjpass"}:
                    # No split when the token immediately before the comma is "like"
                    # (or when the comma is the first token of the sentence).
                    if i > 0 and tokens[i - 1].text.lower() != "like":
                        # The boundary goes *after* the comma so it stays with the left segment.
                        boundaries.append(rel_idx + len(token.text))

        # Ensure the sentence end is included.
        if boundaries[-1] != len(sent_text):
            boundaries.append(len(sent_text))

        # Remove duplicate boundaries and sort them.
        boundaries = sorted(set(boundaries))

        # Slice the sentence text into segments based on the boundaries.
        for j in range(len(boundaries) - 1):
            segment = sent_text[boundaries[j]:boundaries[j + 1]].strip()
            # Remove the notation "(inaudible)"
            segment = segment.replace("(inaudible)", "").strip()
            # Strip leading/trailing filler tokens.
            words = segment.split()
            # drop any filler at the beginning
            while words and words[0].lower() in FILLERS:
                words.pop(0)
            # drop any filler at the end
            while words and words[-1].lower() in FILLERS:
                words.pop()
            segment = " ".join(words).strip()

            if segment:
                idea_units.append(segment)

    # The edge stripping above erases every segment of a turn that is nothing
    # but bare filler words (e.g. "Okay"), which would silently drop the turn.
    # The documented contract is that such a turn is preserved as a single
    # unit, so fall back to the turn's own wording (minus "(inaudible)").
    if not idea_units:
        whole_turn = " ".join(text.replace("(inaudible)", "").split())
        if whole_turn:
            idea_units.append(whole_turn)

    # If there are multiple idea units in the turn, remove those that are filler-only.
    if len(idea_units) > 1:
        idea_units = [iu for iu in idea_units if not is_filler_only(iu)]

    # ----------------- POST PROCESSING: Merge Standalone Connector Segments ----------------- #
    CONNECTORS = {"so", "but", "yeah", "yes", "no", "anyways"}

    def is_connector_only(segment):
        # Compare words with surrounding sentence punctuation removed.
        words = [w.strip(".,!?").lower() for w in segment.split()]
        return bool(words) and all(word in CONNECTORS for word in words)

    merged_units = []
    i = 0
    while i < len(idea_units):
        # If the current segment is a connector-only segment,
        # merge it with following connector-only segments and the first non-connector segment.
        if is_connector_only(idea_units[i]):
            group = [idea_units[i]]
            i += 1
            while i < len(idea_units) and is_connector_only(idea_units[i]):
                group.append(idea_units[i])
                i += 1
            if i < len(idea_units):
                group.append(idea_units[i])
                i += 1
                merged_units.append(" ".join(group).strip())
            else:
                # If no non-connector segment follows, merge with the previous unit if available.
                if merged_units:
                    merged_units[-1] = merged_units[-1] + " " + " ".join(group).strip()
                else:
                    merged_units.append(" ".join(group).strip())
        else:
            merged_units.append(idea_units[i])
            i += 1

    idea_units = merged_units

    # ----------------- POST PROCESSING: Merge Any Very-Short Segment With the Next ----------------- #
    merged_incomplete_units = []
    i = 0
    while i < len(idea_units):
        current_unit = idea_units[i]
        # If this unit is very short (<3 words) and there's a next unit, merge them.
        if i < len(idea_units) - 1 and len(current_unit.split()) < 3:
            current_unit = f"{current_unit} {idea_units[i + 1]}"
            merged_incomplete_units.append(current_unit.strip())
            # Skip the unit that was just absorbed; the merged result is not re-examined.
            i += 2
        else:
            merged_incomplete_units.append(current_unit)
            i += 1

    idea_units = merged_incomplete_units

    # ----------------- POST PROCESSING: Merge Segments Lacking a Subject ----------------- #
    # If an idea unit (other than the first one) does not contain a subject
    # (e.g., "nsubj" or "nsubjpass"), we merge it with the preceding unit.
    # Each unit is re-parsed on its own for this check.
    merged_subject_units = []
    for i, unit in enumerate(idea_units):
        doc_unit = nlp(unit)
        if i > 0 and not any(token.dep_ in {"nsubj", "nsubjpass"} for token in doc_unit):
            merged_subject_units[-1] = merged_subject_units[-1] + " " + unit
        else:
            merged_subject_units.append(unit)
    idea_units = merged_subject_units

    # Return each idea unit paired with the turn number.
    return [(turn_number, iu) for iu in idea_units]


# ----------------- SAMPLE USAGE ----------------- #
if __name__ == "__main__":
    # Quick smoke test: print the idea units found in each sample turn.
    demo_turn = 1
    sample_texts = ["I think I was 14"]

    for sample in sample_texts:
        print(f"Original text: {sample}")
        for unit_turn, unit_text in extract_idea_units(sample, demo_turn):
            print(f"[Turn {unit_turn}] {unit_text}")
        print("---")


Original text: I think I was 14
[Turn 1] I think I was 14
---


In [None]:
import pandas as pd
# Split every turn of the conversation sheet into idea units and save them,
# numbered continuously across turns.
input_file = "Conversation.csv"
df = pd.read_csv(input_file)

idea_units_data = []
cumulative_iu_index = 1  # Start global idea unit counter

for _, row in df.iterrows():
    turn = row["turn"]  # this cell reads the "turn" column as the turn identifier
    subject = row["subject"]
    raw_transcript = row["transcript"]

    # An empty CSV cell is read as NaN, and str(NaN) is the non-empty string
    # "nan". Test for a missing value on the raw cell, before converting it,
    # so blank transcripts are skipped instead of producing a bogus "nan" unit.
    if pd.isna(raw_transcript):
        continue

    text = str(raw_transcript).strip()
    if not text:  # whitespace-only transcript
        continue

    extracted_units = extract_idea_units(text, turn_number=turn)

    # Assign global incremental numbers for idea units across turns
    for _, iu_text in extracted_units:
        idea_units_data.append([subject, turn, cumulative_iu_index, iu_text])
        cumulative_iu_index += 1  # Increment the global idea unit index

# Create output DataFrame with cumulative numbering
output_df = pd.DataFrame(idea_units_data, columns=["Subject Pair", "Original Turn", "Idea Unit #", "Transcript"])

# Save to CSV; `study_name` is defined in the Excel export cell.
output_file = study_name + "_conversation.csv"
output_df.to_csv(output_file, index=False)

print(f"Processed idea units saved to {output_file}")

Processed idea units saved to template_Study18_HL_HM_conversation.csv


In [None]:
import pandas as pd
# Split every row of the first memory sheet into idea units and save them,
# numbered continuously across rows.
# The Excel export cell writes this sheet as "memory 1.csv" (lower-case "m");
# use that exact name so the read also works on case-sensitive filesystems.
input_file = "memory 1.csv"
df = pd.read_csv(input_file)

idea_units_data = []
cumulative_iu_index = 1  # Start global idea unit counter

for _, row in df.iterrows():
    turn = row["order"]  # this cell reads the "order" column as the turn identifier
    subject = row["subject"]
    raw_transcript = row["transcript"]

    # An empty CSV cell is read as NaN, and str(NaN) is the non-empty string
    # "nan". Test for a missing value on the raw cell, before converting it,
    # so blank transcripts are skipped instead of producing a bogus "nan" unit.
    if pd.isna(raw_transcript):
        continue

    text = str(raw_transcript).strip()
    if not text:  # whitespace-only transcript
        continue

    extracted_units = extract_idea_units(text, turn_number=turn)

    # Assign global incremental numbers for idea units across turns
    for _, iu_text in extracted_units:
        idea_units_data.append([subject, turn, cumulative_iu_index, iu_text])
        cumulative_iu_index += 1  # Increment the global idea unit index

# Create output DataFrame with cumulative numbering
output_df = pd.DataFrame(idea_units_data, columns=["Subject Pair", "Original Turn", "Idea Unit #", "Transcript"])

# Save to CSV; `study_name` is defined in the Excel export cell.
output_file = study_name + "_memory-1.csv"
output_df.to_csv(output_file, index=False)

print(f"Processed idea units saved to {output_file}")

Processed idea units saved to template_Study18_HL_HM_memory-1.csv


In [None]:
import pandas as pd
# Split every row of the second memory sheet into idea units and save them,
# numbered continuously across rows.
# The Excel export cell writes this sheet as "memory 2.csv" (lower-case "m");
# use that exact name so the read also works on case-sensitive filesystems.
input_file = "memory 2.csv"
df = pd.read_csv(input_file)

idea_units_data = []
cumulative_iu_index = 1  # Start global idea unit counter

for _, row in df.iterrows():
    turn = row["order"]  # this cell reads the "order" column as the turn identifier
    subject = row["subject"]
    raw_transcript = row["transcript"]

    # An empty CSV cell is read as NaN, and str(NaN) is the non-empty string
    # "nan". Test for a missing value on the raw cell, before converting it,
    # so blank transcripts are skipped instead of producing a bogus "nan" unit.
    if pd.isna(raw_transcript):
        continue

    text = str(raw_transcript).strip()
    if not text:  # whitespace-only transcript
        continue

    extracted_units = extract_idea_units(text, turn_number=turn)

    # Assign global incremental numbers for idea units across turns
    for _, iu_text in extracted_units:
        idea_units_data.append([subject, turn, cumulative_iu_index, iu_text])
        cumulative_iu_index += 1  # Increment the global idea unit index

# Create output DataFrame with cumulative numbering
output_df = pd.DataFrame(idea_units_data, columns=["Subject Pair", "Original Turn", "Idea Unit #", "Transcript"])

# Save to CSV; `study_name` is defined in the Excel export cell.
output_file = study_name + "_memory-2.csv"
output_df.to_csv(output_file, index=False)

print(f"Processed idea units saved to {output_file}")

Processed idea units saved to template_Study18_HL_HM_memory-2.csv
