## Process Switchboard

This notebook contains snippets of code that details how I processed the raw switchboard-1 corpus for fine-tuning Llama & generating ngram models.

Vincent Danys
2024-07-15

In [39]:
import re
import os
from scripts.settings import Config

class Transcript:
  def __init__(self, id, lines, topic="UNAVAILABLE"):
    self.id = id
    self.lines = lines
    self.topic = topic

class Line:
  def __init__(self, speaker, id, start, end, text):
    self.speaker = speaker
    self.id = id
    self.start = start
    self.end = end
    self.text = text

# meta data
SW_DIR = Config.SW_SOURCE

# output directories
OUTPUT_WITH_SPEAKERS = Config.SW_WITH_SPEAKER_DIR
OUTPUT_NO_SPEAKERS = Config.SW_NO_SPEAKER_DIR

try:
  os.mkdir(OUTPUT_WITH_SPEAKERS)
  os.mkdir(OUTPUT_NO_SPEAKERS)
except: pass

# speaker prefixes. To generate data with no speaker information, see below.
Speaker1 = "Speaker 1: "
Speaker2 = "Speaker 2: "

#### get_swb_file_list
switchboard-1 should contain `20/` `21/` ... `49/` under which sit `20/2001` and etc. This function returns a list of all switchboard transcript files, sorted by conversation.

**Note**: this function relies on the existence of `AAREADME.text` in the switchboard directory, as that is where all the file directories are extracted from.

E.g.,:
* `return[0] = "/21/2101/sw2101A-ms98-a-trans.text"`
* `return[1] = "/21/2101/sw2101A-ms98-a-word.text"`
* `return[5] = "/21/2102/sw2102A-ms98-a-trans.text"`
* etc.


In [42]:
def get_swb_file_list():
  swb_files = []
  
  with open(SW_DIR + "AAREADME.text", "r") as readme_file:
    readme_lines = readme_file.read().split('\n')

  for i in range(17, len(readme_lines), 5):

    subdir = readme_lines[i]
    files = readme_lines[i+1:i+5]

    for file in files:
      swb_files.append(SW_DIR + subdir[:2] + "/" + subdir + "/" + file.strip())

  return swb_files

In [49]:
print(get_swb_file_list())

['/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2001/sw2001A-ms98-a-trans.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2001/sw2001A-ms98-a-word.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2001/sw2001B-ms98-a-trans.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2001/sw2001B-ms98-a-word.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2005/sw2005A-ms98-a-trans.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2005/sw2005A-ms98-a-word.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2005/sw2005B-ms98-a-trans.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2005/sw2005B-ms98-a-word.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1/20/2006/sw2006A-ms98-a-trans.text', '/disk/data2/s2482679/sw-llama-ngrams/scripts/../data/switchboard-1

In [43]:
assert len(get_swb_file_list()) / 4 == 2438 # full, unchanged switchboard-1 dataset should pass this test

#### clean_transcript_swb
Takes the transcript, in special line objects, and cleans the text.
* Removes [laughter], [noise], and [vocalized-noise]
* Removes any other [laughter-BLANK] combination
* Converts word fragments into words. E.g., "I gue[ss]" -> "I guess"
* Removes all text between `<b_aside>` and `<e_aside>`
* Curly bracket words are treated as normal words. "{federaldes}" -> "federaldes"
* Capitalizes the start of each line.

In [44]:
# Adapted code originally written by Mai (s2324822)
def clean_transcript_swb(transcript_lines: list[Line]):

  clean_lines = []

  for line in transcript_lines:

    text = line.text

    # remove speech events
    clean = re.sub(r'\[laughter\]|\[noise\]|\[vocalized-noise\]', '', text)

    # check if further cleaning required
    if '[' in text:
      clean_tokens = []
      tokens = clean.split(' ')

      for token in tokens:

        # special laughter case
        if '[laughter-' in token:
          token = re.sub(r'\[laughter-', '', token)

        # partial completions
        if '-[' in token or ']-' in token:
          token = re.sub(r'\-\[|\]\-', '', token)
          
        token = re.sub(r'\[|\]', '', token)
        clean_tokens.append(token)

      clean = ' '.join(clean_tokens)

    # <aside> text
    if "<b_aside>" in clean:
      clean = re.sub(r'<b_aside>.*?<e_aside>', '', clean)

    # special pronounciation (eg. 'because_1')
    found = [i for i in range(len(clean)) if clean.startswith('_', i)]
    if len(found) > 0:
      f_indxs = [-2] + found + [len(clean)]
      clean = ''.join([clean[f_indxs[i-1]+2:f_indxs[i]] for i in range(1, len(f_indxs))])

    clean = re.sub(' +', ' ', clean.strip())

    # curly bracket text turned into normal words
    clean = re.sub('\{|\}', '', clean)

    # capitalize start of line for the ease of Llama 3
    clean = clean.capitalize()
    
    if clean != "": clean_lines.append(Line(line.speaker, line.id, line.start, line.end, clean))

  return clean_lines

#### get_speaker_lines
Switchboard transcribes each conversation into 2 separate files: speaker A and speaker B. The function `get_speaker_lines` converts one of these files into a list of `Line` objects.

In [45]:
def get_speaker_lines(speaker, dir):

  with open(dir, "r") as file:
    lines = file.read().split('\n')

  transcript_lines = []

  for line in lines:

    if line == "": continue

    # remove extra spaces
    if " "*6 in line:
      line = line.replace(" "*6, " ")
      line = line.replace(" "*5, " ")
    line = line.replace("	", " ")

    # convert into line object
    line_parts = line.split(" ", 3)
    if line_parts[3] == "[silence]": continue

    transcript_lines.append(Line(
      speaker,
      line_parts[0],
      float(line_parts[1]),
      float(line_parts[2]),
      line_parts[3]
    ))

  return transcript_lines

#### conversation_to_file
Takes a list of `Line` objects and writes it in text format into a file.
Optional settings:
* `speakers: bool` - include speaker prefixes for each line
* `text: bool` - include utterance text for each line
* `start: bool` - include utterance start time
* `end: bool` - include utterance end time
* `topic: bool` - include the conversation topic as a header for the transcript

In [46]:
def conversation_to_file(transcript, file_dir, speakers=True, text=True, start=False, end=False, topic=False):

  with (open(file_dir, "w")) as file:

    if topic:
      file.write(f"Topic: {transcript.topic}\n")

    for line in transcript.lines:

      if start:
        file.write(f"[{line.start:.2f} ")

      if end:
        file.write(f"[{line.end:.2f}] ")

      if speakers:
        file.write(line.speaker)

      if text:
        file.write(line.text)
      
      file.write("\n")

#### main
Reads the filenames from `file-list.text`. Reads transcript files, cleans up the transcripts, and writes them to a directory with speakers and with no speakers.

In [47]:
def main():
  
  global get_swb_file_list

  file_list = get_swb_file_list()

  for i in range(0, len(file_list), 4):

    transcript_id = file_list[i].split("/")[-2]
    speaker_a_dir = file_list[i]
    # file_list[i+1] is the transcript word-by-word
    speaker_b_dir = file_list[i+2]
    # file_list[i+3] is the transcript word-by-word

    # get conversation transcript
    speaker_a_lines = get_speaker_lines(Speaker1, speaker_a_dir)
    speaker_b_lines = get_speaker_lines(Speaker2, speaker_b_dir)
    transcript = Transcript(transcript_id, speaker_a_lines + speaker_b_lines)
    transcript.lines.sort(key=lambda x: x.start)

    # clean transcript
    transcript.lines = clean_transcript_swb(transcript.lines)

    # write transcript with speakers
    conversation_to_file(transcript, OUTPUT_WITH_SPEAKERS + transcript_id + ".text", speakers=True)

    # write transcript without speakers
    conversation_to_file(transcript, OUTPUT_NO_SPEAKERS + transcript_id + ".text", speakers=False)
    

In [48]:
main()