## Process Switchboard

This notebook contains snippets of code that detail how I processed the raw switchboard-1 corpus for fine-tuning Llama and generating n-gram models.

Vincent Danys
2024-07-15

In [None]:
import re
import os
from scripts.settings import Config
from transformers import AutoTokenizer

class Transcript:
  """A whole conversation: its identifier, its utterance lines and its topic."""

  def __init__(self, id, lines, topic="UNAVAILABLE"):
    # `lines` is stored as-is (not copied); `topic` falls back to the
    # "UNAVAILABLE" placeholder when the caller does not supply one.
    self.id, self.lines, self.topic = id, lines, topic

class Line:
  """One utterance: who said it, its identifier, its time span and its text."""

  def __init__(self, speaker, id, start, end, text):
    # who spoke and which utterance this is
    self.speaker, self.id = speaker, id
    # start / end times of the utterance
    self.start, self.end = start, end
    # the utterance text itself
    self.text = text

# meta data
# Root of the raw switchboard-1 corpus. File names are appended to it with plain
# string concatenation, so it has to end with a path separator.
SW_DIR = Config.SW_SOURCE

# output directories (also used as plain string prefixes for the output file names)
OUTPUT_WITH_SPEAKERS = Config.SW_WITH_SPEAKER_DIR
OUTPUT_NO_SPEAKERS = Config.SW_NO_SPEAKER_DIR
OUTPUT_INSTRUCT = Config.SW_INSTRUCT_NO_SPEAKERS

# Create the output directories up front. exist_ok=True keeps re-running this cell
# harmless, while genuine failures (e.g. missing permissions) are raised here
# instead of being silently swallowed and resurfacing later as write errors.
for output_dir in (OUTPUT_WITH_SPEAKERS, OUTPUT_NO_SPEAKERS, OUTPUT_INSTRUCT):
  os.makedirs(output_dir, exist_ok=True)


# speaker prefixes. To generate data with no speaker information, see below.
Speaker1 = "Speaker 1: "
Speaker2 = "Speaker 2: "

#### get_swb_file_list
The switchboard-1 directory should contain the folders `20/`, `21/`, ..., `49/`, each of which holds one sub-folder per conversation (`20/2001/`, etc.). This function returns a list of all switchboard transcript files, ordered by conversation (four files per conversation).

**Note**: this function relies on the existence of `AAREADME.text` in the switchboard directory, as that is where all the file directories are extracted from.

E.g.,:
* `return[0] = "/21/2101/sw2101A-ms98-a-trans.text"`
* `return[1] = "/21/2101/sw2101A-ms98-a-word.text"`
* `return[4] = "/21/2102/sw2102A-ms98-a-trans.text"`
* etc.


In [None]:
def get_swb_file_list():
  """Build the paths of all transcript files listed in `AAREADME.text`.

  The first 17 lines of the readme are skipped. After that the readme is read in
  groups of five lines: the first line of a group is the conversation sub-folder
  and the following four lines are the file names inside it.

  Returns:
    A flat list of path strings, four per conversation, in readme order. Each path is
    `SW_DIR + <first two characters of the sub-folder> + "/" + <sub-folder> + "/" + <file name>`.
  """
  with open(SW_DIR + "AAREADME.text", "r") as handle:
    entries = handle.read().split('\n')

  paths = []

  for offset in range(17, len(entries), 5):
    conversation = entries[offset]
    # e.g. sub-folder "2001" lives under "20/"
    prefix = SW_DIR + conversation[:2] + "/" + conversation + "/"
    paths.extend(prefix + name.strip() for name in entries[offset + 1:offset + 5])

  return paths

In [None]:
# Sanity check: get_swb_file_list() returns four files per conversation, so the
# count divided by 4 is the number of conversations found in AAREADME.text.
assert len(get_swb_file_list()) / 4 == 2438 # full, unchanged switchboard-1 dataset should pass this test

#### clean_transcript_swb
Takes the transcript, in special line objects, and cleans the text.
* Removes [laughter], [noise], and [vocalized-noise]
* Unwraps any other `[laughter-WORD]` combination, keeping the word. E.g., "[laughter-yeah]" -> "yeah"
* Converts word fragments into words. E.g., "I gue[ss]" -> "I guess"
* Removes all text between `<b_aside>` and `<e_aside>`
* Curly bracket words are treated as normal words. "{federaldes}" -> "federaldes"
* Capitalizes the start of each line.

In [None]:
# Adapted code originally written by Mai (s2324822)
def _strip_pronunciation_suffixes(text):
  """Remove every '_' plus the single character after it (e.g. 'because_1' -> 'because')."""
  underscores = [i for i, char in enumerate(text) if char == '_']
  if not underscores:
    return text

  # Keep the stretches between markers: each kept piece starts two characters
  # after the previous '_' (skipping the '_' and its one-character suffix).
  bounds = [-2] + underscores + [len(text)]
  return ''.join(text[bounds[i - 1] + 2:bounds[i]] for i in range(1, len(bounds)))


def _clean_swb_text(text):
  """Return the cleaned version of one utterance's text ('' if nothing is left)."""

  # remove speech events
  clean = re.sub(r'\[laughter\]|\[noise\]|\[vocalized-noise\]', '', text)

  # remaining bracket markup is handled token by token
  if '[' in text:
    clean_tokens = []

    for token in clean.split(' '):
      # special laughter case: "[laughter-word]" keeps the word
      token = token.replace('[laughter-', '')
      # partial completions: drop the hyphen that marks the fragment boundary
      token = re.sub(r'\-\[|\]\-', '', token)
      # whatever brackets are left are dropped, their content is kept
      token = re.sub(r'\[|\]', '', token)
      clean_tokens.append(token)

    clean = ' '.join(clean_tokens)

  # <aside> text (only removed when both tags are on this line)
  clean = re.sub(r'<b_aside>.*?<e_aside>', '', clean)

  # special pronunciation (eg. 'because_1')
  clean = _strip_pronunciation_suffixes(clean)

  clean = re.sub(' +', ' ', clean.strip())

  # curly bracket text turned into normal words
  clean = re.sub(r'[{}]', '', clean)

  # Capitalize start of line for the ease of Llama 3. Only the first character is
  # touched: str.capitalize() would also lowercase the rest of the line
  # (e.g. turning "I" into "i"), which is not wanted here.
  return clean[:1].upper() + clean[1:]


def clean_transcript_swb(transcript_lines: list[Line]):
  """Clean the text of every utterance and drop utterances that end up empty.

  Cleaning removes [laughter]/[noise]/[vocalized-noise], unwraps the remaining
  square-bracket markup, removes <b_aside>...<e_aside> spans, strips '_x'
  pronunciation suffixes, collapses repeated spaces, drops curly brackets and
  upper-cases the first character.

  Args:
    transcript_lines: the `Line` objects to clean; they are not modified.

  Returns:
    A new list of new `Line` objects (same speaker, id, start and end) holding
    the cleaned text, in the original order, without empty utterances.
  """
  clean_lines = []

  for line in transcript_lines:
    clean = _clean_swb_text(line.text)
    if clean:
      clean_lines.append(Line(line.speaker, line.id, line.start, line.end, clean))

  return clean_lines

#### get_speaker_lines
Switchboard transcribes each conversation into 2 separate files: speaker A and speaker B. The function `get_speaker_lines` converts one of these files into a list of `Line` objects.

In [None]:
def get_speaker_lines(speaker, dir):
  """Parse one single-speaker transcript file into a list of `Line` objects.

  Every non-blank line must hold four whitespace-separated fields: utterance id,
  start time, end time and the utterance text (which may itself contain spaces).
  Utterances whose text is just "[silence]" are skipped.

  Args:
    speaker: value stored as `Line.speaker` for every utterance of this file.
    dir: path of the transcript file to read (a file, despite the name).

  Returns:
    The utterances in file order, with start and end converted to float.

  Raises:
    ValueError: if a non-blank line has fewer than four fields, or if its
      start/end field is not a number.
  """
  with open(dir, "r") as file:
    raw_lines = file.read().split('\n')

  transcript_lines = []

  for line_number, raw_line in enumerate(raw_lines, start=1):

    # skip blank and whitespace-only lines
    if not raw_line.strip(): continue

    # Tabs become spaces, then the fields are split on runs of whitespace, so any
    # amount of padding between the fields is accepted (not only runs of exactly
    # five or six spaces). Spacing inside the text itself is left untouched.
    line_parts = raw_line.replace("\t", " ").split(None, 3)
    if len(line_parts) < 4:
      raise ValueError(f"{dir}:{line_number}: expected 'id start end text', got {raw_line!r}")

    utterance_id, start, end, text = line_parts
    if text.strip() == "[silence]": continue

    # convert into line object
    transcript_lines.append(Line(speaker, utterance_id, float(start), float(end), text))

  return transcript_lines

#### conversation_to_str
Takes a `Transcript` (whose `lines` are `Line` objects) and returns it in text format as a list of strings, one per utterance.
Optional settings:
* `speakers: bool` - include speaker prefixes for each line
* `text: bool` - include utterance text for each line
* `start: bool` - include utterance start time
* `end: bool` - include utterance end time
* `topic: bool` - include the conversation topic as a header for the transcript

In [None]:
def conversation_to_str(transcript, speakers=True, text=True, start=False, end=False, topic=False):
  """Render a transcript as a list of strings, one per utterance.

  Args:
    transcript: object with a `lines` sequence (items providing `speaker`, `text`,
      `start` and `end`) and, when `topic` is set, a `topic` attribute.
    speakers: prefix each line with the utterance's speaker string.
    text: include the utterance text.
    start: include the start time (two decimals) in a leading "[...] " stamp.
    end: include the end time (two decimals) in the same stamp.
    topic: emit "Topic: <topic>" as the first returned string.

  Returns:
    The rendered lines. The time stamp is "[start end] " when both times are
    requested and "[start] " / "[end] " when only one is.
  """
  str_lines = []

  if topic:
    str_lines.append(f"Topic: {transcript.topic}")

  for line in transcript.lines:

    # collect the requested times first so the brackets always come out balanced
    stamps = []
    if start:
      stamps.append(f"{line.start:.2f}")
    if end:
      stamps.append(f"{line.end:.2f}")

    parts = []
    if stamps:
      parts.append("[" + " ".join(stamps) + "] ")
    if speakers:
      parts.append(line.speaker)
    if text:
      parts.append(line.text)

    str_lines.append("".join(parts))

  return str_lines

#### swb_format_chat_template

Takes the lines of a conversation as a list of strings and formats them, together with a prompt, in the style of a conversation between a user and an AI assistant. This prepares the data for training an instruction model, like Llama-3-8B-Instruct.

In [None]:
# Text placed before (PROMPT[0]) and after (PROMPT[1]) the opening lines of the
# conversation to form the user message.
PROMPT = ["Write a phone conversation between two people. The first few lines of the conversation are:\n",
          "\n\nWrite 100 more lines."]

# Chat skeleton handed to the tokenizer's chat template. The user "content" is an
# empty placeholder here; swb_format_chat_template supplies the actual prompt text.
MESSAGES = [
  {
    "role": "system",
    "content": "You are a chatbot that writes text based on the user's input."
  },
  {
    "role": "user",
    "content": ""
  }
]

# Loaded once when this cell runs; only used for its chat template in
# swb_format_chat_template.
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME_OR_DIR)

def swb_format_chat_template(transcript_lines):
  """Wrap a conversation in the chat template as one prompt / response example.

  The first four lines go into the user prompt (between PROMPT[0] and PROMPT[1]);
  all remaining lines form the assistant's answer, introduced by a fixed sentence
  and terminated with "<|eot_id|>".

  Args:
    transcript_lines: the conversation as a list of strings, one per utterance.

  Returns:
    The formatted example split on newlines, as a list of strings.
  """
  first_four = transcript_lines[:4]
  continuation = transcript_lines[4:]

  # Work on a per-call copy so the shared module-level MESSAGES template is never
  # modified (previously every call overwrote its user content).
  messages = [dict(message) for message in MESSAGES]
  messages[1]["content"] = PROMPT[0] + '\n'.join(first_four) + PROMPT[1]

  content = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  content += "Here is the continuation of the conversation:\n\n" + '\n'.join(continuation) + "<|eot_id|>"

  return content.split('\n')

In [None]:
def str_lines_to_file(filename, str_lines):
  """Write each string of `str_lines` to `filename`, one per line.

  The file is created or overwritten, and every string is followed by a newline.
  """
  with open(filename, "w") as handle:
    handle.writelines(entry + "\n" for entry in str_lines)

#### main
Gets the transcript file names from `get_swb_file_list` (which reads `AAREADME.text`). Reads the transcript files, cleans up the transcripts, and writes them to the output directories: with speakers, with no speakers, and in instruction format.

In [None]:
def main():
  """Convert every conversation into the three output formats.

  For each conversation the two per-speaker transcript files are read, merged,
  sorted by start time and cleaned. The result is then written as
  `<conversation id>.text` into the with-speakers, no-speakers and instruct
  output directories.
  """
  paths = get_swb_file_list()

  # four files per conversation: [0] speaker A transcript, [2] speaker B
  # transcript; [1] and [3] are the word-by-word transcripts, which are not used
  for offset in range(0, len(paths), 4):

    transcript_id = paths[offset].split("/")[-2]
    speaker_a_dir, speaker_b_dir = paths[offset], paths[offset + 2]

    # merge both speakers into one conversation ordered by start time
    utterances = get_speaker_lines(Speaker1, speaker_a_dir) + get_speaker_lines(Speaker2, speaker_b_dir)
    transcript = Transcript(transcript_id, utterances)
    transcript.lines.sort(key=lambda x: x.start)

    # clean transcript
    transcript.lines = clean_transcript_swb(transcript.lines)

    out_name = transcript_id + ".text"

    # transcript with speakers
    with_speakers = conversation_to_str(transcript, speakers=True)
    str_lines_to_file(OUTPUT_WITH_SPEAKERS + out_name, with_speakers)

    # transcript without speakers
    without_speakers = conversation_to_str(transcript, speakers=False)
    str_lines_to_file(OUTPUT_NO_SPEAKERS + out_name, without_speakers)

    # transcript with instruction (built from the speaker-less rendering)
    str_lines_to_file(OUTPUT_INSTRUCT + out_name, swb_format_chat_template(without_speakers))
    

In [None]:
# Run the whole pipeline: reads every conversation and writes the three output variants.
main()