In [1]:
import os 
import re

In [None]:
def extract_transcript(text, filler_words=None):
    """Reduce a raw transcript to the lower-cased words of the non-'Oth' speaker(s).

    The steps run in this order:

    1. Drop every line that starts with ``Oth:`` (case-sensitive).
    2. Lower-case the text and replace each newline with a single space.
    3. Remove every entry of ``filler_words`` where it occurs as a whole word.
    4. Remove ``(...)`` spans, then ``<...>`` spans.
    5. Remove every remaining character that is neither a word character
       nor whitespace (i.e. punctuation).

    Args:
        text: Raw transcript text.
        filler_words: Optional iterable of words/phrases to strip. Entries are
            matched literally against the already lower-cased text, so they
            should be given in lower case. Defaults to ``('pat',)`` --
            presumably the target speaker's line label; TODO confirm against
            the transcript format. Pass an empty iterable to strip nothing.

    Returns:
        The cleaned text. Whitespace left behind by removed spans is NOT
        collapsed, so the result may contain leading spaces or runs of spaces.
    """
    if filler_words is None:
        filler_words = ('pat',)

    # Must run first: the pattern relies on the original line structure
    # (^/$ with re.MULTILINE) and on the original capitalisation of 'Oth:'.
    pattern_oth_lines = r'^Oth:.*?$'
    text = re.sub(pattern_oth_lines, '', text, flags=re.MULTILINE)

    text = text.lower()
    text = text.replace('\n', ' ')  # Flatten the transcript onto one line

    # Escape each entry so regex metacharacters in a filler are taken literally.
    escaped_fillers = [re.escape(word) for word in filler_words]
    pattern_parentheses = r'\([^)]*\)'
    pattern_angle_brackets = r'<[^>]*>'
    pattern_punctuation = r'[^\w\s]'

    if escaped_fillers:
        pattern_fillers = r'\b(' + '|'.join(escaped_fillers) + r')\b'
        text = re.sub(pattern_fillers, '', text)
    text = re.sub(pattern_parentheses, '', text)
    text = re.sub(pattern_angle_brackets, '', text)
    text = re.sub(pattern_punctuation, '', text)

    return text

In [None]:
def process_folder(input_dir, output_dir, file_ending):
    """Clean every matching transcript under ``input_dir`` and save it to ``output_dir``.

    Each sub-directory of ``input_dir`` is treated as one participant. Every
    file directly inside a participant directory whose name ends with
    ``file_ending`` is read, passed through ``extract_transcript`` and written
    to ``<output_dir>/<participant_dir>.txt``.

    Args:
        input_dir: Directory containing the participant sub-directories.
        output_dir: Destination directory; created (including parents) if it
            does not exist yet.
        file_ending: Filename suffix selecting which transcripts to process,
            e.g. ``'SFT.txt'``.

    Note:
        The output filename depends only on the participant directory name, so
        if one participant has several files matching ``file_ending`` the one
        processed last overwrites the others. Entries are visited in sorted
        order so that this outcome is deterministic.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    for participant_dir in sorted(os.listdir(input_dir)):
        participant_path = os.path.join(input_dir, participant_dir)
        if not os.path.isdir(participant_path):
            continue  # Plain files next to the participant folders are ignored

        for filename in sorted(os.listdir(participant_path)):
            if not filename.endswith(file_ending):
                continue

            filepath = os.path.join(participant_path, filename)
            # Explicit encoding so results do not depend on the machine's locale.
            # NOTE(review): assumes the transcripts are UTF-8 -- confirm.
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            cleaned_text = extract_transcript(text)

            output_filepath = os.path.join(output_dir, f"{participant_dir}.txt")
            with open(output_filepath, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

In [None]:
# Locations of the raw data and of the per-task output folders.
# NOTE(review): these are machine-specific absolute paths; adjust before running elsewhere.
input_dir = "/home/fritzpeters/Documents/speech_graph/PROCESS-V1" 
output_dir_SFT = "/home/fritzpeters/Documents/speech_graph/transcripts_outputFiles/SFT_outputFiles"
output_dir_PFT = "/home/fritzpeters/Documents/speech_graph/transcripts_outputFiles/PFT_outputFiles"
output_dir_CTD = "/home/fritzpeters/Documents/speech_graph/transcripts_outputFiles/CTD_outputFiles"

# One pass over the input tree per task: (destination folder, filename suffix).
for task_output_dir, task_file_ending in (
    (output_dir_SFT, 'SFT.txt'),
    (output_dir_PFT, 'PFT.txt'),
    (output_dir_CTD, 'CTD.txt'),
):
    process_folder(input_dir, task_output_dir, task_file_ending)

In [None]:
# Stricter variant of extract_transcript: same pipeline, but with a much longer
# list of words to strip (filler words/disfluencies such as 'um', 'uh', 'erm', etc.).
# Not used yet, because we did not want to exclude filler words/disfluencies.

def clean_transcript(text):
    """Strip a transcript down to lower-case words, dropping fillers and markup.

    Lines beginning with 'Oth:' are discarded, the remainder is lower-cased and
    flattened onto a single line, and then the listed filler words, any
    '(...)' or '<...>' spans and all punctuation are deleted, in that order.
    Gaps left by the deletions are not collapsed, so the result can contain
    runs of spaces.
    """
    fillers = ['um', 'uh', 'like', 'you know', 'basically', 'so', 'like', 'actually', 'pat', 'oh', 'er', 'em', 'erm', 'a', 'an', 'the', 'these']

    # Speaker filtering has to happen while the line structure and the
    # original capitalisation of 'Oth:' are still intact.
    without_other = re.sub(r'^Oth:.*?$', '', text, flags=re.MULTILINE)
    flattened = without_other.lower().replace('\n', ' ')

    # Applied strictly in this order.
    removal_patterns = (
        r'\b(' + '|'.join(fillers) + r')\b',  # whole-word fillers
        r'\([^)]*\)',                          # text within parentheses
        r'<[^>]*>',                            # text within angle brackets
        r'[^\w\s]',                            # remaining punctuation
    )

    cleaned = flattened
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned