In [1]:
import os 
import re

In [31]:
def clean_transcript(text, filler_words=None):
  """Lower-case a transcript and strip fillers, bracketed annotations and punctuation.

  Steps, in order:
    1. lower-case the text;
    2. delete every whole-word occurrence of a filler word/phrase;
    3. delete ``( ... )`` and ``< ... >`` spans, delimiters included;
    4. delete every character that is neither a word character nor
       whitespace (underscores count as word characters and are kept);
    5. collapse the runs of horizontal whitespace left behind by the
       deletions, trim spaces at line edges, and keep line breaks.

  Args:
    text: Raw transcript as a string.
    filler_words: Optional iterable of words/phrases to delete instead of
      the built-in list. Entries are lower-cased and matched as literal
      text; an empty iterable disables filler removal.

  Returns:
    The cleaned transcript as a string.
  """
  if filler_words is None:
    filler_words = ['um', 'uh', 'like', 'you know', 'basically', 'so', 'actually', 'pat', 'oh', 'er', 'em', 'erm', 'a', 'an', 'the', 'these']

  text = text.lower()

  # De-duplicate, then try longer fillers first so a phrase such as
  # "you know" is consumed whole even if a shorter filler is its prefix.
  alternatives = sorted({w.lower() for w in filler_words if w}, key=lambda w: (-len(w), w))
  if alternatives:
    # re.escape keeps regex metacharacters in a filler from being interpreted.
    pattern_fillers = r'\b(?:' + '|'.join(re.escape(w) for w in alternatives) + r')\b'
    text = re.sub(pattern_fillers, '', text)

  pattern_parentheses = r'\([^)]*\)'
  pattern_angle_brackets = r'<[^>]*>'
  pattern_punctuation = r'[^\w\s]'

  text = re.sub(pattern_parentheses, '', text)
  text = re.sub(pattern_angle_brackets, '', text)
  text = re.sub(pattern_punctuation, '', text)

  # Every deletion above leaves its surrounding spaces in place; squeeze
  # them so tokens are separated by exactly one space. Newlines are kept.
  text = re.sub(r'[^\S\n]+', ' ', text)
  text = re.sub(r' ?\n ?', '\n', text)

  return text.strip(' ')

In [29]:
def process_folder(input_dir, output_dir, encoding="utf-8"):
  """Clean every ``*_SFT.txt`` transcript found one level below ``input_dir``.

  Each immediate subdirectory of ``input_dir`` is treated as one participant.
  Every file in it whose name ends with ``_SFT.txt`` is read, passed through
  ``clean_transcript`` and written to ``<output_dir>/<subdirectory>.txt``.

  Args:
    input_dir: Directory containing one subdirectory per participant.
    output_dir: Directory for the cleaned files; created if missing.
    encoding: Text encoding used for both reading and writing. Fixed
      explicitly so results do not depend on the platform locale.

  Note:
    The output name is derived from the subdirectory only, so if a
    subdirectory holds several ``*_SFT.txt`` files they all map to the same
    output file. Files are visited in sorted order, so the alphabetically
    last one is the one that remains.
  """
  # exist_ok avoids the check-then-create race and makes re-runs harmless.
  os.makedirs(output_dir, exist_ok=True)

  # os.listdir order is arbitrary; sorting makes runs reproducible.
  for participant_dir in sorted(os.listdir(input_dir)):
    participant_path = os.path.join(input_dir, participant_dir)
    if not os.path.isdir(participant_path):
      continue

    for filename in sorted(os.listdir(participant_path)):
      if not filename.endswith("_SFT.txt"):
        continue

      filepath = os.path.join(participant_path, filename)
      with open(filepath, 'r', encoding=encoding) as f:
        text = f.read()
      cleaned_text = clean_transcript(text)

      output_filepath = os.path.join(output_dir, f"{participant_dir}.txt")
      with open(output_filepath, 'w', encoding=encoding) as f:
        f.write(cleaned_text)

In [32]:
# Example usage: clean every participant's *_SFT.txt transcript in one pass.
# NOTE(review): both paths are absolute and specific to one machine; edit
# them before running this cell anywhere else.
input_dir = "/home/fritzpeters/Documents/speech_graph/PROCESS-V1" 
output_dir = "/home/fritzpeters/Documents/speech_graph/SFT_transcripts"

# For each subfolder of input_dir that contains a *_SFT.txt file, this writes
# the cleaned text to <subfolder>.txt inside output_dir.
process_folder(input_dir, output_dir)