In [1]:
import re
import csv
import os
import webvtt

# Function to parse VTT file
def parse_vtt(file_path):
    segments = []
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
        speaker, time, utterance = None, None, []
        
        for line in content:
            line = line.strip()
            # Match time format (e.g., 00:01:16.000 --> 00:01:18.000)
            time_match = re.match(r'\d{2}:\d{2}:\d{2}', line)
            if time_match:
                if speaker and utterance:
                    segments.append((speaker, time, ' '.join(utterance)))
                    utterance = []
                time = line
            # Match speaker identifier (e.g., SPK_1)
            elif re.match(r'SPK_\d+', line):
                if speaker and utterance:
                    segments.append((speaker, time, ' '.join(utterance)))
                    utterance = []
                speaker = line
            # Collect utterances
            elif line:
                utterance.append(line)
        # Append the last segment
        if speaker and utterance:
            segments.append((speaker, time, ' '.join(utterance)))
    return segments

# Function to segment the dialogue into different topics
def segment_dialogue(segments):
    segmented_dialogues = []
    current_segment = []
    
    for speaker, time, utterance in segments:
        # If the utterance contains a question or indicates a topic shift, start a new segment
        if re.search(r'\b(why|what|how|when|do you|let\'s talk about)\b', utterance, re.IGNORECASE):
            if current_segment:
                segmented_dialogues.append(current_segment)
                current_segment = []
        current_segment.append((speaker, time, utterance))
    if current_segment:
        segmented_dialogues.append(current_segment)
    
    return segmented_dialogues


# Function to write segments to CSV
def write_segments_to_csv(segments, output_folder, base_filename):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for idx, segment in enumerate(segments):
        output_file = os.path.join(output_folder, f"{base_filename}_segment_{idx+1}.csv")
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Speaker', 'Time', 'Utterance'])
            for speaker, time, utterance in segment:
                writer.writerow([speaker, time, utterance])

# Main processing function
def process_vtt_files(file_paths, output_folder='segmented_dialogues'):
    for file_path in file_paths:
        base_filename = os.path.splitext(os.path.basename(file_path))[0]
        segments = parse_vtt(file_path)
        segmented_dialogues = segment_dialogue(segments)
        write_segments_to_csv(segmented_dialogues, output_folder, base_filename)

# List of VTT files to process
vtt_files = [
    '/mnt/data/WithMod1.vtt',
    '/mnt/data/WithMod2.vtt',
    '/mnt/data/WithMod3.vtt',
    '/mnt/data/WithMod4.vtt',
    '/mnt/data/WithMod5.vtt',
    '/mnt/data/WithoutMod1.txt',
    '/mnt/data/WithoutMod2.txt',
    '/mnt/data/WithoutMod3.vtt',
    '/mnt/data/WithoutMod4.vtt',
    '/mnt/data/WithoutMod5.vtt'
]

# Run the processing
process_vtt_files(vtt_files)


ModuleNotFoundError: No module named 'webvtt'

In [4]:
import re
import csv

file = '/Users/gaowei/Desktop/UK\ Visit\ Projects🇬🇧/L2Moderator/Unmergered_data/WithMod1.vtt '
opened_file = open(file,encoding='utf8')
content = opened_file.read()
segments = content.split('\n\n') # split on double line

# wrangle segments
m = re.compile(r"\<.*?\>") # strip/remove unwanted tags
o = re.compile(r"\.+\d+") # strip/remove miliseconds

def clean(content):
    new_content = m.sub('',content)
    new_content = o.sub('',new_content)
    new_content = new_content.replace('align:start position:0%','')
    new_content = new_content.replace('-->','')
    return new_content

new_segments = [clean(s) for s in segments if len(s)!=0][2:]

# trim time codes for g suite plain text formatting conversion to seconds w/ formula '=value(str*24*3600)'
def clean_time(time):
    time = time.split(':')
    if time[0]=='00':
        return time[1]+':'+time[2]
    if not time[0]=='00':
        return time[0]+':'+time[1]+':'+time[2]

trimmed_segments = []
for segment in new_segments:
    split_segment = segment.split()
    time_code = split_segment[0]
    text = ' '.join(segment.split()[2:])
    trimmed_segment = (time_code, text)
    trimmed_segments.append(trimmed_segment)

# write output as csv file
with open(str(file)[:-3]+'csv', 'w', encoding='utf8', newline='') as f:
    for line in trimmed_segments:
        thewriter = csv.writer(f)
        thewriter.writerow(line)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/gaowei/Desktop/UK\\ Visit\\ Projects🇬🇧/L2Moderator/Unmergered_data/WithMod1.vtt '

In [5]:
filenames_vtt = [os.fsdecode(file) for file in os.listdir(os.getcwd()) if os.fsdecode(file).endswith(".vtt")]

#Check file names
filenames_vtt[:2]

# Then, we write a function to extract the information and store it.
import webvtt
def convert_vtt(filenames):    
    #create an assets folder if one does not yet exist
    if os.path.isdir('{}/assets'.format(os.getcwd())) == False:
        os.makedirs('assets')
    #extract the text and times from the vtt file
    for file in filenames:
        captions = webvtt.read(file)
        text_time = pd.DataFrame()
        text_time['text'] = [caption.text for caption in captions]
        text_time['start'] = [caption.start for caption in captions]
        text_time['stop'] = [caption.end for caption in captions]
        text_time.to_csv('assets/{}.csv'.format(file[:-4]),index=False) #-4 to remove '.vtt'
        #remove files from local drive
        os.remove(file)

ModuleNotFoundError: No module named 'webvtt'

### Segemnation dialogues