# In [None]:
import csv
import json
import re
import os

def read_csv(csv_filename):
    """Read a CSV file and return all of its rows.

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        A list of rows in file order (the first row included); each row is a
        list of strings as produced by ``csv.reader``.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires: otherwise "\r\n" inside quoted fields is
    # rewritten and "\r\r\n" line endings show up as spurious blank rows.
    with open(csv_filename, 'r', newline='') as csv_file:
        return list(csv.reader(csv_file))

def read_json(json_filename):
    """Return the ``"transcript"`` entry of a JSON file.

    Args:
        json_filename: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``"transcript"`` key, or an
        empty string when that key is absent.
    """
    with open(json_filename, 'r') as handle:
        payload = json.loads(handle.read())

    # Only the transcript is of interest; everything else is discarded.
    return payload.get("transcript", "")

def calculate_duration_offset(csv_data, transcript, i,
                              aligned_dir='/Users/saanvinair/Desktop/TEDtxt'):
    """Align transcript lines with per-word CSV timings.

    The transcript is split on newlines and each non-empty line is treated as
    one sentence.  A line is normalised (hyphens become spaces, all other
    punctuation is dropped) and its words are matched case-insensitively, in
    order, against column 0 of the CSV rows.  A matched row contributes its
    start time (column 2) and end time (column 3) and is then consumed so it
    cannot be matched a second time.

    Besides returning the timings, the function writes every normalised line
    that produced at least one match to ``<aligned_dir>/<i>.txt``.

    Args:
        csv_data: Rows as produced by ``csv.reader``.  Row 0 is treated as the
            header and never matched.  The list itself is left unmodified.
        transcript: Transcript text, one sentence per line.
        i: File number; used as the base name of the aligned-lines file.
        aligned_dir: Existing directory that receives the aligned-lines file.

    Returns:
        A list with one dict per line that had at least one matched word:
        ``'duration'`` (latest end minus earliest start among the matched
        words), ``'offset'`` (earliest start) and ``'sentence'`` (the matched
        words joined by single spaces).
    """
    # Matched rows are removed while scanning; do that on a private copy so
    # the caller's list is not changed behind its back.
    rows = list(csv_data)
    lines = []
    aligned_lines = []  # normalised lines with at least one matched word
    cursor = 1  # row 0 is the CSV header

    for raw_line in transcript.split('\n'):
        if not raw_line:
            continue
        line = raw_line.strip().replace("-", " ")
        line = re.sub(r'[^\w\s]', '', line)

        matched_words = []
        sentence_start_time = None
        sentence_end_time = None

        for word in line.split():
            target = word.lower()
            # Scan forward from the cursor until this word is found.
            while cursor < len(rows):
                row = rows[cursor]
                # A blank CSV line is parsed as an empty row; it matches nothing.
                csv_word = row[0].replace("’", '').replace('"', '') if row else ''
                if csv_word.lower() != target:
                    cursor += 1
                    continue

                try:
                    start_time = float(row[2])
                    end_time = float(row[3])
                except (ValueError, IndexError):
                    # The row matches but has missing or non-numeric timings:
                    # step past it and give up on this transcript word.
                    cursor += 1
                    break

                matched_words.append(word)
                if sentence_start_time is None or start_time < sentence_start_time:
                    sentence_start_time = start_time
                if sentence_end_time is None or end_time > sentence_end_time:
                    sentence_end_time = end_time

                # Consume the row.  The cursor stays put and therefore now
                # points at the row that followed the match.
                rows.pop(cursor)
                break

            # Wrap around once the cursor has run off the end, whether the
            # word was not found or the match was the last remaining row.
            if cursor >= len(rows):
                cursor = 1

        if sentence_start_time is not None and sentence_end_time is not None:
            lines.append({
                'duration': abs(sentence_end_time - sentence_start_time),
                'offset': sentence_start_time,  # the start time doubles as the offset
                'sentence': " ".join(matched_words),
            })
            aligned_lines.append(line)

    # Record which (normalised) transcript lines could be aligned.
    aligned_filename = os.path.join(aligned_dir, f'{i}.txt')
    with open(aligned_filename, 'w') as aligned_file:
        for aligned_line in aligned_lines:
            aligned_file.write(f"{aligned_line}\n")

    return lines

def main():
    """Process file numbers 0 through 872.

    For each number the matching CSV and JSON inputs are loaded, aligned with
    ``calculate_duration_offset`` and the resulting timings are written, one
    entry per line, to the corresponding output text file.  Numbers for which
    either input file is missing are reported and skipped.
    """
    for i in range(873):
        csv_filename = f'/Users/saanvinair/Downloads/TEDcsv/output_{i}.csv'
        json_filename = f'/Users/saanvinair/Desktop/TEDjson/alignment_result_{i}.json'
        output_filename = f'/Users/saanvinair/Desktop/TEDyaml/{i}.txt'

        # Both inputs are required; without either one there is nothing to align.
        if not os.path.exists(csv_filename) or not os.path.exists(json_filename):
            print(f"CSV or JSON file not found for file number {i}. Skipping.")
            continue

        calculated_data = calculate_duration_offset(
            read_csv(csv_filename), read_json(json_filename), i)

        # One "- {...}" entry per aligned sentence.
        with open(output_filename, 'w') as output_file:
            output_file.writelines(
                f"- {{duration: {entry['duration']}, offset: {entry['offset']}, speaker_id: spk.{i}, wav: hi{i}.wav}}\n"
                for entry in calculated_data
            )

# Run the batch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
