# In [None]:
import json
import os


def chunk_transcript_with_timing(transcript_data, chunk_duration=300):
    """Group the words of a transcript into consecutive timed chunks.

    The words are read from
    ``transcript_data['results']['channels'][0]['alternatives'][0]['words']``;
    only the first channel and the first alternative are used.

    A chunk is closed once the span between its first word's start and its
    last word's end has reached ``chunk_duration``. The check happens before
    the next word is added, so a chunk may run past ``chunk_duration`` by up
    to one word.

    Args:
        transcript_data: Parsed transcript dict. Each word entry must provide
            ``'start'`` and ``'end'`` timings and either ``'punctuated_word'``
            or, as a fallback, ``'word'``.
        chunk_duration: Target chunk length, in the same unit as the word
            timings (presumably seconds -- confirm against the transcript
            producer).

    Returns:
        A list of dicts with the keys ``'chunk'`` (the words joined by single
        spaces), ``'start_time'`` (start of the chunk's first word) and
        ``'end_time'`` (end of the chunk's last word). The list is empty when
        the transcript contains no words.

    Raises:
        KeyError: If the transcript structure or a required word field is
            missing.
    """
    words = transcript_data['results']['channels'][0]['alternatives'][0]['words']

    chunks = []
    current_words = []
    chunk_start = 0
    chunk_end = 0

    for word_info in words:
        # Prefer the punctuated form; fall back to the raw word for
        # transcripts whose entries carry no 'punctuated_word' field.
        if 'punctuated_word' in word_info:
            word = word_info['punctuated_word']
        else:
            word = word_info['word']
        start_time = word_info['start']
        end_time = word_info['end']

        # Close the running chunk once it has reached the target duration.
        # Requiring at least one word keeps an empty chunk from being emitted
        # when chunk_duration is zero or negative.
        if current_words and chunk_end - chunk_start >= chunk_duration:
            chunks.append({
                "chunk": " ".join(current_words),
                "start_time": chunk_start,
                "end_time": chunk_end,
            })
            current_words = []

        # Every chunk, including the very first one, starts at the start time
        # of its first word, so leading silence is not counted against it.
        if not current_words:
            chunk_start = start_time

        current_words.append(word)
        chunk_end = end_time

    # Flush whatever is left after the last word.
    if current_words:
        chunks.append({
            "chunk": " ".join(current_words),
            "start_time": chunk_start,
            "end_time": chunk_end,
        })

    return chunks



def access_file(folder, chunk_folder):
    """Chunk every transcript found under ``folder`` and save the results.

    Each immediate subdirectory of ``folder`` is expected to contain a
    ``transcript.json`` and a ``metadata.json`` (with a ``'title'`` key).
    The transcript is split with ``chunk_transcript_with_timing`` using its
    default chunk duration, and the chunks are written to
    ``<chunk_folder>/<title>.json``.

    Args:
        folder: Directory whose subdirectories hold the transcripts.
        chunk_folder: Directory that receives one JSON file per transcript;
            it is created when it does not exist yet.

    Returns:
        None. The results are the files written to ``chunk_folder``.

    Notes:
        Subdirectories missing either input file are reported and skipped so
        that one incomplete entry does not abort the whole run. Two
        transcripts sharing the same title write to the same output file;
        the one processed later wins.
    """
    # Create the destination up front so the first write cannot fail on a
    # missing directory.
    os.makedirs(chunk_folder, exist_ok=True)

    # Sorted so the processing order is deterministic across runs.
    for subfolder in sorted(os.listdir(folder)):
        subfolder_path = os.path.join(folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        transcript_file = os.path.join(subfolder_path, 'transcript.json')
        metadata_file = os.path.join(subfolder_path, 'metadata.json')
        if not (os.path.isfile(transcript_file) and os.path.isfile(metadata_file)):
            print(f"Skipping {subfolder_path}: transcript.json or metadata.json is missing")
            continue

        with open(transcript_file, 'r', encoding='utf-8') as transcript_handle:
            transcript_data = json.load(transcript_handle)

        with open(metadata_file, 'r', encoding='utf-8') as metadata_handle:
            title = json.load(metadata_handle)['title']

        # Chunk the transcript
        chunks = chunk_transcript_with_timing(transcript_data)

        # A path separator inside the title would point the output at a
        # different (possibly non-existent) directory, so neutralise it.
        safe_title = f"{title}"
        for separator in filter(None, (os.sep, os.altsep)):
            safe_title = safe_title.replace(separator, '_')

        # Save the chunks as a JSON file named after the title.
        output_file_path = os.path.join(chunk_folder, f"{safe_title}.json")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(chunks, output_file, indent=4)



# Path to the folder whose subfolders hold the video transcripts.
folder = '/content/drive/MyDrive/test'
# Path to the folder where the chunk files are saved.
chunk_folder = '/content/drive/MyDrive/chunks'
# Process every transcript under `folder`; this runs as soon as the cell/module executes.
access_file(folder, chunk_folder)

