In [None]:
# imports
import json
import os
import pandas as pd
from tqdm import tqdm
import sys
import numpy as np
from transformers import pipeline
import datetime
import random
import math

In [None]:
# Build the summarization pipeline once for the whole notebook. The same
# checkpoint name is used for both the model and its tokenizer, and the
# pipeline is placed on the first CUDA device.
model_name = "facebook/bart-large-cnn"
pipeline_options = {
    "model": model_name,
    "tokenizer": model_name,
    "device": "cuda:0",
}
summarizer = pipeline("summarization", **pipeline_options)

In [None]:
# Folder that is scanned for transcript files, and folder that receives the
# generated output files.
DIR_FOLDER = "raw_dataset_v3"
OUTPUT_FOLDER = "dataset_v3"

# Names of every entry in the input folder, in the order the OS reports them.
transcripts = os.listdir(DIR_FOLDER)

# Create the output folder when it is missing. `exist_ok=True` replaces the
# separate existence check, so two runs starting at the same moment cannot
# fail on the check-then-create race.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
def _append_piece(text, piece):
    """Return `text` extended by `piece`, space-separated unless `text` is empty."""
    return f"{text} {piece}" if text else piece


def _overlap_text(segments, last_idx, overlap_segments, max_chars):
    """Join up to `overlap_segments` segment texts ending at `last_idx`.

    Leading segments are dropped until the joined text fits in `max_chars`, so
    the text carried into the next chunk can never exceed the limit by itself.
    Returns "" when even the last segment alone is too long.
    """
    first = max(last_idx + 1 - max(overlap_segments, 0), 0)
    for begin in range(first, last_idx + 1):
        text = ""
        for segment in segments[begin:last_idx + 1]:
            text = _append_piece(text, segment["text"].strip())
        if len(text) <= max_chars:
            return text
    return ""


def summarize_transcript(data, max_chars=1500, overlap_segments=5):
    """Split a transcript into overlapping text chunks and summarize each one.

    Segment texts are stripped and joined with single spaces until the running
    text is longer than `max_chars`; that text is then closed as one chunk.
    The next chunk begins with the text of the last `overlap_segments`
    segments of the chunk just closed, so neighbouring chunks share context.

    Args:
        data: Mapping with a "segments" list; each segment is a mapping with
            "start", "end" and "text" entries.
        max_chars: Character count above which the accumulated text is closed
            as a chunk.
        overlap_segments: How many trailing segments of a closed chunk are
            repeated at the beginning of the following chunk.

    Returns:
        A `(summary_dict, subset_texts)` tuple. `subset_texts` is a list of
        `[start_time, end_time, text]` entries; `summary_dict` is the value
        returned by the module-level `summarizer` for the list of chunk texts.
        Both are empty lists when there is nothing to summarize.
    """
    segments = data["segments"]
    if not segments:
        return [], []

    subset_texts = []
    text = ""
    start_time = segments[0]["start"]

    for idx, segment in enumerate(segments):
        text = _append_piece(text, segment["text"].strip())
        if len(text) <= max_chars:
            continue

        # Close the chunk; in time, the next one starts where this one ends.
        subset_texts.append([start_time, segment["end"], text])
        start_time = segment["end"]

        # Seed the next chunk with the overlap instead of rewinding the index.
        # Rewinding by a fixed amount could step before the start of the list
        # (negative indices silently read from its end) or re-trigger the same
        # chunk forever when the overlapped segments alone exceed the limit.
        text = _overlap_text(segments, idx, overlap_segments, max_chars)

    # Whatever is left after the last closed chunk becomes the final chunk.
    if text:
        subset_texts.append([start_time, segments[-1]["end"], text])
    if not subset_texts:
        return [], []

    # NOTE: do_sample=True means the generated summaries are not deterministic.
    summary_dict = summarizer(
        [chunk_text for _, _, chunk_text in subset_texts],
        max_length=100,
        min_length=0,
        do_sample=True,
    )

    return summary_dict, subset_texts

In [None]:
def convert_timestamps_to_seconds(timestamps):
    """Convert colon-separated timestamps to whole seconds, in place.

    Each entry is a string of integer fields separated by ":" (for example
    "H:M:S", "M:S" or just "S"); the rightmost field has weight 1 and every
    field to its left weighs 60 times more. The list passed in is overwritten
    entry by entry and the same list object is returned.
    """
    for position in range(len(timestamps)):
        total = 0
        # Horner-style accumulation from the most significant field down.
        for field in timestamps[position].split(":"):
            total = total * 60 + int(field)
        timestamps[position] = total

    return timestamps

In [None]:
# Text that opens every prompt and the marker that closes it.
PROMPT_HEADER = "Given summarizations we want to know the keyword for each summary in the list.\n\n"
PROMPT_FOOTER = "\n\n###\n\nKEYWORDS:\n"
# Number of summaries packed into one prompt/completion file.
SUMMARIES_PER_PROMPT = 10

# Process the transcripts in random order.
# NOTE(review): presumably so that parallel runs rarely pick the same file - confirm.
random.shuffle(transcripts)

for transcript in tqdm(transcripts):
    # Output files are named "<batch index>_<transcript file name>" and batch 0
    # is always the first one written, so its presence marks the transcript as
    # already processed. Checking the exact name (rather than searching for the
    # base name as a substring of every output file) keeps e.g. "ep1.json" from
    # being skipped just because "0_ep10.json" exists, and avoids listing the
    # whole output folder on every iteration.
    if os.path.exists(os.path.join(OUTPUT_FOLDER, f"0_{transcript}")):
        continue

    # Load the transcript; entries that cannot be read or parsed are skipped.
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses. Anything else (including KeyboardInterrupt) propagates.
    try:
        with open(os.path.join(DIR_FOLDER, transcript), "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        continue

    # Only transcripts that carry timestamps can be labelled.
    if not isinstance(data, dict) or not data.get("timestamps"):
        continue

    # Timestamps and keywords are parallel lists. If the first timestamp is
    # not 0, prepend an "Introduction" entry covering the beginning.
    time_stamps = convert_timestamps_to_seconds(data["timestamps"])
    keywords = data["keywords"]
    if time_stamps[0] != 0:
        time_stamps.insert(0, 0)
        keywords.insert(0, "Introduction")

    summary_dict, subset_texts = summarize_transcript(data)

    dataset = []
    json_idx = 0
    summarized_text_string = PROMPT_HEADER
    label_string = ""
    last_idx = len(summary_dict) - 1

    for idx, ((start, _end, _text), summary_d) in enumerate(zip(subset_texts, summary_dict)):
        # Label the chunk with the keyword of the last timestamp that is not
        # later than the chunk's start time.
        section = max(j for j, stamp in enumerate(time_stamps) if start >= stamp)
        keyword = keywords[section].strip().strip("|").strip()
        summary = summary_d["summary_text"]

        # Numbering restarts at 1 inside every prompt.
        writing_idx = idx % SUMMARIES_PER_PROMPT
        summarized_text_string += f"{writing_idx + 1}. {summary}\n"
        label_string += f"{writing_idx + 1}. {keyword}\n"

        # Flush once the prompt is full or the transcript is exhausted.
        if writing_idx == SUMMARIES_PER_PROMPT - 1 or idx == last_idx:
            summarized_text_string += PROMPT_FOOTER
            example = {"prompt": summarized_text_string, "completion": label_string}
            dataset.append(example)

            with open(os.path.join(OUTPUT_FOLDER, f"{json_idx}_{transcript}"), "w") as f:
                json.dump(example, f)

            json_idx += 1
            summarized_text_string = PROMPT_HEADER
            label_string = ""