In [1]:
import os
import shutil
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
import yaml
from bert_score import score as bert_score
from box import Box
from datasets import load_dataset
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from utils.eval_similarity import *
from utils.utils import *
from utils.segment_embedding import *
from utils.concat_functions import *
from utils.summarizer import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load config ----------------------
# Parse config.yaml and wrap the result in a Box so that settings can be read
# with attribute access (config.section.key) in the cells below.
# NOTE(review): yaml.load with FullLoader is kept as-is; yaml.safe_load is the
# stricter choice if this file could ever come from an untrusted source.
with open("config.yaml", "r") as config_file:
    config = Box(yaml.load(config_file, Loader=yaml.FullLoader))

print(config.experiment_name)

segment-length-100


In [3]:
# load data ----------------------
# Both dataset identifiers are read from the `data` section of the config.
data_cfg = config.data
open_source_datasets = load_dataset(data_cfg.open_source)
youtube_datasets = load_dataset(data_cfg.youtube)

# Printing the containers shows only splits, features and row counts.
print(open_source_datasets, youtube_datasets)

DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
}) DatasetDict({
    train: Dataset({
        features: ['playlist', 'file_name', 'content'],
        num_rows: 5121
    })
})


In [4]:
# Reduce each loaded dataset to the text column used by the experiment.
# Change the split / column keys below to match the dataset being loaded.
#
# Both names are rebound from a DatasetDict (a dict subclass) to a text column.
# Without the isinstance guard, running this cell a second time would index the
# already-extracted column with "train" and raise; with it the cell can be
# re-run safely.
if isinstance(open_source_datasets, dict):
    open_source_datasets = open_source_datasets["train"]["report"]
if isinstance(youtube_datasets, dict):
    youtube_datasets = youtube_datasets["train"]["content"]

# *** DO NOT PRINT THE WHOLE DATASET: the output is large enough to kill the notebook ***
print(len(open_source_datasets), len(youtube_datasets))

17517 5121


### IMPORTANT: SELECT DATA FOR EXPERIMENT 

In [5]:
# select data for experiment ----------------------
# Exactly one of the two assignments below should be active. The slice bound
# sets how many documents the experiment loop will process.
experiment_texts = open_source_datasets[:1]
# experiment_texts = youtube_datasets[:100]

# Sanity check: number of documents selected.
print(len(experiment_texts))

1


In [None]:
# run experiment ----------------------
# For every selected document: split it into segments, embed the segments,
# group segment indexes into themes, then summarize the joined theme texts.
summaries = []
results = []  # filled once the evaluation step below is implemented

for i, texts in enumerate(experiment_texts):
    print(f"==== Processing {i+1}/{len(experiment_texts)} ====")

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # texts to segments
    # NOTE(review): the meaning of the trailing positional True is defined by
    # segmentate_sentence, which is not visible in this notebook.
    print("Segmentating...")
    segments = segmentate_sentence(texts, config.segment.n_word, config.segment.n_overlap, True)
    embeddings = encode_segments(segments)

    # concatenate segments
    # NOTE(review): threshold=0.6 is hard-coded here while the other settings
    # come from config.yaml; consider moving it there.
    print("Concatenating...")
    concatenated_indexes = concate_time_based(embeddings, threshold=0.6)
    # default=0 keeps this diagnostic from raising when no group was produced.
    max_length = max((len(group) for group in concatenated_indexes), default=0)
    print(f"theme_num: {len(concatenated_indexes)}, max_size: {max_length}")

    # based on the concatenated indexes, build one text per theme
    theme_segments = [
        " ".join(segments[j] for j in group) for group in concatenated_indexes
    ]

    # make summary
    print("Summarizing...")
    summary = summarizer(
        theme_segments,
        model=config.summary.model,
        max_length=config.summary.max_length,
        min_length=config.summary.min_length,
    )
    summaries.append(summary)

    # evaluation ----------------------
    # TODO: evaluation code here
    # results.append(evaluation(summary, texts))

    end_time = time.perf_counter()
    print(f"Time taken: {end_time - start_time}")

==== Processing 1/1 ====
Segmentating...
Concatenating...
theme_num: 14, max_size: 31
Summarizing...
The structure of the armed forces is based on the Total Force concept. In recent years, federal civilian personnel have deployed along with military personnel to participate in Operations Joint Endeavor, in Bosnia-Herzegovina, Croatia, and Hungary. Further, since the beginning of the Global War on Terrorism, the role of DOD’sFederal civilian personnel has expanded to include participation in the wars in Iraq and Afghanistan. The U.S. military and civilian personnel, and DOD federal civilian employees, contribute to national defense. DOD relies on federal civilian personnel it deploys to support a range of essential missions. These missions include intelligence collection, criminal investigations, and weapon systems acquisition and maintenance. To ensure that its federal civilian employees will deploy to combat zones and perform critical combat support functions in theater, DOD establish

In [7]:
# Show the first source document followed by the summary generated for it.
for shown in (experiment_texts, summaries):
    print(shown[0])

The structure of the armed forces is based on the Total Force concept, which recognizes that all elements of the structure—active duty military personnel, reservists, defense contractors, host nation military and civilian personnel, and DOD federal civilian employees—contribute to national defense. In recent years, federal civilian personnel have deployed along with military personnel to participate in Operations Joint Endeavor, conducted in the countries of Bosnia-Herzegovina, Croatia, and Hungary; Joint Guardian, in Kosovo; and Desert Storm, in Southwest Asia. Further, since the beginning of the Global War on Terrorism, the role of DOD’s federal civilian personnel has expanded to include participation in combat support functions in Operations Enduring Freedom and Iraqi Freedom. DOD relies on the federal civilian personnel it deploys to support a range of essential missions, including intelligence collection, criminal investigations, and weapon systems acquisition and maintenance. To 

In [None]:
# save config & results ----------------------
# NOTE(review): the output directory is built from two config keys,
# experiment_name and exp_name. Only experiment_name is read elsewhere in this
# notebook, so confirm that exp_name exists in config.yaml.
exp_dir = os.path.join(config.experiment_name, config.exp_name)
os.makedirs(exp_dir, exist_ok=True)

# Copy the config next to the results with shutil instead of shelling out to
# `cp`: it needs no POSIX shell, is not broken by spaces or shell
# metacharacters in exp_dir, and raises on failure instead of returning an
# exit status that was being ignored.
shutil.copy("config.yaml", exp_dir)

# TODO: save experiment results
# *** important: please save the number of data you used ***
# num_data = len(experiment_texts)