<a href="https://colab.research.google.com/github/SiWorgan/dl_sandbox/blob/master/Copy_of_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json, datetime
import pandas as pd

from transformers import T5Tokenizer
from transformers import BartTokenizer
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from pynvml import *

def print_gpu_utilization():
  """Print the memory NVML reports as used on GPU index 0, in whole MB."""
  nvmlInit()
  gpu0 = nvmlDeviceGetHandleByIndex(0)
  # `used` is floor-divided by 1024**2 before printing.
  used_mb = nvmlDeviceGetMemoryInfo(gpu0).used // 1024**2
  print(f"GPU memory occupied: {used_mb} MB.")

# Hub checkpoint used for summarising the full transcript in a single pass.
hf_name = 'pszemraj/led-large-book-summary'

# Tokenizer and model are loaded from the same checkpoint; the two loads are
# independent of each other.
_tokenizer = AutoTokenizer.from_pretrained(hf_name)
_model = AutoModelForSeq2SeqLM.from_pretrained(hf_name, low_cpu_mem_usage=True)

# Summarisation pipeline built from the objects above, placed on device 0.
summarizer = pipeline("summarization", model=_model, tokenizer=_tokenizer, device=0)

# Placeholder for the complete transcript text that is summarised in one pass.
full_transcript = "TRANSCRIPT!"

# Summarise the whole transcript with 4-beam search. The input must be
# `full_transcript`; no variable named `transcript` exists in this script.
summary = summarizer(full_transcript,
                     min_length=16,
                     max_length=256,
                     no_repeat_ngram_size=3,
                     encoder_no_repeat_ngram_size=3,
                     clean_up_tokenization_spaces=True,
                     repetition_penalty=3.7,
                     num_beams=4,
                     early_stopping=True)

# Placeholder for the timestamped transcript passed to nest_sentences, which
# indexes it by "time" and "content" — replace with real data before running.
csv_transcript = "CSV_TRANSCRIPT!"

def nest_sentences(document, tokenizer=None, max_tokens=1024):
  """Group consecutive transcript turns into chunks below a token budget.

  Args:
    document: Mapping-like object whose "time" and "content" entries are
      parallel iterables (one timestamp and one text per turn).
    tokenizer: Optional callable used to count tokens. It is called as
      ``tokenizer(turn, max_length=None, return_tensors='pt',
      truncation=False)`` and the length of ``['input_ids'][0]`` is used.
      Defaults to the 'facebook/bart-large-cnn' BartTokenizer.
    max_tokens: A chunk is closed as soon as adding the next turn would
      bring its token count to this value or above.

  Returns:
    A dict ``{"time": [...], "chunk": [...]}`` with one entry per chunk.
    Turns inside a chunk are joined with single spaces. The time stored for
    a chunk is the timestamp of the turn being processed when the chunk was
    closed: the turn that overflowed it, or the last turn for the final
    chunk. An empty document yields two empty lists.
  """
  if tokenizer is None:
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

  chunks = {"time": [], "chunk": []}
  pending = []  # turns collected for the chunk currently being built
  length = 0    # token count of `pending`
  time = None
  for time, turn in zip(document["time"], document["content"]):
    # Tokenise each turn exactly once and reuse the count in both branches.
    turn_length = len(tokenizer(turn, max_length=None, return_tensors='pt', truncation=False)['input_ids'][0])
    if length + turn_length < max_tokens:
      pending.append(turn)
      length += turn_length
      continue

    # The turn does not fit: close the current chunk (if it has any content)
    # and start a new one with this turn, even if the turn alone is over
    # the budget.
    if pending:
      chunks["chunk"].append(" ".join(pending))
      chunks["time"].append(time)
    pending = [turn]
    length = turn_length

  # Flush the trailing chunk once, after every turn has been seen.
  if pending:
    chunks["chunk"].append(" ".join(pending))
    chunks["time"].append(time)

  return chunks

def generate_summary(nested_sentences):
  """Summarise every chunk and pair each summary with its chunk's time.

  Args:
    nested_sentences: Dict with parallel "time" and "chunk" lists.

  Returns:
    A dict ``{"time": [...], "summary": [...]}`` holding, per chunk, its
    time and the 'summary_text' of the first pipeline result.
  """
  # A fresh "knkarthick/MEETING_SUMMARY" pipeline is created on every call.
  section_summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY", device=0)
  # zip stops at the shorter of the two lists.
  sections = list(zip(nested_sentences["time"], nested_sentences["chunk"]))
  return {
      "time": [stamp for stamp, _ in sections],
      "summary": [section_summarizer(text)[0]['summary_text'] for _, text in sections],
  }

# Split the timestamped transcript into chunks and summarise each one.
nested_sentences = nest_sentences(csv_transcript)
summaries = generate_summary(nested_sentences)

# Write the summaries (columns "time" and "summary") to disk without the
# DataFrame index. The filename has no placeholders, so a plain string
# literal is used rather than an f-string.
df = pd.DataFrame(summaries)
df.to_csv('summary_output.csv', index=False)

  