## Notebook to create summaries using sumy

In [None]:
import numpy as np
import pickle
import pandas as pd

In [None]:
# Load the punctuation-corrected transcripts that will be summarized below.
with open("data/punctuation_correction/corrected_transcripts_all.pkl", "rb") as transcripts_file:
    co_transcripts = pickle.load(transcripts_file)


In [None]:
import multiprocessing
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

def summarize_text(index, text, language="de", sentence_count=10):
    """Summarize a single text with sumy's LexRank summarizer.

    Args:
        index: Identifier of the text. It is returned unchanged so that a
            result can be matched to its input when tasks run asynchronously.
        text: The plain text to summarize.
        language: Language name/code handed to both the sumy tokenizer and
            the stop-word lookup. Defaults to ``"de"``.
        sentence_count: Number of sentences requested from the summarizer.
            Defaults to ``10``.

    Returns:
        A tuple ``(index, summary_text)`` where ``summary_text`` is the
        selected sentences joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = get_stop_words(language)

    summary = summarizer(parser.document, sentence_count)
    summary_text = " ".join(str(sentence) for sentence in summary)

    return index, summary_text
    
def summarize_texts_with_multiprocessing(texts):
    """Summarize every text in ``texts`` in parallel using a process pool.

    One ``summarize_text`` task is submitted per text. After every 200th
    finished task, the summaries completed so far are pickled to
    ``data/punctuation_correction/summaries/summaries_tmp.pkl`` as a
    checkpoint (sorted by index).

    Args:
        texts: Iterable of the texts to summarize; each text's position in
            the iteration is used as its index.

    Returns:
        A list of ``{"index": ..., "summary": ...}`` dicts, in input order.
    """
    checkpoint_every = 200
    checkpoint_path = "data/punctuation_correction/summaries/summaries_tmp.pkl"
    # Filled by the callback in completion order, so a checkpoint always
    # contains exactly the work that has finished so far.
    completed = []

    def update_progress(result):
        # apply_async hands the callback the task's return value, i.e. the
        # (index, summary_text) tuple returned by summarize_text.
        index, summary_text = result
        completed.append({"index": index, "summary": summary_text})
        if len(completed) % checkpoint_every != 0:
            return
        print("Processed index:", index)
        try:
            with open(checkpoint_path, "wb+") as file:
                pickle.dump(sorted(completed, key=lambda item: item["index"]), file)
        except OSError as error:
            # A failed checkpoint must not escape the callback: the error
            # would not reach the caller and could stall result handling.
            print("Could not write checkpoint:", error)

    with multiprocessing.Pool() as pool:
        results = [
            pool.apply_async(summarize_text, args=(index, text), callback=update_progress)
            for index, text in enumerate(texts)
        ]

        # Collect in submission order so the returned list follows the input
        # order; .get() also re-raises any exception raised in a worker.
        summaries = []
        for result in results:
            index, summary_text = result.get()
            summaries.append({"index": index, "summary": summary_text})

    return summaries

In [None]:
# Build one summary per corrected transcript (runs in a process pool).
summaries = summarize_texts_with_multiprocessing(texts=co_transcripts)

In [None]:
# Persist the final list of summaries.
with open("data/punctuation_correction/summaries/summaries.pkl", "wb+") as summaries_file:
    pickle.dump(summaries, summaries_file)