### This is a helper notebook to merge partial results into overall results
The partial results will not be uploaded. This notebook is kept only for transparency and in case similar mistakes happen in the future.

In [None]:
import numpy as np
import pickle
import pandas as pd

In [None]:
import textwrap

def pprint(long_string, width = 150):
    """Print ``long_string`` wrapped so that no line exceeds ``width`` characters."""
    print(textwrap.fill(long_string, width=width))


In [None]:
def get_data(path):
    """Read the merged CSV at ``path`` and return the pieces used downstream.

    Returns a 5-tuple of:
      * the full DataFrame,
      * the values of its 'String' column,
      * the values of its 'Title' column,
      * a dict mapping each 'Title' value to its 'identifier' value,
      * the 'identifier' column itself.
    """
    frame = pd.read_csv(path)
    titles = frame['Title'].values
    ids = frame['identifier']

    # Build the title -> identifier lookup through an intermediate frame.
    lookup_frame = pd.DataFrame({'ID': ids, 'Vocab': titles})
    title_to_id = lookup_frame.set_index('Vocab')['ID'].to_dict()

    return frame, frame['String'].values, titles, title_to_id, ids


# Load the merged dataset once; the unpacked names become notebook-level globals.
(
    merged_df,
    str_lst,
    vocab,
    identifier_vocab,
    identifier,
) = get_data('data/merged_data_for_AI.csv')

### Merge corrected transcripts

In [None]:
# Load every partial result file of the punctuation-correction run.


def _load_transcript_part(path):
    """Unpickle one partial result file and return its content."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


co_transcripts_0_to_3000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_0_to_3000.pkl"
)
co_transcripts_3000_to_5000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_3000_to_5000.pkl"
)
co_transcripts_5000_to_14000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_5000_to_14000.pkl"
)
co_transcripts_14000_to_21000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_14000_to_21000.pkl"
)
co_transcripts_21000_to_34000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_21000_to_34000.pkl"
)
co_transcripts_34000_to_36000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_34000_to_36000.pkl"
)
co_transcripts_36000_to_40000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_36000_to_40000.pkl"
)
# NOTE: going by the file names, this part overlaps the next one
# (55000..57000); the merge step trims it.
co_transcripts_40000_to_57000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_40000_to_57000.pkl"
)
co_transcripts_55000_to_71000 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_55000_to_71000.pkl"
)
co_transcripts_71000_to_71980 = _load_transcript_part(
    "data/punctuation_correction/corrected_transcripts_71000_to_71980.pkl"
)


In [None]:
# Concatenate the parts in index order. Going by the file names, the
# 40000_to_57000 part overlaps the following 55000_to_71000 part, so only its
# first 15000 entries (presumably indices 40000..54999) are kept.
co_transcripts = (
    co_transcripts_0_to_3000
    + co_transcripts_3000_to_5000
    + co_transcripts_5000_to_14000
    + co_transcripts_14000_to_21000
    + co_transcripts_21000_to_34000
    + co_transcripts_34000_to_36000
    + co_transcripts_36000_to_40000
    + co_transcripts_40000_to_57000[:15000]
    + co_transcripts_55000_to_71000
    + co_transcripts_71000_to_71980
)

In [None]:
# Persist the merged list of corrected transcripts.
with open("data/punctuation_correction/corrected_transcripts_all.pkl", "wb+") as out_file:
    pickle.dump(co_transcripts, out_file)

### Merge summaries 

In [None]:
# Load all partial summary files.


def _load_summary_part(path):
    """Unpickle one partial summary file and return its content."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


sumy_summaries_0_to_2007 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_0_to_2007.pkl"
)
sumy_summaries_2007_to_14000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_2007_to_14000.pkl"
)
sumy_summaries_12000_to_21000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_12000_to_21000.pkl"
)
sumy_summaries_21000_to_34000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_21000_to_34000.pkl"
)
sumy_summaries_34000_to_43000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_34000_to_43000.pkl"
)
sumy_summaries_40000_to_55000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_40000_to_55000.pkl"
)
sumy_summaries_55000_to_71000 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_55000_to_71000.pkl"
)
sumy_summaries_71000_to_71980 = _load_summary_part(
    "data/punctuation_correction/summaries/sumy_summaries_71000_to_71980.pkl"
)


In [None]:
# Concatenate the summary parts in index order. Going by the file names, two
# parts overlap their successor and are trimmed:
#   * 2007_to_14000 loses its last 2000 entries (the next part starts at 12000),
#   * 34000_to_43000 keeps only its first 6000 entries (the next part starts at 40000).
sumy_summaries_all = (
    sumy_summaries_0_to_2007
    + sumy_summaries_2007_to_14000[:-2000]
    + sumy_summaries_12000_to_21000
    + sumy_summaries_21000_to_34000
    + sumy_summaries_34000_to_43000[:6000]
    + sumy_summaries_40000_to_55000
    + sumy_summaries_55000_to_71000
    + sumy_summaries_71000_to_71980
)

In [None]:
# Sanity check: number of entries in the 5000_to_14000 transcript part.
len(co_transcripts_5000_to_14000)

In [None]:
# Print the merged summary at position 5500 for a manual spot check.
pprint(sumy_summaries_all[5500])

In [None]:
# Print the merged summary at position 54500 for a manual spot check.
pprint(sumy_summaries_all[54500])

In [None]:
# Persist the merged summaries.
# NOTE(review): the target name is spelled "summy_..." and has no ".pkl"
# suffix, unlike the other pickle files written here -- confirm who reads this
# file before renaming it.
with open("data/punctuation_correction/summy_summaries_all", "wb+") as out_file:
    pickle.dump(sumy_summaries_all, out_file)

#### check order of summaries

In [None]:
# Print the summary at position 5000 to compare it with the transcript at the same position.
pprint(sumy_summaries_all[5000])

In [None]:
# Print the transcript at position 5000 to compare it with the summary at the same position.
pprint(co_transcripts[5000])

#### Unfortunately, the summaries are in the wrong order because the multiprocessing went wrong. (In the new function for creating summaries we fixed the error.) The following cells fix the old results so that we do not have to compute everything again.

In [None]:
import hashlib

def hash_sentence(sentence):
    """Return the hexadecimal SHA-256 digest of ``sentence``.

    The string is turned into bytes with ``str.encode()`` defaults before hashing.
    """
    encoded = sentence.encode()
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()


In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

import nltk
# Download NLTK's "punkt" resource; presumably needed by the sumy Tokenizer
# used for sentence splitting -- TODO confirm.
nltk.download("punkt")

def sentence_separator(text, tokenizer):
    """Split ``text`` into sentences by delegating to ``tokenizer.to_sentences``."""
    return tokenizer.to_sentences(text)

In [None]:
tokenizer = Tokenizer("de")

# For each transcript, store the list of SHA-256 hashes of its sentences.
co_transcripts_hashes = []
for position, transcript in enumerate(co_transcripts):
    if position % 10000 == 0:
        print(position)  # coarse progress indicator
    co_transcripts_hashes.append(
        [hash_sentence(part) for part in sentence_separator(transcript, tokenizer)]
    )


In [None]:
# For each summary, store the list of SHA-256 hashes of its sentences.
summaries_hashes = [
    [hash_sentence(part) for part in sentence_separator(summary_text, tokenizer)]
    for summary_text in sumy_summaries_all
]


In [None]:
def count_matches(sum_list, transcript_list):
    """Count the entries of ``sum_list`` that also occur in ``transcript_list``.

    Duplicate entries in ``sum_list`` are counted every time they occur.
    """
    return sum(1 for sum_hash in sum_list if sum_hash in transcript_list)
    


In [None]:
def find_match(hashed_summary, co_transcripts_hashes, sindex, max_diff = 50):
    """Find the transcript whose sentence hashes contain every hash of a summary.

    The search starts at ``sindex`` and then widens symmetrically, checking
    ``sindex - d`` before ``sindex + d`` for ``d = 1 .. max_diff - 1``.
    Positions outside ``co_transcripts_hashes`` are skipped, so ``sindex`` may
    lie beyond the last transcript (e.g. when there are more summaries than
    transcripts).

    Args:
        hashed_summary: Sentence hashes of one summary.
        co_transcripts_hashes: One collection of sentence hashes per transcript.
        sindex: Position at which the search starts.
        max_diff: Exclusive upper bound on the offset from ``sindex``.

    Returns:
        The index of the first matching transcript, or ``np.nan`` if no
        transcript inside the window matches. An empty ``hashed_summary``
        matches the first in-range candidate.
    """
    num_transcripts = len(co_transcripts_hashes)

    def is_full_match(candidate_index):
        # Positions outside the transcript list can never match.
        if not 0 <= candidate_index < num_transcripts:
            return False
        candidate = co_transcripts_hashes[candidate_index]
        # all() stops at the first missing sentence instead of counting every hit.
        return all(sentence_hash in candidate for sentence_hash in hashed_summary)

    # Most summaries are expected at (or close to) their own position.
    if is_full_match(sindex):
        return sindex

    # Offset 0 is already covered above, so start widening at 1.
    for index_diff in range(1, max_diff):
        for candidate_index in (sindex - index_diff, sindex + index_diff):
            if is_full_match(candidate_index):
                return candidate_index
    return np.nan


In [None]:
import numpy as np

# For every summary, look for the transcript it belongs to, starting the
# search at the summary's own position.
matching_indices = {}
for summary_position, hashed_summary in enumerate(summaries_hashes):
    if summary_position % 10000 == 0:
        print(summary_position)  # coarse progress indicator
    matching_indices[summary_position] = find_match(
        hashed_summary, co_transcripts_hashes, summary_position
    )


### Check for transcripts that either have no matching summary or are matched by more than one summary

In [None]:
# Invert matching_indices: matched transcript index -> list of summary
# positions that were matched to it.
value_to_key = {}
for summary_position, transcript_index in matching_indices.items():
    value_to_key.setdefault(transcript_index, []).append(summary_position)


In [None]:
# Sort every transcript index into one of three groups, depending on how many
# summaries were matched to it: none, more than one, or exactly one.
not_matched = []
matched_more_than_once = []
already_matched_result_dict = {}
for transcript_index in range(len(co_transcripts)):
    summary_positions = value_to_key.get(transcript_index)
    if summary_positions is None:
        not_matched.append(transcript_index)
        continue

    hits = len(summary_positions)
    if hits > 1:
        matched_more_than_once.append(transcript_index)
    elif hits == 1:
        if transcript_index in already_matched_result_dict:
            print("warning! ", transcript_index)
        # transcript index -> position of its single matching summary
        already_matched_result_dict[transcript_index] = summary_positions[0]
    else:
        print(transcript_index)


In [None]:
# Report how many transcripts fell into each of the three groups.
matched_once_count = len(already_matched_result_dict)
transcript_count = len(co_transcripts)
print("correctly matched: ", matched_once_count, "of ", transcript_count)
print("matched more than once: ", len(matched_more_than_once))
print("not matched at all: ", len(not_matched))


### For all transcripts without a unique matching summary (none, or more than one), we summarize them again

In [None]:
import multiprocessing
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

def summarize_text(index, text):
    """Summarize ``text`` with LexRank and return ``(index, summary_text)``.

    ``index`` is passed through unchanged so the caller can tell which text a
    result belongs to. The summary is built from up to 10 sentences joined by
    single spaces.
    """
    lexrank = LexRankSummarizer()
    lexrank.stop_words = get_stop_words("de")

    document = PlaintextParser.from_string(text, Tokenizer("de")).document
    chosen_sentences = lexrank(document, 10)  # Summarize into 10 sentences

    return index, " ".join(str(sentence) for sentence in chosen_sentences)
    
def summarize_texts_with_multiprocessing(texts, indices):
    """Summarize ``texts[index]`` for every index in ``indices`` in a process pool.

    Args:
        texts: Indexable collection of texts.
        indices: Iterable of positions in ``texts`` to summarize.

    Returns:
        A list of ``{"index": ..., "summary": ...}`` dicts, in the same order
        as ``indices`` (results are collected in submission order, not in
        completion order).
    """
    processed_count = 0

    def update_progress(result):
        # apply_async passes the worker's return value to the callback, i.e.
        # the (index, summary_text) tuple -- only the index is worth printing.
        nonlocal processed_count
        processed_count += 1
        if processed_count % 200 == 0:
            print("Processed index:", result[0])

    with multiprocessing.Pool() as pool:
        pending = [
            pool.apply_async(summarize_text, args=(index, texts[index]), callback=update_progress)
            for index in indices
        ]

        # Results must be fetched while the pool is still alive; leaving the
        # ``with`` block terminates the workers.
        summaries = []
        for job in pending:
            index, summary_text = job.get()
            summaries.append({"index": index, "summary": summary_text})

    return summaries



In [None]:
# Re-summarize every transcript that was matched by more than one old summary.
matched_more_than_once_results = summarize_texts_with_multiprocessing(co_transcripts, matched_more_than_once)

In [None]:
# Re-summarize every transcript for which no old summary was found.
# This takes a few hours, as these seem to be the longest transcripts
# (about 4-6 times as long as the average).
not_matched_results = summarize_texts_with_multiprocessing(co_transcripts, not_matched)

In [None]:
# Save the recomputed summaries to disk.
# The code below is wrapped in a string literal, i.e. it is disabled and does
# not run; remove the triple quotes to actually write the files.
'''

with open("data/punctuation_correction/summaries/not_matched_results.pkl", "wb+") as file:
    pickle.dump(not_matched_results, file)

with open("data/punctuation_correction/summaries/matched_more_than_once_results.pkl", "wb+") as file:
    pickle.dump(matched_more_than_once_results, file)
'''

In [None]:
# Load the recomputed summaries from disk, so the long-running
# re-summarization cells do not have to be executed again.
with open("data/punctuation_correction/summaries/not_matched_results.pkl", "rb") as not_matched_file:
    not_matched_results = pickle.load(not_matched_file)

with open("data/punctuation_correction/summaries/matched_more_than_once_results.pkl", "rb") as multi_match_file:
    matched_more_than_once_results = pickle.load(multi_match_file)

### merge all summaries together in the correct order

In [None]:
# Index the recomputed "not matched" summaries by transcript index.
# The first entry for an index wins; later duplicates are only reported.
not_matched_results_dict = {}

for entry in not_matched_results:
    transcript_index = entry["index"]
    if transcript_index not in not_matched_results_dict:
        not_matched_results_dict[transcript_index] = entry["summary"]
    else:
        print("warning! ", entry)

In [None]:
# Index the recomputed "matched more than once" summaries by transcript index.
# The first entry for an index wins; later duplicates are only reported.
matched_more_than_once_results_dict = {}

for entry in matched_more_than_once_results:
    transcript_index = entry["index"]
    if transcript_index not in matched_more_than_once_results_dict:
        matched_more_than_once_results_dict[transcript_index] = entry["summary"]
    else:
        print("warning! ", entry)

In [None]:
# Assemble exactly one summary per transcript index:
#   * transcripts with a single matching old summary reuse that summary,
#   * all others take their freshly recomputed summary.
# k comes from range(), so it is always an int and needs no NaN check.
cleaned_summaries = {}
for k in range(len(co_transcripts)):
    if k in already_matched_result_dict:
        # already_matched_result_dict maps transcript index -> summary position.
        cleaned_summaries[k] = sumy_summaries_all[already_matched_result_dict[k]]
    elif k in not_matched_results_dict:
        cleaned_summaries[k] = not_matched_results_dict[k]
    else:
        # A KeyError here means the recomputed results do not cover this transcript.
        cleaned_summaries[k] = matched_more_than_once_results_dict[k]

In [None]:
# Turn the index -> summary mapping into a list ordered like co_transcripts.
final_sumy_summaries = [
    cleaned_summaries[transcript_index]
    for transcript_index in range(len(co_transcripts))
]
        

In [None]:
# Sanity check: number of summaries in the final list.
len(final_sumy_summaries)

### Inspect a few examples to validate that the order is correct

In [None]:
# Show the transcript indices that kept their single matching old summary.
# To inspect the other two groups instead, evaluate one of the names below.
already_matched_result_dict.keys()
# not_matched
# matched_more_than_once

In [None]:
# Transcript index used for the manual summary/transcript comparison.
ex_index = 29722

In [None]:
# Print the final summary stored for the example index.
pprint(final_sumy_summaries[ex_index])

In [None]:
# Print the transcript at the example index to compare it with its summary.
pprint(co_transcripts[ex_index])

### save final summaries to disk

In [None]:
# Persist the final summaries, ordered like co_transcripts.
with open("data/punctuation_correction/summaries/sumy_summaries_cleaned.pkl", "wb+") as out_file:
    pickle.dump(final_sumy_summaries, out_file)