In [1]:
import math

import keybert
import pandas as pd
from transformers import pipeline

In [2]:
# Summarization pipeline backed by a fine-tuned BART-large checkpoint.
# `truncation=True` is passed through to the pipeline as in the original setup.
summarizer = pipeline(
    task="summarization",
    model="vmarklynn/bart-large-cnn-samsum-acsi-ami-v2",
    truncation=True,
)

# KeyBERT keyword extractor configured with the "all-mpnet-base-v2" model.
kw_model = keybert.KeyBERT(model="all-mpnet-base-v2")

In [3]:
def formatText(text):
    """Render a transcript DataFrame as one "speaker: utterance" line per row.

    Parameters
    ----------
    text : pandas.DataFrame
        Transcript table containing ``speaker_label`` and ``text`` columns.

    Returns
    -------
    str
        The rows in their original order, each formatted as
        ``"<speaker_label>: <text>"`` and joined with newlines. A frame with
        no rows yields an empty string.

    Raises
    ------
    KeyError
        If a frame with at least one row lacks the ``speaker_label`` or
        ``text`` column.
    """
    # No rows means nothing to format; this also keeps a completely empty
    # frame (no columns at all) from raising on the column lookups below.
    if len(text) == 0:
        return ""

    # Read the two columns directly rather than building a Series per row with
    # ``iterrows``: that is slower and upcasts values when every column is
    # numeric (an integer speaker label 1 would be rendered as "1.0").
    return "\n".join(
        f"{speaker}: {utterance}"
        for speaker, utterance in zip(text["speaker_label"], text["text"])
    )

In [4]:
def _extractKeywords(text, ngram_size, top_n=5):
    """Return up to ``top_n`` keyphrases made of exactly ``ngram_size`` words."""
    scored = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(ngram_size, ngram_size),
        stop_words="english",
        highlight=False,
        top_n=top_n,
    )
    # The extractor returns (phrase, score) pairs; passing them through a dict
    # keeps the phrases in order while dropping scores and repeated phrases.
    return list(dict(scored))


def summarizeText(transcript):
    """Summarize a transcript and extract its 1-, 2- and 3-gram keywords.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Transcript table accepted by ``formatText``.

    Returns
    -------
    dict
        ``"transcription"``: the formatted transcript text,
        ``"summary"``: the summary string produced by ``summarizer``,
        ``"keywords_list_1"`` / ``"keywords_list_2"`` / ``"keywords_list_3"``:
        lists of up to five keyphrases of the corresponding n-gram size.
    """
    text = formatText(transcript)

    print("\n\nSummarizing Text...")
    summary = summarizer(text)[0]["summary_text"]
    print("\n", summary, "\n")

    response = {
        # The formatted text itself, not the ``formatText`` function object.
        "transcription": text,
        "summary": summary,
    }
    for ngram_size in (1, 2, 3):
        keywords = _extractKeywords(text, ngram_size)
        print(f"{ngram_size} gram keywords: ", keywords)
        response[f"keywords_list_{ngram_size}"] = keywords
    return response

In [5]:
def summarizeSummary(summary_input, wordCount=1024):
    """Run an existing summary through the summarizer a second time.

    Parameters
    ----------
    summary_input : str
        Text to condense (typically the ``"summary"`` of ``summarizeText``).
    wordCount : int or str, optional
        Length budget used to derive the generation bounds: ``min_length`` is
        10% and ``max_length`` is 25% of it, both rounded up. Anything
        ``int()`` accepts is allowed. Defaults to 1024, the value that was
        previously hard-coded.

    Returns
    -------
    dict
        ``{"summary": <second-pass summary string>}``.

    Raises
    ------
    ValueError
        If ``wordCount`` cannot be converted to an integer or is not positive.

    Notes
    -----
    NOTE(review): when ``summary_input`` is shorter than ``min_length`` the
    model is asked for more text than it was given; the recorded run in this
    notebook shows the output growing and drifting off-topic in that case.
    Pass a smaller ``wordCount`` for short inputs.
    """
    budget = int(wordCount)
    if budget <= 0:
        raise ValueError(f"wordCount must be positive, got {wordCount!r}")

    # Compute the bounds once so the printed values and the values passed to
    # the summarizer cannot diverge.
    min_length = math.ceil(budget * 0.1)
    max_length = math.ceil(budget * 0.25)

    print("min: ", min_length, "max: ", max_length)
    print("\n\nSummarizing again...")
    summary = summarizer(
        summary_input,
        min_length=min_length,
        max_length=max_length,
    )[0]["summary_text"]
    print("\n", summary, "\n")

    return {"summary": summary}

In [38]:
# Load the transcript CSV, using its first column as the DataFrame index.
transcript = pd.read_csv(
    "data/Discussion_on_Illegal_Migration_and_Border_Crisis_Bill.csv", index_col=0
)
# Preview the first rows to check the columns that were read.
transcript.head()

Unnamed: 0,speaker_label,start_time,end_time,text
0,spk_0,0.00015,0.586667,Lets go ahead and get to the major news here i...
1,spk_1,0.586817,0.93765,He said no self respecting senators should agr...
2,spk_2,0.953333,1.552233,So we actually had this bill came out yesterda...
3,spk_0,1.552383,4.691767,read it for yourselves and uh many Republicans...
4,spk_3,4.691917,9.377383,"Yeah, I mean, I think its pretty clear there. ..."


In [39]:
# First pass: summarize the whole transcript and extract keywords. The full
# response dict is kept because its "summary" entry is reused in a later cell.
summary_transcript = summarizeText(transcript)
print(summary_transcript["summary"])



Summarizing...

 The meeting was about the border bill. The first procedural vote was on Wednesday. The group discussed the details of the new border bill, which was negotiated by Senator James Lankford, Senator Kirsten Cinema, and some members of the Democratic side. The meeting ended with a general discussion about the future of the project. 

['senators', 'senator', 'senate', 'filibuster', 'legislation']
['senate border', 'senator lee', 'consideration senate', 'senate reject', 'negotiated senator']
['senate border deal', 'senate reject understand', 'senator lankford said', 'respecting senators agree', 'says consideration senate']
The meeting was about the border bill. The first procedural vote was on Wednesday. The group discussed the details of the new border bill, which was negotiated by Senator James Lankford, Senator Kirsten Cinema, and some members of the Democratic side. The meeting ended with a general discussion about the future of the project.


In [40]:
# Second pass: feed the first-pass summary back through the summarizer.
# NOTE(review): the recorded run warns that this input is only 65 tokens while
# summarizeSummary requests at least 103, so this pass lengthens the text
# instead of condensing it -- confirm the length settings are intended.
summarized_summary = summarizeSummary(summary_transcript["summary"])
final_summary = summarized_summary["summary"]
display(final_summary)

Your max_length is set to 256, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


min:  103 max:  256


Summarizing again...

 The group discussed the details of the new border bill. The first procedural vote was on Wednesday. The meeting ended with a general discussion about the future of the project. The group also discussed the logistics of the border bill, which was negotiated by Senator James Lankford, Senator Kirsten Cinema, and some members of the Democratic side. The final decision about the project was made by a round shape shape and was agreed by all of them. The team also discussed how to make sure that the project would be successful. 



'The group discussed the details of the new border bill. The first procedural vote was on Wednesday. The meeting ended with a general discussion about the future of the project. The group also discussed the logistics of the border bill, which was negotiated by Senator James Lankford, Senator Kirsten Cinema, and some members of the Democratic side. The final decision about the project was made by a round shape shape and was agreed by all of them. The team also discussed how to make sure that the project would be successful.'