# In [None]:
from pdfminer.high_level import extract_text
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation as punctuations
from heapq import nlargest
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Extractive Part------------------------------------------
# spaCy pipeline applied to each section before extractive scoring.
nlp = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection, materialised as a list; tokens whose
# lower-cased text appears here are ignored when counting word frequencies.
stopwords = [*STOP_WORDS]

def generate_extractive_summary(doc):
    """Return an extractive summary made of the highest-scoring sentences.

    Every token that is not a stop word, whitespace, or punctuation is counted
    (case-insensitively) and its count is normalised by the largest count.
    A sentence's score is the sum of the normalised frequencies of the tokens
    it contains. Roughly the top 30% of sentences (always at least one) are
    kept.

    Args:
        doc: A parsed document. Iterating it must yield tokens exposing
            ``.text``; ``doc.sents`` must yield hashable sentence objects
            that are iterable over tokens and expose ``.text``. In this file
            it is the value returned by ``nlp(...)``.

    Returns:
        str: The selected sentences' text joined by single spaces, ordered
        from highest to lowest score (not document order), or the literal
        message ``"No valid words in the document."`` when no countable
        word was found.
    """
    punctuation = punctuations + '\n'
    # Build the set once so each token costs an O(1) membership test.
    stop_set = set(stopwords)

    word_frequencies = {}
    for token in doc:
        # Count under the lower-cased form: the scoring loop below looks
        # words up by their lower-cased text, so keys must match that.
        key = token.text.lower()
        if key in stop_set:
            continue
        # Skip whitespace-only tokens (e.g. "  " or "\n\n") and tokens built
        # solely from punctuation characters (e.g. "..." or "--"); a plain
        # substring test against the punctuation string would let these in.
        if key.isspace() or all(ch in punctuation for ch in key):
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    if not word_frequencies:
        return "No valid words in the document."

    # Normalise so the most frequent word has weight 1.0.
    max_frequency = max(word_frequencies.values())
    for key in word_frequencies:
        word_frequencies[key] /= max_frequency

    sentence_tokens = list(doc.sents)

    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    # Keep ~30% of the sentences, but never zero: with fewer than four
    # sentences the truncated product would otherwise give an empty summary.
    select_length = max(1, int(len(sentence_tokens) * 0.3))
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in summary)

def generate_abstractive_summary(encoding):
    """Generate summary text for a tokenized input with the module-level model.

    Args:
        encoding: Mapping providing ``'input_ids'`` and ``'attention_mask'``
            entries that support ``.to(device)`` (in this file, the output of
            ``tokenizer(..., return_tensors='pt')``).

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped and tokenization spaces cleaned up.
    """
    # Move both inputs to the device the model lives on.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Beam-search settings passed straight through to ``model.generate``.
    generation_options = dict(
        max_length=150,  # Adjust the max_length as needed
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **generation_options,
    )

    first_sequence = generated_ids[0]
    return tokenizer.decode(
        first_sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Abstractive Part-------------------------------------------
# Resolve the target device once: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved checkpoint, mapping its tensors onto the chosen device.
checkpoint_path = 't5_English_final.pth'
checkpoint = torch.load(checkpoint_path, map_location=device)

# Start from the pretrained "t5-base" model, overwrite its weights with the
# checkpoint's 'model_state_dict', then move it to the device and switch it
# to evaluation mode.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device).eval()

# Tokenizer matching the base model.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Destination file for the per-section abstractive summaries
output_file_path = 'final_abstractive_summaries.txt'

# Tokenizer settings applied to every section: truncate/pad each input to
# 512 tokens and return PyTorch tensors together with an attention mask.
tokenizer_options = {
    'max_length': 512,
    'padding': 'max_length',
    'truncation': True,
    'return_attention_mask': True,
    'add_special_tokens': True,
    'return_tensors': 'pt',
}

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for section_number, section_text in enumerate(section_data_list, start=1):
        # Stage 1: extractive summary of the spaCy-parsed section text.
        extractive_summary = generate_extractive_summary(nlp(section_text))

        # Stage 2: tokenize the extractive summary and feed it to the
        # abstractive model.
        encoding = tokenizer(extractive_summary, **tokenizer_options)
        abstractive_summary = generate_abstractive_summary(encoding)

        # One entry per section, numbered from 1, followed by a blank line.
        output_file.write(f"Section {section_number}\n{abstractive_summary}\n\n")

print("Final abstractive summaries saved to 'final_abstractive_summaries.txt'.")
