In [None]:
!pip install langchain sentence-transformers langchain-community

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from two different names by accident.
MODEL_NAME = "facebook/bart-large-cnn"

# Load the tokenizer and the seq2seq model, then place the model on `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

In [3]:
import json
from langchain_text_splitters import RecursiveJsonSplitter

In [4]:
doc_path = '/content/corpus.json'

# Read the corpus inside a context manager so the file handle is closed as
# soon as parsing finishes, and decode explicitly as UTF-8 instead of relying
# on the platform's default encoding.
with open(doc_path, encoding='utf-8') as corpus_file:
    json_content = json.load(corpus_file)

In [5]:
def summarize_text(article, max_length=150, min_length=30):
    """Summarize ``article`` using the module-level ``tokenizer`` and ``model``.

    Args:
        article: Text to summarize. It is prefixed with "summarize: " and
            truncated to at most 1024 tokens before generation.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    # Tokenize the input text; anything beyond 1024 tokens is truncated.
    # NOTE(review): "summarize: " is a T5-style task prefix, while this notebook
    # loads a BART checkpoint — confirm the prefix is actually intended.
    input_ids = tokenizer.encode(
        "summarize: " + article,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )

    # The model is moved to an accelerator when one is available, but the
    # tokenizer output is not. Generation fails on a device mismatch, so put
    # the ids wherever the model weights are.
    input_ids = input_ids.to(model.device)

    # Generate the summary with beam search.
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Decode the generated summary tokens back into text.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
summarized_json = []

# Walk the corpus and, for every document with usable text, replace its 'body'
# with a 'fact' field holding the generated summary.
for idx, doc in enumerate(json_content):
    article = doc.get('body', '')  # Missing 'body' falls back to an empty string
    print("Status: ", idx)

    # Work on a shallow copy so json_content keeps its original 'body' text;
    # otherwise re-running this cell would find nothing left to summarize.
    summarized_doc = dict(doc)

    # Only non-empty strings can be summarized. A null or non-string 'body' is
    # passed through untouched instead of crashing the whole run on .strip().
    if isinstance(article, str) and article.strip():
        try:
            # Store the summary under 'fact' and drop the full text.
            summarized_doc['fact'] = summarize_text(article)
            del summarized_doc['body']
        except Exception as e:
            # Best effort: report the failure and keep the document as loaded.
            print(f"Error summarizing document: {e}")

    # Collect the (possibly summarized) document.
    summarized_json.append(summarized_doc)

In [10]:
# Persist the collected documents as pretty-printed JSON (4-space indent),
# streaming straight into the file rather than building the string first.
with open('summarized.json', 'w') as out_file:
    json.dump(summarized_json, out_file, indent=4)