In [1]:
pip install transformers torch requests PyMuPDF python-docx


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl (16.0 MB)
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.0 MB 220.2 kB/s eta 0:01:13
   ---------------------------------------- 0.1/16.0 MB 365.7 kB/s eta 0:00:44
   ---------------------------------------- 0.1/16.0 MB 416.7 kB/s eta 0:00:39
   ---------------------------------------- 0.1/16.0 MB 416.7 kB/s eta 0:00:39
   ---------------------------------------- 0.1/16.0 MB 416.7 kB/s eta 0:00:39
   ---------------------------------------- 0.1/16.0 MB 416.7 kB/s eta 0:00:39
   ---------------------------------------- 0.1/16.0 MB 416.7 kB/s eta 0:

In [3]:
import os
# Hugging Face Hub setting: turn off its symlink warning. This is set here, in its
# own cell, so it is already in place when `transformers` is imported further down.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


In [5]:
# Step 1: Import required libraries
from transformers import pipeline
from docx import Document
import fitz  # PyMuPDF for PDF handling
import os

# Step 2: Define a function to read text from various file formats
def read_text_from_file(file_path):
    """
    Reads text from a .txt, .pdf, or .docx file.

    The format is selected from the file extension, compared case-insensitively
    (so "REPORT.PDF" is handled the same way as "report.pdf").

    Parameters:
    - file_path (str or os.PathLike): Path to the file.

    Returns:
    - content (str): Extracted text content. For .txt the file is decoded as
      UTF-8; for .pdf the text of every page is concatenated in page order;
      for .docx the paragraph texts are joined with newlines.

    Raises:
    - ValueError: If the extension is not .txt, .pdf, or .docx.
    """
    # Normalize once: accept pathlib.Path objects and ignore extension case.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as file:
            return file.read()

    if lowered.endswith(".pdf"):
        with fitz.open(path) as doc:
            # Join in one pass instead of growing a string with += per page.
            return "".join(page.get_text() for page in doc)

    if lowered.endswith(".docx"):
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    raise ValueError("Unsupported file format. Please use .txt, .pdf, or .docx files.")

# Step 3: Define a function for summarizing the text using a local model
def _get_summarization_pipeline(model_name="t5-small"):
    """Return the summarization pipeline for model_name, loading it only on first use."""
    # Cache on the function object so repeated summarize_text() calls reuse the
    # already-loaded model instead of re-reading the weights every time.
    cache = _get_summarization_pipeline.__dict__.setdefault("_loaded", {})
    if model_name not in cache:
        cache[model_name] = pipeline("summarization", model=model_name, tokenizer=model_name)
    return cache[model_name]


def summarize_text(text, max_length=150, min_length=40):
    """
    Summarizes the given text using the "t5-small" model.

    The model is loaded on the first call and reused afterwards.

    Parameters:
    - text (str): The input text to summarize.
    - max_length (int): The maximum length of the summary, passed through to the pipeline.
    - min_length (int): The minimum length of the summary, passed through to the pipeline.

    Returns:
    - summary (str): The generated summary.

    Raises:
    - ValueError: If text is empty or contains only whitespace.
    """
    # Fail fast with a clear message instead of asking the model to summarize nothing
    # (e.g. a scanned PDF from which no text could be extracted).
    if not text or not text.strip():
        raise ValueError("Cannot summarize empty text.")

    summarization_pipeline = _get_summarization_pipeline("t5-small")

    # NOTE(review): the whole text is sent in a single call; very long documents may
    # exceed the model's input limit -- consider chunking if that becomes an issue.
    # do_sample=False keeps generation deterministic.
    summary = summarization_pipeline(text, max_length=max_length, min_length=min_length, do_sample=False)

    # The pipeline returns one result dict per input; we passed a single string.
    return summary[0]['summary_text']

# Step 4: Define a function to save the summary in the same format
def save_summary_to_file(summary, original_file_path, output_path):
    """
    Saves the summary to a file in the same format as the original file.

    The output format is chosen from the extension of original_file_path
    (compared case-insensitively); output_path is only the destination.

    Parameters:
    - summary (str): The generated summary.
    - original_file_path (str or os.PathLike): Path to the original file.
    - output_path (str): Path where the summary should be saved.

    Raises:
    - ValueError: If the original file's extension is not .txt, .pdf, or .docx,
      or if the summary is too long to fit on a single PDF page.
    """
    source_name = os.fspath(original_file_path).lower()

    if source_name.endswith(".txt"):
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(summary)
    elif source_name.endswith(".pdf"):
        # Context manager guarantees the document handle is closed after saving.
        with fitz.open() as doc:
            page = doc.new_page()
            margin = 72  # one inch, in PDF points
            text_box = fitz.Rect(
                margin,
                margin,
                page.rect.width - margin,
                page.rect.height - margin,
            )
            # insert_textbox wraps lines inside the rectangle (plain text insertion
            # does not, so a long summary would run off the page). It returns a
            # negative number, and writes nothing, when the text does not fit --
            # so retry with progressively smaller fonts.
            for font_size in (11, 10, 9, 8):
                if page.insert_textbox(text_box, summary, fontsize=font_size) >= 0:
                    break
            else:
                raise ValueError("Summary is too long to fit on a single PDF page.")
            doc.save(output_path)
    elif source_name.endswith(".docx"):
        doc = Document()
        doc.add_paragraph(summary)
        doc.save(output_path)
    else:
        raise ValueError("Unsupported file format for saving. Please use .txt, .pdf, or .docx files.")

# Step 5: Example run - summarize one document and save the result
file_path = "sample_article (1).docx"  # Replace with your file path

try:
    # Extract the raw text from the source document
    original_text = read_text_from_file(file_path)
    print("Original Text Loaded.")

    # Condense it with the summarization model
    summary = summarize_text(original_text)
    print("Summary Generated.")

    # Output name = "summary_" + the source file's own name, so the extension
    # (and therefore the format) is carried over; written to the working directory
    output_file_path = "summary_" + os.path.basename(file_path)

    # Write the summary in the same format as the source file
    save_summary_to_file(summary, file_path, output_file_path)
    print(f"Summary saved at: {output_file_path}")
except Exception as e:
    # Any failure in the read / summarize / save steps is reported here instead of raising
    print(f"An error occurred: {e}")


Original Text Loaded.
Summary Generated.
Summary saved at: summary_sample_article (1).docx
