In [None]:
!pip install pymupdf


In [None]:
!pip install pdfplumber

In [None]:
import os
from getpass import getpass

import fitz
import pandas as pd
import pdfplumber  # For table extraction
from transformers import pipeline

In [None]:
from huggingface_hub import login

In [None]:
# Security: never commit a real Hugging Face token to the notebook. The token
# that was previously hardcoded in this cell must be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
# Reuse HF_TOKEN when the environment already provides it; otherwise prompt
# for it interactively so the secret never lands in the notebook source.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
# Checkpoint name shared by the summarization model and its tokenizer.
T5_CHECKPOINT = "t5-large"

# Authenticate against the Hugging Face Hub with the token stored in HF_TOKEN.
login(token=os.environ["HF_TOKEN"])

# Build the summarization pipeline that generate_text() calls.
summarizer = pipeline(
    task="summarization",
    model=T5_CHECKPOINT,
    tokenizer=T5_CHECKPOINT,
)

In [None]:

def get_page_and_table_summary(page_text, table, max_text_chars=500):
    """
    Build a summarization prompt that combines page text with one table.

    Args:
        page_text: Text of the page the table was found on. ``None`` is
            treated as an empty string.
        table: Iterable of rows, each row an iterable of cell values. ``None``
            is treated as an empty table.
        max_text_chars: Number of leading characters of ``page_text`` to
            include in the prompt. Defaults to 500.

    Returns:
        The prompt string, stripped of leading and trailing whitespace. Each
        table row is rendered on its own line with cells joined by " | ".
    """
    parts = [
        "Summarize the following content along with the table:\n\n",
        (page_text or "")[:max_text_chars],
        "\n",
        "Table:\n",
    ]

    for row in table or []:
        if not row:
            # Skip rows that carry no cells at all; they add only blank lines.
            continue
        # Empty table cells can arrive as None; render them as empty strings
        # so the literal word "None" does not leak into the prompt.
        cells = ("" if cell is None else str(cell) for cell in row)
        parts.append(" | ".join(cells))
        parts.append("\n")

    return "".join(parts).strip()

In [None]:


# NOTE: the triple-quoted string below holds a disabled, table-only prompt
# builder. It is a bare string literal, so no function is defined by it.
'''def get_prompt_table_summarization(table):
    """
    Generate a prompt for table summarization.
    """
    prompt = "Summarize the following table:\n"

    # Convert the table rows to text
    for row in table:
        if row:  # Ensure the row is not empty
            row_text = " | ".join([str(cell) for cell in row])  # Format each row with '|' separator
            prompt += row_text + "\n"

    return prompt.strip()  # Remove any trailing spaces/newlines'''

def generate_text(prompt):
    """
    Summarize ``prompt`` with the module-level ``summarizer`` pipeline.

    Args:
        prompt: Text to summarize.

    Returns:
        The generated summary text. Sentinel strings are returned instead of
        raising: "Empty prompt" for an empty or whitespace-only prompt,
        "No summary generated" when the pipeline returns nothing, and
        "Error generating summary" when inference raises.
    """
    min_length = 5        # lower bound passed to the pipeline
    max_length_cap = 150  # upper bound that keeps summaries concise

    # A whitespace-only prompt has nothing to summarize either.
    if not prompt or not prompt.strip():
        print("Empty prompt, skipping summarization.")
        return "Empty prompt"

    try:
        # Word count is only a rough proxy for the model's token count.
        input_length = len(prompt.split())
        print(f"Input Length: {input_length}")

        # Never ask for a summary longer than the input, but keep max_length
        # at or above min_length so the generation bounds stay feasible for
        # very short prompts.
        max_length = max(min_length, min(max_length_cap, input_length))

        summary = summarizer(prompt, max_length=max_length, min_length=min_length, do_sample=False)

        print(f"Summary: {summary}")

        return summary[0]['summary_text'] if summary else "No summary generated"
    except Exception as e:
        # Best-effort: report the failure and return a sentinel so a batch
        # run over many tables is not aborted by one bad input.
        print(f"Error during model inference: {str(e)}")
        return "Error generating summary"

def extract_table_data_from_pdfs(pdf_directory):
    """
    Summarize every sufficiently large table found in the PDFs of a directory.

    For each PDF in ``pdf_directory`` the tables of every page are extracted
    with pdfplumber, the page text is read with PyMuPDF (``fitz``), and each
    table with more than two rows is summarized via
    ``get_page_and_table_summary`` and ``generate_text``. The results are
    written to ``extracted_table_summaries.csv`` inside ``pdf_directory``.

    Args:
        pdf_directory: Directory that contains the PDF files and receives the
            output CSV.

    Returns:
        None. The CSV always has the columns ``pdf_name``, ``page_number``
        (1-based) and ``table_summary``, even when no table was summarized.
    """
    columns = ["pdf_name", "page_number", "table_summary"]
    data = []

    # Sort for a reproducible processing order (os.listdir order is arbitrary)
    # and match the extension case-insensitively so ".PDF" files are included.
    pdf_files = sorted(f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf'))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing {pdf_file}...")

        try:
            # Open each document once per file (not once per page) and let the
            # context managers close both handles when the file is done.
            with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as pdf_fitz:
                for i, page_plumber in enumerate(pdf.pages):
                    try:
                        # Filter out very small tables
                        tables = [t for t in page_plumber.extract_tables() if len(t) > 2]
                        if not tables:
                            continue

                        # Page text is only needed when the page has a table.
                        page_text = pdf_fitz[i].get_text()

                        for table in tables:
                            # Prompt includes both page text and table data
                            prompt = get_page_and_table_summary(page_text, table)
                            if not prompt:
                                continue
                            table_summary = generate_text(prompt)

                            data.append({
                                "pdf_name": pdf_file,
                                "page_number": i + 1,
                                "table_summary": table_summary
                            })
                            print(f"Table summarized on page {i + 1} of {pdf_file}")

                    except Exception as e:
                        print(f"Error on page {i + 1} of {pdf_file}: {str(e)}")
                        continue
        except Exception as e:
            # One unreadable PDF should not abort the whole batch.
            print(f"Error processing {pdf_file}: {str(e)}")
            continue

    # Passing explicit columns keeps the CSV header present even when no table
    # was found, so the file can still be read back with pd.read_csv.
    df = pd.DataFrame(data, columns=columns)
    output_csv_path = os.path.join(pdf_directory, "extracted_table_summaries.csv")
    df.to_csv(output_csv_path, index=False)
    print(f"Data saved to {output_csv_path}")

# Example usage: summarize the tables of every PDF in the directory below.
# The output CSV (extracted_table_summaries.csv) is written into this same
# directory. NOTE(review): the path looks like a mounted Google Drive folder
# in Colab — confirm and adjust for other environments.
pdf_directory_path = "/content/drive/MyDrive/ASAPP/papers"  # Change this to your PDF directory
extract_table_data_from_pdfs(pdf_directory_path)


In [None]:
# Load the summaries from the directory the extraction step wrote them to,
# rather than from a separately hardcoded path that can drift out of sync
# with pdf_directory_path and break a fresh Restart & Run All.
df = pd.read_csv(os.path.join(pdf_directory_path, "extracted_table_summaries.csv"))

In [None]:
# Preview the first rows of the loaded table summaries.
df.head()

In [None]:
# Preview the last rows of the loaded table summaries.
df.tail()