In [2]:
!pip install PyPDF2
!pip install google-generativeai

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
import os
import google.generativeai as genai
import PyPDF2
import csv

# Configure the API key.
# SECURITY: the key must never be written into the notebook itself, because every
# copy of the file (and its history) would carry it. A key that was ever stored
# in this cell has to be treated as leaked: revoke it and issue a new one.
# The key is taken from the GOOGLE_API_KEY environment variable when it is set;
# otherwise it is requested interactively without being echoed to the output.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Function to extract the first two pages of text from a PDF
def extract_first_two_pages(pdf_path):
    """Return the text of the first two pages of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        str: The extracted text of page 1 and, when it exists, page 2, joined
        by a newline. A page that yields no text contributes an empty string;
        a document without pages yields "".
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Never index past the last page of a one-page (or empty) document.
        page_count = min(2, len(reader.pages))
        # `or ""` keeps the join safe should the extractor return None/empty
        # for a page without a text layer.
        page_texts = [
            reader.pages[index].extract_text() or ""
            for index in range(page_count)
        ]
    # Join with a newline so the last word of page 1 does not fuse with the
    # first word of page 2.
    return "\n".join(page_texts)

# Function to process the extracted text using Gemini's GenerativeModel
def process_text_with_gemini(text, model_name='gemini-1.5-flash'):
    """Ask a Gemini model to pull title, author, e-mail and abstract out of text.

    Args:
        text: Raw document text to analyse.
        model_name: Name of the Gemini model to use. Defaults to
            'gemini-1.5-flash'.

    Returns:
        str: The model's reply with surrounding whitespace removed.

    Raises:
        Any exception raised by the Gemini client while generating the reply
        or while reading its text is propagated unchanged.
    """
    model = genai.GenerativeModel(model_name)

    # Define the prompt for the model. The reply layout is spelled out
    # explicitly because parse_extracted_info() recognises the fields only by
    # these exact "**Label:**" markers; without it the format is left to chance.
    prompt = f"""
    Extract the following from the provided document text:
    - Title
    - Author (compulsory)
    - Contact information (if present) (mail only)
    - Abstract (exact words)

    Reply using exactly this layout and these labels, with nothing before or after it:
    **Title:** <title on one line>
    **Author:** <all authors on one line, separated by commas>
    **Contact Information:**
    - <one e-mail address per line; leave the list empty if none is present>
    **Abstract:** <abstract>

    Document text:
    {text}
    """

    response = model.generate_content(prompt)
    return response.text.strip()

# Function to parse the structured Gemini output into dictionary format
def parse_extracted_info(extracted_info):
    """Parse the labelled Gemini reply into a field dictionary.

    The reply is expected to mark each field with a bold label at the start of
    a line: "**Title:**", "**Author:**", "**Contact Information:**" and
    "**Abstract:**". Text on the label line and on the lines that follow it
    (until the next label) belongs to that field. Lines before the first label
    and blank lines are ignored.

    Args:
        extracted_info: The model reply as a string. None or "" is accepted.

    Returns:
        dict: Keys "Title", "Author", "Contact Information" and "Abstract".
        Multi-line titles and abstracts are joined with a space, authors with
        ", " and contact entries with "; ". Fields that are absent stay "".
    """
    result = {
        "Title": "",
        "Author": "",
        "Contact Information": "",
        "Abstract": ""
    }
    # Separator used when a field is spread over several lines.
    joiners = {
        "Title": " ",
        "Author": ", ",
        "Contact Information": "; ",
        "Abstract": " ",
    }
    # Fields whose extra lines are list items and may carry a leading bullet.
    bulleted = ("Author", "Contact Information")
    labels = {f"**{name}:**": name for name in result}
    collected = {name: [] for name in result}
    current_section = None

    for raw_line in (extracted_info or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue  # blank lines carry no content and must not add stray spaces

        label = next((lab for lab in labels if line.startswith(lab)), None)
        if label is not None:
            current_section = labels[label]
            # Slice the label off instead of str.replace() so that only the
            # leading marker is removed; keep any value given on the same line.
            remainder = line[len(label):].strip()
            collected[current_section] = [remainder] if remainder else []
        elif current_section is not None:
            if current_section in bulleted and (
                line[:1] in ("-", "•") or line[:2] == "* "
            ):
                # Remove only the leading bullet; hyphens inside an e-mail
                # address or a name must survive.
                line = line[1:].strip()
            if line:
                collected[current_section].append(line)

    for name, parts in collected.items():
        result[name] = joiners[name].join(parts)

    return result

# Function to process a single PDF file and return the result as a dictionary
def process_single_pdf(pdf_path, filename):
    """Run one PDF through the extract -> Gemini -> parse pipeline.

    Args:
        pdf_path: Location of the PDF file on disk.
        filename: Name reported in the progress message and stored in the result.

    Returns:
        dict: The parsed fields produced by parse_extracted_info(), with an
        additional "Filename" entry set to `filename`.
    """
    print(f"Processing {filename}...")

    # Chain the three stages: page text -> model reply -> field dictionary.
    record = parse_extracted_info(
        process_text_with_gemini(
            extract_first_two_pages(pdf_path)
        )
    )

    # Tag the record with its source file so the row can be traced back.
    record.update({"Filename": filename})
    return record

# Function to process selected PDFs and store the results in a CSV file
def process_selected_pdfs_and_store_in_csv(folder_path, output_csv):
    """Process every PDF in a folder and append one CSV row per file.

    Args:
        folder_path: Directory that is scanned (non-recursively) for PDF files.
        output_csv: Path of the CSV file. Rows are appended; the header row is
            written only when the file does not exist yet or is empty.

    Returns:
        None. The result is the content appended to `output_csv`.
    """
    # Sorted so the row order is reproducible (os.listdir order is arbitrary);
    # the extension test is case-insensitive so "PAPER.PDF" is not skipped.
    pdf_files = sorted(
        file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')
    )

    # A header is needed for a missing file and also for an existing but empty
    # one (e.g. left behind by a run that failed before the first row).
    write_header = (
        not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    )
    fieldnames = ["Filename", "Title", "Author", "Contact Information", "Abstract"]

    # Open the CSV file for appending
    with open(output_csv, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        if write_header:
            writer.writeheader()

        # Process each PDF in the folder and write its row immediately
        for pdf_file in pdf_files:
            pdf_path = os.path.join(folder_path, pdf_file)
            result = process_single_pdf(pdf_path, pdf_file)
            writer.writerow(result)

# Main function
def main(folder_path="/content/paper_samples", output_csv="meta_Data.csv"):
    """Run the metadata extraction over a folder of PDFs.

    Args:
        folder_path: Folder containing the PDFs. Defaults to
            "/content/paper_samples".
        output_csv: CSV file the rows are appended to. Defaults to
            "meta_Data.csv" (relative to the current working directory).
    """
    process_selected_pdfs_and_store_in_csv(folder_path, output_csv)

# Entry point: start the batch run only when this code is executed directly
# (__name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Processing 2001.08361.pdf...
Processing 2009.06489.pdf...
Processing 1409.3215v3.pdf...
