In [None]:
#pip install nest_asyncio PyMuPDF pandas llama-index llama-index-extractors-entity

# Document metadata extraction using SimpleDirectoryReader
# Make sure the required packages (see the pip command above) are installed first

import os
import fitz  # PyMuPDF
import pandas as pd
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Document
from llama_index.llms.openai import OpenAI
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# Allow nested asyncio event loops; the ingestion pipeline is run from inside
# a notebook cell, where an event loop is already active.
nest_asyncio.apply()

# Path to the local directory containing the PDF files
local_directory = "/Users/pradhikshasuresh/Documents/Python/Chatbot"

# Use SimpleDirectoryReader to load the documents from the local directory
reader = SimpleDirectoryReader(input_dir=local_directory)
documents = reader.load_data()

# Keep only the first 3 loaded Document objects to bound the number of LLM calls.
# NOTE(review): SimpleDirectoryReader's default PDF reader appears to emit one
# Document per page, so this may be 3 pages rather than 3 files -- confirm.
documents = documents[:3]

# Set up the OpenAI key. setdefault() keeps a real key that is already exported
# in the environment instead of clobbering it with the placeholder below;
# replace the placeholder only if no key is configured elsewhere.
os.environ.setdefault("OPENAI_API_KEY", "My_OpenAI_key")
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.2)

# Build the transformation stages in the same order the pipeline applies them.

# Stage 1: split each document into nodes (library-default splitter settings)
node_parser = SentenceSplitter()

# Stage 2: LLM-backed title extraction, configured with nodes=5
title_extractor = TitleExtractor(nodes=5, llm=llm)

# Stage 3: entity extraction on the CPU, threshold 0.5, entity labels disabled
entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,
    device="cpu",
)

# Stage 4: LLM-backed summary of each node itself ("self" only)
summary_extractor = SummaryExtractor(summaries=["self"], llm=llm)

# Stage 5: LLM-backed extraction of 3 questions per node
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm)

# Stage 6: LLM-backed extraction of 10 keywords per node
keyword_extractor = KeywordExtractor(keywords=10, llm=llm)

transformations = [
    node_parser,
    title_extractor,
    entity_extractor,
    summary_extractor,
    qa_extractor,
    keyword_extractor,
]

# Create the ingestion pipeline that chains the stages above
pipeline = IngestionPipeline(transformations=transformations)

# Destination of the exported metadata (relative to the current working directory)
output_csv_path = "docmetadata_output.csv"

# Push the loaded documents through every pipeline stage; the result is a list of nodes
nodes = pipeline.run(documents=documents)

# One metadata dict per node -> one CSV row per node
metadata_list = [node.metadata for node in nodes]

# Tabulate the metadata and write it out without the index column
df = pd.DataFrame(metadata_list)
df.to_csv(output_csv_path, index=False)

# Report both the relative and the absolute location of the CSV
print(
    f"Metadata has been saved to {output_csv_path}",
    f"Full path: {os.path.abspath(output_csv_path)}",
    sep="\n",
)

In [None]:
# Document metadata extraction using a PDF loader (PyMuPDF)
import os
import pandas as pd
from llama_index.core import Document
from llama_index.llms.openai import OpenAI
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
import fitz  # PyMuPDF for PDF handling
import time  # Import time module for sleep function

# Path to the local directory containing the PDF files
local_directory = "/Users/pradhikshasuresh/Documents/Python/Chatbot"

# List all PDF files in the directory.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting makes
#   the "first two" selection below reproducible between runs and machines.
# - lower(): also pick up files whose extension is written ".PDF" / ".Pdf".
pdf_files = sorted(
    file for file in os.listdir(local_directory) if file.lower().endswith(".pdf")
)

# Limit to the first two PDF files for testing
pdf_files = pdf_files[:2]

# Set up the OpenAI key. setdefault() keeps a real key that is already exported
# in the environment instead of clobbering it with the placeholder below;
# replace the placeholder only if no key is configured elsewhere.
os.environ.setdefault("OPENAI_API_KEY", "My_OpenAI_key")
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.2)

# Build the transformation stages in the same order the pipeline applies them.

# Stage 1: split each document into nodes (library-default splitter settings)
node_parser = SentenceSplitter()

# Stage 2: LLM-backed title extraction, configured with nodes=5
title_extractor = TitleExtractor(nodes=5, llm=llm)

# Stage 3: entity extraction on the CPU, threshold 0.5, entity labels disabled
entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,
    device="cpu",
)

# Stage 4: LLM-backed summaries, requested for both "prev" and "self"
summary_extractor = SummaryExtractor(summaries=["prev", "self"], llm=llm)

# Stage 5: LLM-backed extraction of 3 questions per node
qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm)

# Stage 6: LLM-backed extraction of 10 keywords per node
keyword_extractor = KeywordExtractor(keywords=10, llm=llm)

transformations = [
    node_parser,
    title_extractor,
    entity_extractor,
    summary_extractor,
    qa_extractor,
    keyword_extractor,
]

# Create the ingestion pipeline that chains the stages above
pipeline = IngestionPipeline(transformations=transformations)

# Accumulates the metadata of every node from EVERY processed PDF.
# It must be created once, before the loop: re-creating it per file would
# discard the rows of all previously processed PDFs and leave only the last one.
metadata_list = []

# Process each PDF file
for pdf_file in pdf_files:
    # Construct full path to the PDF file
    pdf_path = os.path.join(local_directory, pdf_file)

    # Extract the text of all pages with PyMuPDF (fitz). The context manager
    # closes the underlying file handle even if text extraction raises, and
    # join() avoids repeated string re-allocation on long documents.
    with fitz.open(pdf_path) as doc:
        pdf_text = "".join(page.get_text() for page in doc)

    # Wrap the raw text in a llama-index Document for the pipeline
    document = Document(text=pdf_text)

    # Run the ingestion pipeline to split the text and extract metadata
    nodes = pipeline.run(documents=[document])

    # Append this file's node metadata to the shared list (one row per node)
    metadata_list.extend(node.metadata for node in nodes)

# Convert metadata to a DataFrame
df = pd.DataFrame(metadata_list)

# Save DataFrame to CSV (path is relative to the current working directory)
output_csv_path = "docmetadata1_output.csv"
df.to_csv(output_csv_path, index=False)

# Print the location where the CSV is saved
print(f"Metadata has been saved to {output_csv_path}")
print(f"Full path: {os.path.abspath(output_csv_path)}")