In [8]:
import os
import re
import PyPDF2
import openai
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables
load_dotenv()

# Set up OpenAI API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def sanitize_filename(filename):
    """
    Remove characters that are not allowed in Windows filenames.

    Besides the reserved punctuation (< > : " / \\ | ? *), ASCII control
    characters (including newlines and tabs) are removed, and trailing
    dots/spaces are stripped because Windows rejects names ending in them.

    Args:
    filename (str): The filename to sanitize

    Returns:
    str: A sanitized filename (may be empty if nothing valid remains)
    """
    # \x00-\x1f covers the control characters Windows forbids in names.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
    return cleaned.rstrip(' .')

def extract_pdf_content(pdf_path):
    """
    Extract text content from a PDF file.

    The text of every page is concatenated in page order with no separator.

    Args:
    pdf_path (str): Path to the PDF file

    Returns:
    str: Extracted text content from the PDF (empty if no text was found)
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join inside the `with` so pages are read while the file is open.
        # `or ""` guards against a page for which no text is returned.
        return "".join(page.extract_text() or "" for page in reader.pages)

def process_pdfs(directory):
    """
    Process all PDF files in a directory and store their content.

    Entries are matched on a case-insensitive ".pdf" extension, and only
    regular files are read (a sub-directory named "x.pdf" is skipped).
    Files are visited in sorted order so the result is deterministic.

    Args:
    directory (str): Path to the directory containing PDF files

    Returns:
    dict: A dictionary with PDF filenames as keys and their content as values
    """
    pdf_contents = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, filename)
        if not os.path.isfile(pdf_path):
            continue
        pdf_contents[filename] = extract_pdf_content(pdf_path)
    return pdf_contents

def generate_filename(content):
    """
    Generate a filename based on the content using OpenAI's GPT model.

    Only the first 1000 characters of the content are sent to the model.
    The suggestion is sanitized, limited to 50 characters in total, and
    always ends with ".pdf".

    Args:
    content (str): The content of the PDF

    Returns:
    str: A generated filename (at most 50 characters, ending in ".pdf")
    """
    max_length = 50
    extension = '.pdf'

    prompt = f"Based on the following content, suggest a concise and descriptive filename (max 50 characters, including .pdf extension):\n\n{content[:1000]}"

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that generates concise and descriptive filenames based on document content."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=60,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # The message content may be None; treat that as an empty suggestion.
    suggestion = response.choices[0].message.content or ""
    # Collapse newlines/tabs/repeated spaces so the name is a single line.
    suggestion = " ".join(suggestion.split())

    # Work on the stem only, so truncation can never cut into the extension.
    if suggestion.lower().endswith(extension):
        suggestion = suggestion[:-len(extension)]

    # Sanitize before truncating so removed characters don't waste the budget.
    stem = sanitize_filename(suggestion).strip()
    stem = stem[:max_length - len(extension)].rstrip(' .')

    # Avoid producing a bare ".pdf" when nothing usable was suggested.
    if not stem:
        stem = 'untitled'
    return stem + extension

# Specify the directory containing your PDF files
pdf_directory = 'C:\\Users\\samar\\Desktop\\Elementals.ai\\Renaming_Pdf\\pdfs'

# Process PDFs and store their content
pdf_contents = process_pdfs(pdf_directory)

# Generate new filenames for each PDF.
# Names are made unique (case-insensitively) against both the files already
# in the directory and the names chosen so far; otherwise os.rename could
# silently overwrite a file on POSIX or fail on Windows.
new_filenames = {}
taken_names = {name.lower() for name in os.listdir(pdf_directory)}
for filename, content in pdf_contents.items():
    candidate = generate_filename(content)
    # A file may keep (or only re-case) its own current name.
    if candidate.lower() != filename.lower():
        stem, extension = os.path.splitext(candidate)
        suffix = 1
        while candidate.lower() in taken_names:
            candidate = f"{stem}_{suffix}{extension}"
            suffix += 1
    taken_names.add(candidate.lower())
    new_filenames[filename] = candidate

# Print the results
for old_filename, new_filename in new_filenames.items():
    print(f"Old filename: {old_filename}")
    print(f"New filename: {new_filename}")
    print("-" * 50)

# Rename the files
for old_filename, new_filename in new_filenames.items():
    old_path = os.path.join(pdf_directory, old_filename)
    new_path = os.path.join(pdf_directory, new_filename)
    try:
        os.rename(old_path, new_path)
        print(f"Renamed: {old_filename} -> {new_filename}")
    except OSError as e:
        # Best effort: report the failure and continue with the other files.
        print(f"Error renaming file: {e}")


Old filename: 5_6206478766378586158.pdf
New filename: SBI_Card_Account_Summary_2022.pdf
--------------------------------------------------
Renamed: 5_6206478766378586158.pdf -> SBI_Card_Account_Summary_2022.pdf
