In [1]:
import os
import pandas as pd
import glob
from PIL import Image

# Load the dataframe, parse the 'date' column as datetimes, and order the rows
# from the earliest date to the latest.
data = (
    pd.read_csv('df_6.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
)

# Directories holding the PDFs and their associated images
pdf_directory = r"C:\Users\clint\Desktop\Scraping Task\pdfs"
image_directory = r"C:\Users\clint\Desktop\Scraping Task\pdfs\Images"

# Function to open a PDF file
def open_pdf(pdf_path):
    """Open a PDF with the application associated with it.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        True if the path exists and was handed to ``os.startfile``;
        False if the path does not exist (a message is printed either way).
    """
    # Guard clause: nothing to launch when the path is missing.
    if not os.path.exists(pdf_path):
        print(f"File not found: {pdf_path}")
        return False

    os.startfile(pdf_path)
    print(f"Opening: {pdf_path}")
    return True

# Function to open an image file using PIL
def open_image(image_path):
    """Display an image through PIL's ``Image.show()``.

    Args:
        image_path: Path of the image file to display.

    Returns:
        True if the image was opened and shown; False if the path does not
        exist or the file could not be opened/shown as an image (a message is
        printed in every case).
    """
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return False

    print(f"Opening: {image_path}")
    try:
        # Use the image as a context manager so its file handle is released as
        # soon as the viewer has been launched. A handle that stays open can
        # prevent the file from being deleted afterwards (notably on Windows).
        with Image.open(image_path) as img:
            img.show()
    except OSError as e:
        # Unreadable or corrupt image files surface as OSError subclasses;
        # report the problem and let the caller move on instead of aborting.
        print(f"Error opening image: {e}")
        return False
    return True

# Debug: list the first few entries of the image directory so the file-naming
# pattern can be checked by eye, then report how many entries were not shown.
print("Files in image directory:")
all_images = os.listdir(image_directory)
for position, image_name in enumerate(all_images[:5], start=1):
    print(f"  {position}. {image_name}")
remaining = len(all_images) - 5
if remaining > 0:
    print(f"  ...and {remaining} more files")
print("\n")

# Iterate through each row in the dataframe (sorted by date). For every row the
# PDF is opened, then each associated image is shown and the user decides
# whether to keep it, delete it, or skip the remaining images of that PDF.
for index, row in data.iterrows():
    # Get the PDF filename from the pdf_filename column
    pdf_filename = row['pdf_filename']
    pdf_basename = os.path.splitext(pdf_filename)[0]

    # Reset the per-PDF state on every iteration, before anything can fail.
    # These used to be assigned only after the PDF opened successfully, so a
    # missing PDF left the skip flag undefined (NameError on the first row) or
    # carried over from the previous PDF.
    delete_count = 0
    skip_to_next_pdf = False

    # Display date for reference
    print(f"\nProcessing: {pdf_basename} - Date: {row['date'].strftime('%Y-%m-%d')}")

    # Build the full path to the PDF
    pdf_path = os.path.join(pdf_directory, pdf_filename)

    # Open the PDF
    if open_pdf(pdf_path):
        input(f"Opened {pdf_filename}. Press Enter to continue to associated images...")

        # Try multiple patterns to find associated images
        # Pattern 1: Exact prefix match (e.g., "01_2018_*.png")
        image_pattern = os.path.join(image_directory, f"{pdf_basename}_*.png")
        matching_images = glob.glob(image_pattern)

        # Pattern 2: Try with dashes instead of underscores in case of naming inconsistency
        if not matching_images:
            alt_basename = pdf_basename.replace('_', '-')
            image_pattern = os.path.join(image_directory, f"{alt_basename}*.png")
            matching_images.extend(glob.glob(image_pattern))

        # Pattern 3: Try more lenient matching (any file containing the basename)
        if not matching_images:
            # Get all image files
            all_image_files = glob.glob(os.path.join(image_directory, "*.png"))
            # Filter those containing the basename (case-insensitive)
            matching_images = [img for img in all_image_files if pdf_basename.lower() in os.path.basename(img).lower()]

        # Debug info (image_pattern is the last glob pattern that was tried)
        print(f"Using pattern: {image_pattern}")
        print(f"Found {len(matching_images)} matching images")

        if matching_images:
            print(f"Found {len(matching_images)} images associated with {pdf_filename}")

            # Process each associated image
            for img_path in matching_images:
                if skip_to_next_pdf:
                    break

                img_filename = os.path.basename(img_path)

                # Open the image
                if open_image(img_path):
                    # Ask whether to keep or delete; re-prompt until the answer is valid
                    while True:
                        # strip() so stray whitespace around the answer is not
                        # rejected as invalid input
                        decision = input(f"Image: {img_filename} - Keep, Delete, or Skip to next PDF? (k/d/s): ").strip().lower()

                        if decision in ('d', 'delete'):
                            try:
                                os.remove(img_path)
                                delete_count += 1
                                print(f"Deleted: {img_filename}")
                                break
                            except Exception as e:
                                # Deletion failed: report it and ask again
                                print(f"Error deleting image: {e}")
                                continue
                        elif decision in ('k', 'keep'):
                            print(f"Keeping: {img_filename}")
                            break
                        elif decision in ('s', 'skip'):
                            print("Skipping to next PDF...")
                            skip_to_next_pdf = True
                            break
                        else:
                            print("Invalid input. Please enter 'k' for keep, 'd' for delete, or 's' to skip to next PDF.")

            # Update the note column in the dataframe
            if delete_count > 0:
                addition = f"Manually deleted {delete_count} images"

                if 'note' in data.columns:
                    # A 'note' column with no text in it is loaded as a numeric
                    # (NaN) column; cast to object so a string can be stored
                    # without relying on silent upcasting, which newer pandas
                    # versions warn about or reject.
                    data['note'] = data['note'].astype(object)
                    previous_note = data.at[index, 'note']
                else:
                    # No 'note' column yet: the .loc assignment below creates it.
                    previous_note = None

                # Create or update the note
                if pd.isna(previous_note):
                    data.loc[index, 'note'] = addition
                else:
                    data.loc[index, 'note'] = f"{previous_note}; {addition}"

                print(f"Updated note for {pdf_filename}: {data.loc[index, 'note']}")
        else:
            print(f"No images found for {pdf_filename}")

    # Wait for user confirmation before proceeding to next PDF, unless we're skipping
    if not skip_to_next_pdf:
        input(f"Finished processing {pdf_filename}. Press Enter to continue to next PDF...")

# Save the updated dataframe
data.to_csv('df_6_updated.csv', index=False)
print("Process complete. Updated DataFrame saved to 'df_6_updated.csv'")

Files in image directory:
  1. 01_2018_plot_1_Average_Auction_Hammer_Price_3-_to_6-Year-Old_Benc - Copy.png
  2. 01_2018_plot_1_Average_Auction_Hammer_Price_3-_to_6-Year-Old_Benc.png
  3. 01_2018_plot_2_Average_Retail_Selling_Price_3-_to_5-Year-Old_Aero.png
  4. 01_2018_plot_3__Avg_Retail_Selling_Price_4-_to_6-Year-Old_Aerodyn.png
  5. 01_2019_plot_1_DARK_BG.jpeg
  ...and 329 more files



Processing: 01_2018 - Date: 2018-01-01
Opening: C:\Users\clint\Desktop\Scraping Task\pdfs\01_2018.pdf
Using pattern: C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2018_*.png
Found 4 matching images
Found 4 images associated with 01_2018.pdf
Opening: C:\Users\clint\Desktop\Scraping Task\pdfs\Images\01_2018_plot_1_Average_Auction_Hammer_Price_3-_to_6-Year-Old_Benc - Copy.png
Invalid input. Please enter 'k' for keep, 'd' for delete, or 's' to skip to next PDF.
Invalid input. Please enter 'k' for keep, 'd' for delete, or 's' to skip to next PDF.


: 