In [None]:
import os
import hashlib

def hash_file(filepath):
    """Return the hex-encoded SHA-256 digest of a file's bytes.

    The file is opened in binary mode and consumed in 4096-byte reads, so the
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

def remove_duplicate_pdfs(directory):
    """Delete PDF files under ``directory`` whose content duplicates another PDF.

    The tree is walked recursively. Files whose name ends in ``.pdf``
    (case-insensitive) are compared by the digest returned by ``hash_file``;
    the first file seen with a given digest is kept and every later file with
    the same digest is deleted.

    Traversal order is fixed: sub-directories are visited alphabetically and,
    within a folder, shorter file names come first (ties broken
    alphabetically). Repeated runs therefore keep the same copy, and
    ``report.pdf`` is preferred over ``report (1).pdf`` in the same folder.

    A file that cannot be read or deleted is reported and skipped instead of
    aborting the run part-way through.

    Args:
        directory: Root directory to scan.

    Returns:
        List of the paths that were deleted, in deletion order (empty when
        no duplicates were removed).
    """
    seen_hashes = {}
    removed_files = []

    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary order; sorting ``dirs`` in place
        # fixes the order in which sub-directories are visited.
        dirs.sort()
        pdf_names = sorted(
            (name for name in files if name.lower().endswith('.pdf')),
            key=lambda name: (len(name), name),
        )

        for name in pdf_names:
            filepath = os.path.join(root, name)
            try:
                file_hash = hash_file(filepath)
            except OSError as err:
                # e.g. broken symlink or permission problem: leave it alone.
                print(f"Skipped unreadable file: {filepath} ({err})")
                continue

            if file_hash not in seen_hashes:
                # First file with this content: remember it and keep it.
                seen_hashes[file_hash] = filepath
                continue

            try:
                os.remove(filepath)
            except OSError as err:
                print(f"Could not remove duplicate: {filepath} ({err})")
                continue
            removed_files.append(filepath)
            print(f"Removed duplicate: {filepath}")

    if not removed_files:
        print("No duplicates found.")
    else:
        print(f"Removed {len(removed_files)} duplicate files.")

    return removed_files

# Usage: remove duplicate PDFs from the syllabi folder.
# The guard keeps this destructive cleanup from running as a side effect of
# importing the file; it still runs when the file is executed directly.
if __name__ == "__main__":
    directory = "all_syllabi/"
    remove_duplicate_pdfs(directory)


Removed duplicate: all_syllabi/APCOMP 209B_COMPSCI 109B_STAT 109B_STAT 121B - 2024 Spring (203547).pdf
Removed duplicate: all_syllabi/PHYSICS 201 - 2023 Spring (161201) (1).pdf
Removed duplicate: all_syllabi/APCOMP 209A_COMPSCI 1090A_COMPSCI 109A_STAT 109A_STAT 121A - 2024 Fall (203101) (1).pdf
Removed duplicate: all_syllabi/APCOMP 295 - 2020 Fall (215121).pdf
Removed duplicate: all_syllabi/APCOMP 227_APMTH 227 - 2019 Fall (207546) (1).pdf
