In [1]:
# Install the third-party packages this notebook imports (PyPDF2, tabulate, tqdm)
# from the conda-forge channel; -y answers the confirmation prompt automatically.
%micromamba install -c conda-forge PyPDF2 tabulate tqdm -y

[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0G[+] 0.1s
conda-forge/osx-arm64 [33m━━━━━━━━╸[0m[90m━━━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.1s
conda-forge/noarch    [33m━━━━━━━━╸[0m[90m━━━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.1s[2K[1A[2K[1A[2K[0G[+] 0.2s
conda-forge/osx-arm64 [90m━━━━━━━━━━━━━━━━━━━━━━[0m  42.7kB /  14.1MB @ 259.3kB/s  0.2s
conda-forge/noarch    [33m━━━━━━━━━━╸[0m[90m━━━━━━━━━━━[0m   0.0 B /  ??.?MB @  ??.?MB/s  0.2s[2K[1A[2K[1A[2K[0G[+] 0.3s
conda-forge/osx-arm64 [90m━━━━━━━━━━━━━━━━━━━━━━[0m 359.4kB /  14.1MB @   1.3MB/s  0.3s
conda-forge/noarch    ━━╸[90m━━━━━━━━━━━━━━━━━━━[0m   3.3MB /  18.9MB @  11.7MB/s  0.3s[2K[1A[2K[1A[2K[0G[+] 0.4s
conda-forge/osx-arm64 ╸[90m━━━━━━━━━━━━━━━━━━━━━[0m   1.4MB /  14.1MB @   3.5MB/s  0.4s
conda-forge/noarch    ━━━━━━━━╸[90m━━━━━━━━━━━━━[0m   8.1MB /  18.9MB @  20.6MB/s  0.4s[2K[1A[2K[1A[2K[0G[+] 0.5s
conda-forge/osx-arm64 ━╸[90m━━━━━━━━━━━━━━━━━━━━[0m   1.9MB /  14.1MB @   4

In [5]:
import PyPDF2
import os 
import csv
import tabulate
from tqdm import tqdm

In [20]:
# Run settings for this notebook. Edit here, then re-run the cells below.
config = {
    'name': 'HW1',           # passed as the base file name to merge/unmerge
    'section': 1,            # picks the section_<n> folder under base_folder
    'suffix': 'graded',      # passed to unmerge_submissions as its suffix
    'operation': 'UNMERGE',  # the last cell checks for 'MERGE' or 'UNMERGE'
    'folders': [             # scanned by get_submission_list when merging
        '/Users/winnaries/Downloads/attachments-4',
        '/Users/winnaries/Downloads/attachments-5',
    ],
}

# Folder that the merge/unmerge helpers read from and write to.
base_folder = '/Users/winnaries/personal/probstat_spring25/submission'
submission_path = os.path.join(base_folder, 'section_{}'.format(config['section']))

In [21]:
def get_latest_pdf_path(folder):
    """Return the path of the most recent PDF in ``folder``, or ``None``.

    Recency is taken from the file name, not from filesystem metadata: the
    text between the last ``-`` and the next ``.`` is parsed as an integer and
    the file with the largest value wins.

    A PDF whose name carries no such integer ranks below every stamped PDF, so
    it is only returned when the folder holds no stamped PDF at all (instead
    of aborting the scan with ``ValueError``). The ``.pdf`` extension is
    matched case-insensitively.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        ``folder`` joined with the chosen file name, or ``None`` when the
        folder contains no PDF.
    """
    latest_file = None
    latest_stamp = None
    # os.listdir returns entries in arbitrary order; sorting makes ties
    # (equal stamps, or several unstamped files) resolve deterministically.
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith('.pdf'):
            continue
        raw_stamp = file.split('-')[-1].split('.')[0]
        try:
            stamp = int(raw_stamp)
        except ValueError:
            # No numeric suffix: keep the file as a last-resort candidate.
            stamp = -1
        # `None` sentinel (rather than 0) so a file stamped 0 is still eligible.
        if latest_stamp is None or stamp > latest_stamp:
            latest_stamp = stamp
            latest_file = os.path.join(folder, file)
    return latest_file

def get_submission_list(folders):
    """Collect one ``(student_id, pdf_path)`` pair per student directory.

    Every sub-directory found directly inside each of ``folders`` is treated
    as one student; its directory name is used as the student id and
    ``get_latest_pdf_path`` picks the PDF. Plain files at that level are
    ignored, as are directories for which no PDF path is returned.

    Args:
        folders: Iterable of directory paths to scan.

    Returns:
        List of ``(student_id, pdf_path)`` tuples ordered by student id.
    """
    def _student_dirs():
        # Walk the folders in the given order, yielding only directories.
        for root in folders:
            for entry in os.listdir(root):
                entry_path = os.path.join(root, entry)
                if os.path.isdir(entry_path):
                    yield entry, entry_path

    found = []
    for sid, sid_dir in _student_dirs():
        chosen_pdf = get_latest_pdf_path(sid_dir)
        if chosen_pdf:
            found.append((sid, chosen_pdf))

    found.sort(key=lambda pair: pair[0])
    return found

def merge_submissions(submissions, filename='HW1'):
    """Merge all submissions into one bookmarked PDF and write a page-map CSV.

    Each submission is appended to the merged document and gets an outline
    entry titled with the student id. A row ``(index, student_id, start_page,
    page_length)`` is recorded per submission, where ``start_page`` counts
    from 0 in the merged document. Files that PyPDF2 reports as empty are
    kept in the page map with a page length of 0 and contribute no pages.

    Outputs are written under the module-level ``submission_path`` (created
    if missing): ``<filename>.pdf`` and ``<filename>-log.csv``.

    Args:
        submissions: Sized sequence of ``(student_id, pdf_path)`` pairs, in
            the order they should appear in the merged PDF.
        filename: Base name (without extension) for both output files.

    Returns:
        The page map rendered by ``tabulate`` as a ``psql``-style table string.
    """
    headers = ['Index', 'Student ID', 'Start Page', 'Page Length']
    merged_path = os.path.join(submission_path, f"{filename}.pdf")
    log_path = os.path.join(submission_path, f"{filename}-log.csv")

    merger = PyPDF2.PdfMerger()
    page_map = []  # (idx, student_id, start_page, page_len)
    tot_pages = 0

    # try/finally so the merger is released even if a PDF fails to merge or
    # the output cannot be written.
    try:
        for student_id, pdf_path in tqdm(submissions, total=len(submissions)):
            try:
                reader = PyPDF2.PdfReader(pdf_path)
                num_page = len(reader.pages)
                merger.append(pdf_path)
                merger.add_outline_item(title=student_id, page_number=tot_pages)
                page_map.append((len(page_map), student_id, tot_pages, num_page))
                tot_pages += num_page
            except PyPDF2.errors.EmptyFileError:
                # Keep the student in the log so the roster stays complete.
                page_map.append((len(page_map), student_id, tot_pages, 0))

        os.makedirs(submission_path, exist_ok=True)

        print("Saving merged PDF to", merged_path)

        merger.write(merged_path)
    finally:
        merger.close()

    print("Saving page map to", log_path)

    # newline='' is what the csv module expects; without it, platforms that
    # translate line endings write a blank row after every record.
    with open(log_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(page_map)

    return tabulate.tabulate(page_map, headers=headers, tablefmt='psql')

def unmerge_submissions(filename='HW1', suffix='graded'):
    """Split a merged PDF back into one PDF per student using the page map.

    Reads ``<filename>-log.csv`` from the module-level ``submission_path``
    (columns: index, student id, start page, page length), loads
    ``<filename>-<suffix>.pdf`` from the same folder (or ``<filename>.pdf``
    when ``suffix`` is empty), and writes each student's page range to
    ``<filename>-<suffix>/<filename>-<student_id>.pdf``.

    A student whose page length is 0 still gets an output file; it simply has
    no pages added to it.

    Args:
        filename: Base name shared by the log CSV and the merged PDF.
        suffix: Optional tag appended (after a ``-``) to the merged PDF name
            and to the output folder name.

    Returns:
        None. The per-student PDFs are written to disk.
    """
    log_path = os.path.join(submission_path, f"{filename}-log.csv")

    page_map = []
    # newline='' lets the csv module handle line endings itself.
    with open(log_path, 'r', newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row
        for row in rows:
            if not row:
                # Blank line (e.g. a log written without newline='' on
                # Windows); there is nothing to parse.
                continue
            page_map.append((int(row[0]), row[1], int(row[2]), int(row[3])))

    _suffix = f"-{suffix}" if suffix else ""
    merged_path = os.path.join(submission_path, f"{filename}{_suffix}.pdf")
    print("Loading merged PDF from", merged_path)
    merged_reader = PyPDF2.PdfReader(merged_path)

    # The output folder does not depend on the student, so create it once.
    save_folder = os.path.join(submission_path, f"{filename}{_suffix}")
    os.makedirs(save_folder, exist_ok=True)

    print("Unmerging PDFs")
    for _idx, student_id, start_page, page_len in tqdm(page_map):
        writer = PyPDF2.PdfWriter()
        for i in range(start_page, start_page + page_len):
            writer.add_page(merged_reader.pages[i])
        with open(os.path.join(save_folder, f"{filename}-{student_id}.pdf"), 'wb') as f:
            writer.write(f)

In [22]:
# Run the step selected by config['operation'].
if config['operation'] == 'MERGE':
    submission_list = get_submission_list(config['folders'])
    print(merge_submissions(submission_list, config['name']))
elif config['operation'] == 'UNMERGE':
    unmerge_submissions(config['name'], config['suffix'])
else:
    # Fail loudly: a typo such as 'merge' would otherwise make this cell a
    # silent no-op.
    raise ValueError(
        f"Unknown operation {config['operation']!r}; expected 'MERGE' or 'UNMERGE'"
    )

Loading merged PDF from /Users/winnaries/personal/probstat_spring25/submission/section_1/HW1-graded.pdf
Unmerging PDFs


100%|██████████| 105/105 [00:03<00:00, 26.42it/s]
