In [11]:
%micromamba install -c conda-forge PyPDF2 tabulate tqdm -y

conda-forge/osx-arm64                                       Using cache
conda-forge/noarch                                          Using cache

Pinned packages:
  - python 3.12.*


Transaction

  Prefix: /Users/winnaries/micromamba/envs/probstat_spring25

  All requested packages already installed


Transaction starting
[?25l[2K[0G[?25h
Transaction finished

To activate this environment, use:

    micromamba activate probstat_spring25

Or to execute a single command in this environment, use:

    micromamba run -n probstat_spring25 mycommand


Note: you may need to restart the kernel to use updated packages.


In [8]:
import PyPDF2
import os 
import csv
import tabulate
from tqdm import tqdm

In [23]:
# Working directory shared by merge_submissions / unmerge_submissions: the merged
# PDF, its "<name>-log.csv" page map and the per-student output folders live here.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
submission_path = '/Users/winnaries/personal/probstat_spring25/submission'

def get_latest_pdf_path(folder):
    """Return the path of the most recent PDF in ``folder``, or ``None``.

    A file's age is read from its name: the text after the last ``-`` and
    before the following ``.`` is parsed as an integer, and the file with the
    largest value wins (e.g. ``hw-250.pdf`` beats ``hw-100.pdf``).

    PDFs whose name carries no parsable integer are ranked below every
    stamped file instead of raising ``ValueError``, so they are only returned
    when the folder contains no stamped PDF at all.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        ``os.path.join(folder, name)`` of the selected PDF, or ``None`` when
        the folder contains no PDF.
    """
    latest_file = None
    latest_time = None
    # Sorted so that ties are resolved the same way on every run/file system.
    for file in sorted(os.listdir(folder)):
        # Accept ".PDF" as well as ".pdf".
        if not file.lower().endswith('.pdf'):
            continue
        stamp = file.split('-')[-1].split('.')[0]
        try:
            time = int(stamp)
        except ValueError:
            # No numeric suffix: keep the file as a last-resort candidate.
            time = -1
        # `None` sentinel (rather than 0) lets a file stamped 0 be selected.
        if latest_time is None or time > latest_time:
            latest_time = time
            latest_file = os.path.join(folder, file)
    return latest_file

def get_submission_list(folders):
    """Gather the newest PDF of every student directory found in ``folders``.

    Every immediate sub-directory of each folder is treated as one student,
    and the directory name is used as the student ID. Plain files are skipped,
    as are student directories for which ``get_latest_pdf_path`` finds nothing.

    Args:
        folders: Iterable of directory paths to scan.

    Returns:
        List of ``(student_id, pdf_path)`` tuples ordered by student ID.
    """
    collected = []
    for root in folders:
        for entry in os.listdir(root):
            student_dir = os.path.join(root, entry)
            if os.path.isdir(student_dir):
                newest = get_latest_pdf_path(student_dir)
                if newest:
                    collected.append((entry, newest))
    # Stable sort on the ID only, so equal IDs keep their discovery order.
    collected.sort(key=lambda pair: pair[0])
    return collected

def merge_submissions(submissions, filename='HW1'):
    """Merge all submissions into one PDF and record where each one starts.

    Writes two files into ``submission_path``:

    * ``<filename>.pdf`` — every submission appended in the given order, with
      one outline (bookmark) entry per student pointing at their first page.
    * ``<filename>-log.csv`` — the page map with columns
      ``Index, Student ID, Start Page, Page Length`` (start pages are 0-based).

    Submissions that PyPDF2 reports as empty files are not merged but are
    still logged, with a page length of 0.

    Args:
        submissions: Sequence of ``(student_id, pdf_path)`` tuples.
        filename: Base name (without extension) of the output files.

    Returns:
        The page map rendered as a ``psql``-style text table.
    """
    merged_path = os.path.join(submission_path, f"{filename}.pdf")
    log_path = os.path.join(submission_path, f"{filename}-log.csv")
    headers = ['Index', 'Student ID', 'Start Page', 'Page Length']

    merger = PyPDF2.PdfMerger()
    page_map = [] # (idx, student_id, page_num, page_len)
    tot_pages = 0

    # try/finally so the merger's resources are released even if a PDF fails
    # with an error other than EmptyFileError.
    try:
        for student_id, pdf_path in tqdm(submissions, total=len(submissions)):
            try:
                reader = PyPDF2.PdfReader(pdf_path)
                num_page = len(reader.pages)
                merger.append(pdf_path)
                merger.add_outline_item(title=student_id, page_number=tot_pages)
            except PyPDF2.errors.EmptyFileError:
                # Keep the student in the log so the indices stay complete.
                num_page = 0
            page_map.append((len(page_map), student_id, tot_pages, num_page))
            tot_pages += num_page

        print("Saving merged PDF to", merged_path)

        merger.write(merged_path)
    finally:
        merger.close()

    print("Saving page map to", log_path)

    # newline='' is required by the csv module; without it, platforms that
    # translate line endings end up with a blank line after every row.
    with open(log_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(page_map)

    return tabulate.tabulate(page_map, headers=headers, tablefmt='psql')

def unmerge_submissions(filename='HW1', suffix='graded'):
    """Split a merged PDF back into one PDF per student.

    Reads the page map from ``<filename>-log.csv`` and the merged document
    from ``<filename>-<suffix>.pdf`` (or ``<filename>.pdf`` when ``suffix`` is
    empty), both inside ``submission_path``. For every row of the page map the
    corresponding page range is written to
    ``<filename>-<suffix>/<filename>-<student_id>.pdf``.

    Args:
        filename: Base name used when the submissions were merged.
        suffix: Tag appended to the merged file name and the output folder;
            pass an empty value to use the names without a tag.

    Returns:
        None. The per-student PDFs are written to disk.
    """
    log_path = os.path.join(submission_path, f"{filename}-log.csv")
    page_map = []
    # newline='' is what the csv module expects for files it parses itself.
    with open(log_path, 'r', newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row
        for row in rows:
            # Tolerate blank lines (e.g. a log written without newline='' on
            # a platform that translates line endings).
            if not row:
                continue
            page_map.append((int(row[0]), row[1], int(row[2]), int(row[3])))

    _suffix = f"-{suffix}" if suffix else ""
    merged_path = os.path.join(submission_path, f"{filename}{_suffix}.pdf")
    print("Loading merged PDF from", merged_path)
    reader = PyPDF2.PdfReader(merged_path)

    print("Unmerging PDFs")
    # Create the output folder once, up front; exist_ok avoids the
    # check-then-create race of a separate exists()/mkdir() pair.
    save_folder = os.path.join(submission_path, f"{filename}{_suffix}")
    os.makedirs(save_folder, exist_ok=True)

    for _idx, student_id, start_page, page_len in tqdm(page_map):
        writer = PyPDF2.PdfWriter()
        for i in range(start_page, start_page+page_len):
            writer.add_page(reader.pages[i])
        with open(os.path.join(save_folder, f"{filename}-{student_id}.pdf"), 'wb') as f:
            writer.write(f)

In [10]:
# Folders to scan; get_submission_list treats every sub-directory inside
# them as one student's submission folder.
folders = [
    '/Users/winnaries/Downloads/attachments', 
    '/Users/winnaries/Downloads/attachments-2',
    '/Users/winnaries/Downloads/attachments-3'
]

# Pick the newest PDF per student, merge them into HW1.pdf (plus HW1-log.csv)
# under submission_path, and print the resulting page map.
submission_list = get_submission_list(folders)
print(merge_submissions(submission_list, 'HW1'))

100%|██████████| 105/105 [00:00<00:00, 310.66it/s]


Saving merged PDF to /Users/winnaries/personal/probstat_spring25/submission/HW1.pdf
Saving page map to /Users/winnaries/personal/probstat_spring25/submission/HW1-log.csv
+---------+--------------+--------------+---------------+
|   Index |   Student ID |   Start Page |   Page Length |
|---------+--------------+--------------+---------------|
|       0 |   6438229421 |            0 |             2 |
|       1 |   6638014021 |            2 |             4 |
|       2 |   6638058621 |            6 |             2 |
|       3 |   6638070021 |            8 |             2 |
|       4 |   6638076921 |           10 |             2 |
|       5 |   6638079821 |           12 |             3 |
|       6 |   6638080321 |           15 |             2 |
|       7 |   6638105421 |           17 |             2 |
|       8 |   6638113421 |           19 |             2 |
|       9 |   6638199421 |           21 |             4 |
|      10 |   6638248521 |           25 |             3 |
|      11 |   6738

In [24]:
# Split HW1-test.pdf back into one PDF per student (written to the HW1-test
# folder), using the page map stored in HW1-log.csv.
unmerge_submissions('HW1', 'test')

Loading merged PDF from /Users/winnaries/personal/probstat_spring25/submission/HW1-test.pdf
Unmerging PDFs


100%|██████████| 105/105 [00:31<00:00,  3.38it/s]
