In [22]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

In [None]:
# --- Configuration -----------------------------------------------------------
# Every input/output location lives under one project folder. Set the
# RA_LLM_BASE_DIR environment variable to run the notebook on another machine
# without editing this cell; the default is the original author's local folder.
BASE_DIR = os.environ.get(
    "RA_LLM_BASE_DIR",
    "/Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder",
)

# CSV that is expected to provide 'title', 'start' and 'end' columns
# (one row per chapter).
CSV_FILE_PATH = os.path.join(BASE_DIR, "chapter_ranges_physics.csv")
# Folder holding one markdown file per page.
PAGES_DIR = os.path.join(BASE_DIR, "physics_pages")
# Folder that receives one combined markdown file per chapter.
OUTPUT_DIR = os.path.join(BASE_DIR, "physics_chapters_v2")

# str.format templates: page number zero-padded to 4 digits, chapter number to 2.
PAGE_FILENAME_PATTERN = "page_{:04d}.md"
CHAPTER_FILENAME_PATTERN = "chapter_{:02d}.md"

In [24]:
# Read the chapter table (one row per chapter) from the configured CSV.
df = pd.read_csv(CSV_FILE_PATH)

# Quick confirmation and preview of what was loaded.
print(
    "CSV loaded successfully!",
    f"\nNumber of chapters: {len(df)}",
    "\nFirst 5 rows:",
    df.head(),
    sep="\n",
)

# The rest of the notebook reads these columns by name, so stop here if any
# of them is absent (reported in the order listed below).
required_columns = ['title', 'start', 'end']
missing_columns = list(filter(lambda name: name not in df.columns, required_columns))
if len(missing_columns) > 0:
    raise ValueError(f"Missing required columns: {missing_columns}")

CSV loaded successfully!

Number of chapters: 34

First 5 rows:
                                               title  start  end
0  Chapter 01 - Introduction: The Nature of Scien...     23   54
1                            Chapter 02 - Kinematics     55  116
2            Chapter 03 - Two-Dimensional Kinematics    117  164
3  Chapter 04 - Dynamics: Force and Newton's Laws...    165  214
4  Chapter 05 - Further Applications of Newton's ...    215  244


In [None]:
# Create the destination folder (and any missing parents); a folder that
# already exists is left untouched.
_output_root = Path(OUTPUT_DIR)
_output_root.mkdir(exist_ok=True, parents=True)
print(f"Output directory created/verified: {OUTPUT_DIR}")

Output directory created/verified: /Users/tanmayagarwal/Desktop/My_Computer/Columbia/Fall_2025/RA_LLM/main_folder/physics_chapters_v2


In [26]:
def combine_pages_to_chapter(chapter_index, title, start_page, end_page, pages_dir, output_dir,
                             page_pattern=None, chapter_pattern=None):
    """Concatenate a range of per-page markdown files into one chapter file.

    The output starts with a ``# {title}`` heading followed by the content of
    every readable, non-blank page in ``start_page..end_page`` (inclusive), in
    page order. Missing or unreadable pages are reported with a printed
    warning and skipped. No file is written when no page contributed content.

    Args:
        chapter_index: Chapter number used to build the output filename.
        title: Chapter title written as the top-level heading.
        start_page: First page number of the chapter (inclusive).
        end_page: Last page number of the chapter (inclusive).
        pages_dir: Directory containing the per-page markdown files.
        output_dir: Directory that receives the chapter file; created if it
            does not exist yet.
        page_pattern: ``str.format`` template for page filenames. Defaults to
            the notebook-level ``PAGE_FILENAME_PATTERN``.
        chapter_pattern: ``str.format`` template for the chapter filename.
            Defaults to the notebook-level ``CHAPTER_FILENAME_PATTERN``.

    Returns:
        int: Number of pages whose content was included in the chapter.
    """
    # Resolve the defaults at call time so edits to the config cell are honoured.
    if page_pattern is None:
        page_pattern = PAGE_FILENAME_PATTERN
    if chapter_pattern is None:
        chapter_pattern = CHAPTER_FILENAME_PATTERN

    chapter_content = [f"# {title}\n"]
    pages_combined = 0

    for page_num in range(start_page, end_page + 1):
        page_filename = page_pattern.format(page_num)
        page_path = os.path.join(pages_dir, page_filename)

        # Open directly instead of checking existence first: avoids the
        # check-then-use race and still tells "missing" apart from "unreadable".
        try:
            with open(page_path, 'r', encoding='utf-8') as f:
                page_content = f.read()
        except FileNotFoundError:
            print(f"Warning: Page file not found: {page_filename}")
            continue
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Could not read {page_filename}: {e}")
            continue

        # Whitespace-only pages add nothing and are not counted.
        if not page_content.strip():
            continue

        # Terminate every page with a newline so the last line of one page is
        # never fused with the first line (e.g. a heading) of the next one.
        if not page_content.endswith("\n"):
            page_content += "\n"

        chapter_content.append(page_content)
        pages_combined += 1

    if pages_combined == 0:
        print(f"Warning: No pages found for Chapter {chapter_index} ({title})")
        return pages_combined

    chapter_filename = chapter_pattern.format(chapter_index)
    output_path = os.path.join(output_dir, chapter_filename)

    # Do not rely on an earlier cell having created the output directory.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(chapter_content))

    print(f"Created {chapter_filename}: {pages_combined} pages combined")
    return pages_combined

In [None]:

# Running totals reported in the summary at the end of this cell.
total_pages = 0
total_chapters = 0

# Number chapters by their 1-based position in the CSV rather than by the
# DataFrame index label, so the numbering stays correct even if `df` has been
# filtered, sorted, or loaded with a non-default index.
for chapter_num, (_, row) in tqdm(
    enumerate(df.iterrows(), start=1),
    total=len(df),
    desc="Processing chapters",
):
    title = row['title']
    start = int(row['start'])
    end = int(row['end'])

    pages_count = combine_pages_to_chapter(
        chapter_num, title, start, end, PAGES_DIR, OUTPUT_DIR
    )

    total_pages += pages_count
    # Only chapters that actually received page content count as processed.
    if pages_count > 0:
        total_chapters += 1

print(f"\n{'='*50}")
print("SUMMARY")
print(f"{'='*50}")
print(f"Total chapters processed: {total_chapters}")
print(f"Total pages combined: {total_pages}")

Processing chapters:  38%|███▊      | 13/34 [00:00<00:00, 129.74it/s]

Created chapter_01.md: 31 pages combined
Created chapter_02.md: 62 pages combined
Created chapter_03.md: 48 pages combined
Created chapter_04.md: 49 pages combined
Created chapter_05.md: 30 pages combined
Created chapter_06.md: 41 pages combined
Created chapter_07.md: 53 pages combined
Created chapter_08.md: 34 pages combined
Created chapter_09.md: 34 pages combined
Created chapter_10.md: 48 pages combined
Created chapter_11.md: 50 pages combined
Created chapter_12.md: 39 pages combined
Created chapter_13.md: 48 pages combined
Created chapter_14.md: 45 pages combined
Created chapter_15.md: 54 pages combined
Created chapter_16.md: 48 pages combined
Created chapter_17.md: 48 pages combined
Created chapter_18.md: 43 pages combined
Created chapter_19.md: 39 pages combined
Created chapter_20.md: 46 pages combined
Created chapter_21.md: 49 pages combined
Created chapter_22.md: 48 pages combined
Created chapter_23.md: 59 pages combined
Created chapter_24.md: 34 pages combined
Created chapter_

Processing chapters: 100%|██████████| 34/34 [00:00<00:00, 172.87it/s]

Created chapter_33.md: 35 pages combined
Created chapter_34.md: 34 pages combined

SUMMARY
Total chapters processed: 34
Total pages combined: 1513



