In [31]:
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTAnno, LTTextLine

def _iter_page_glyphs(page_layout):
    """Yield the character-level objects of one pdfminer page layout, in order.

    Only ``LTTextContainer`` elements are inspected. A direct child that is an
    ``LTChar`` is yielded as-is; a direct child that is an ``LTTextLine``
    contributes every one of its children (which may include objects that are
    not ``LTChar``). Any other child is ignored.
    """
    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        for item in element:
            if isinstance(item, LTChar):
                yield item
            elif isinstance(item, LTTextLine):
                yield from item


def split_pdf_by_font_size(input_pdf, target_font_size, tolerance=0.0, skip_after_start=50):
    """Split a PDF into page ranges delimited by characters of a given font size.

    Pages are scanned with pdfminer. A character whose size matches
    ``target_font_size`` opens a chunk when none is open, and closes the open
    chunk otherwise. Closing does not arm the skip window, so the next matching
    character (possibly on the same page) opens a new chunk. A chunk that is
    still open after the last page is closed at the final page.

    Each chunk is written to ``chunk_<start>_<end>.pdf`` in the working
    directory, where both indices are zero-based and inclusive. Progress is
    reported with ``print``.

    Args:
    - input_pdf (str): Path to the PDF; it is opened once for PyPDF2 and read
      again by ``extract_pages``.
    - target_font_size (float): Character size that marks a chunk boundary.
    - tolerance (float): Allowed absolute deviation from ``target_font_size``.
      The default of 0.0 keeps the original exact-match behaviour.
    - skip_after_start (int): Number of character-level objects ignored right
      after a chunk is opened (presumably so the rest of the same heading does
      not immediately close it).

    Returns:
    - None
    """
    chunks = []
    start_page = None
    skip_counter = 0  # Character-level objects still to ignore after a chunk start

    print(f"Processing {input_pdf} to split by font size: {target_font_size}...")

    with open(input_pdf, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        total_pages = len(reader.pages)

        for page_number, page_layout in enumerate(extract_pages(input_pdf)):
            print(f"Checking page {page_number + 1} of {total_pages}...")
            for glyph in _iter_page_glyphs(page_layout):
                # The skip window counts every object, not only LTChar ones.
                if skip_counter > 0:
                    skip_counter -= 1
                    continue
                if not isinstance(glyph, LTChar):
                    continue
                if abs(glyph.size - target_font_size) > tolerance:
                    continue

                if start_page is None:
                    start_page = page_number
                    print(f"Starting new chunk at page {page_number + 1}...")
                    skip_counter = skip_after_start
                else:
                    print(f"Ending chunk at page {page_number + 1}...")
                    chunks.append((start_page, page_number))
                    start_page = None

        # A chunk opened but never closed runs to the last page.
        if start_page is not None:
            chunks.append((start_page, total_pages - 1))

        # Create split PDFs based on chunks (the reader needs the file open).
        for index, (start, end) in enumerate(chunks):
            writer = PyPDF2.PdfWriter()
            print(f"Saving chunk {index + 1} (pages {start + 1} to {end + 1}) to 'chunk_{start}_{end}.pdf'...")
            for i in range(start, end + 1):
                writer.add_page(reader.pages[i])
            with open(f'chunk_{start}_{end}.pdf', 'wb') as output:
                writer.write(output)

    print("Process complete!")

# Source PDF and the character size whose occurrences delimit the chunks.
input_pdf_path = '2023-24-ugrad.pdf'
target_size = 22
# Writes one 'chunk_<start>_<end>.pdf' per detected chunk into the working directory.
split_pdf_by_font_size(input_pdf_path, target_size)

Processing 2023-24-ugrad.pdf to split by font size: 22...
Checking page 1 of 1296...
Checking page 2 of 1296...
Starting new chunk at page 2...
Checking page 3 of 1296...
Checking page 4 of 1296...
Checking page 5 of 1296...
Checking page 6 of 1296...
Checking page 7 of 1296...
Ending chunk at page 7...
Starting new chunk at page 7...
Checking page 8 of 1296...
Ending chunk at page 8...
Starting new chunk at page 8...
Checking page 9 of 1296...
Checking page 10 of 1296...
Checking page 11 of 1296...
Ending chunk at page 11...
Starting new chunk at page 11...
Checking page 12 of 1296...
Checking page 13 of 1296...
Checking page 14 of 1296...
Ending chunk at page 14...
Starting new chunk at page 14...
Checking page 15 of 1296...
Ending chunk at page 15...
Starting new chunk at page 15...
Checking page 16 of 1296...
Ending chunk at page 16...
Starting new chunk at page 16...
Checking page 17 of 1296...
Checking page 18 of 1296...
Ending chunk at page 18...
Starting new chunk at page 18...

In [3]:
import io

import PyPDF2

# Chunk files whose final page is dropped in place.
# NOTE: this cell is not idempotent - every run removes one more page from
# each listed file.
file_names = [
    'chunk_1_6.pdf', 'chunk_6_7.pdf', 'chunk_7_10.pdf', 'chunk_10_13.pdf',
    'chunk_13_14.pdf', 'chunk_14_15.pdf', 'chunk_15_17.pdf', 'chunk_17_19.pdf',
    'chunk_19_20.pdf', 'chunk_20_29.pdf', 'chunk_29_30.pdf', 'chunk_30_1032.pdf',
    'chunk_1032_1240.pdf', 'chunk_1240_1244.pdf', 'chunk_1244_1245.pdf',
    'chunk_1245_1251.pdf', 'chunk_1251_1252.pdf', 'chunk_1252_1254.pdf',
    'chunk_1254_1255.pdf', 'chunk_1255_1256.pdf', 'chunk_1256_1264.pdf',
    'chunk_1264_1270.pdf', 'chunk_1270_1276.pdf', 'chunk_1276_1278.pdf',
    'chunk_1278_1280.pdf', 'chunk_1280_1286.pdf', 'chunk_1286_1287.pdf',
    'chunk_1287_1290.pdf', 'chunk_1290_1291.pdf', 'chunk_1291_1295.pdf'
]

for file_name in file_names:
    # Build the trimmed document in memory while the source is still open.
    # Opening the same path with 'wb' truncates it immediately, which would
    # risk losing data the reader has not loaded yet.
    trimmed = io.BytesIO()
    with open(file_name, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()

        # Add all pages except the last one to the writer
        for page_num in range(len(reader.pages) - 1):
            writer.add_page(reader.pages[page_num])

        writer.write(trimmed)

    # The source is closed now, so it is safe to overwrite it under the same name.
    with open(file_name, 'wb') as output_file:
        output_file.write(trimmed.getvalue())

print("Last pages removed from all files successfully!")


ModuleNotFoundError: No module named 'PyPDF2'

In [209]:
def extract_and_mark_text_by_font_size(pdf_path, target_font_size, tolerance=0.5):
    """
    Extracts all instances of text with a specified font size from a PDF document
    and marks each instance with its font size.

    Within one block, consecutive matching spans are joined with single spaces
    into one instance. An instance ends at the first non-matching span or at
    the end of the block. Blocks without a "lines" entry (for example image
    blocks) are skipped.

    Args:
    - pdf_path (str): Path to the PDF document.
    - target_font_size (float): The font size to search for.
    - tolerance (float): The allowed deviation from the target font size.

    Returns:
    - List of strings of the form "(<text>)[<target_font_size>]". The label is
      always the requested target size, not the size measured on the span.
    """
    # Imported locally because this notebook only imports `fitz` in a later cell.
    import fitz

    marked_texts = []

    def emit(span_texts):
        # Joining and stripping gives the same text as appending "<text> "
        # for every span and stripping the result.
        marked_texts.append(f"({' '.join(span_texts).strip()})[{target_font_size}]")

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                matched = []
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        if abs(span['size'] - target_font_size) <= tolerance:
                            matched.append(span['text'])
                        elif matched:
                            emit(matched)
                            matched = []
                if matched:  # Capture text that runs to the end of the block
                    emit(matched)

    return marked_texts

# Usage: collect the 16.0-sized text of every chunk PDF listed below.
# NOTE(review): 'chunk_6_7.pdf', 'chunk_30_1032.pdf' and 'chunk_1032_1240.pdf'
# appear in the earlier file list but not here - confirm that is intentional.
pdf_paths = ["chunk_1_6.pdf", "chunk_7_10.pdf", "chunk_10_13.pdf", "chunk_13_14.pdf", "chunk_14_15.pdf", "chunk_15_17.pdf", "chunk_17_19.pdf", "chunk_19_20.pdf",
             "chunk_20_29.pdf", "chunk_29_30.pdf", "chunk_1240_1244.pdf", "chunk_1244_1245.pdf", "chunk_1245_1251.pdf", "chunk_1251_1252.pdf", "chunk_1252_1254.pdf", "chunk_1254_1255.pdf", 
             "chunk_1255_1256.pdf", "chunk_1256_1264.pdf", "chunk_1264_1270.pdf", "chunk_1270_1276.pdf", "chunk_1276_1278.pdf", "chunk_1278_1280.pdf", "chunk_1280_1286.pdf", "chunk_1286_1287.pdf", "chunk_1287_1290.pdf",
             "chunk_1290_1291.pdf", "chunk_1291_1295.pdf"]
# Font size searched for in these files.
target_size = 16.0
# Results are keyed "texts_<file name>", with the ".pdf" extension included.
variables_titles = {}
for pdf_path in pdf_paths:
    variables_titles[f'texts_{pdf_path}'] = extract_and_mark_text_by_font_size(pdf_path, target_size)

# pdf_path = "chunk_30_1032.pdf"
# target_size = 18.0
# results_23rtgf = extract_and_mark_text_by_font_size(pdf_path, target_size)
# results_23rtgf


In [210]:
# Display the marked text collected for every chunk file.
variables_titles

{'texts_chunk_1_6.pdf': [],
 'texts_chunk_7_10.pdf': ['(The University Catalog)[16.0]',
  '(Accreditation)[16.0]',
  '(Mission Statement: The University of North Carolina at Chapel Hill)[16.0]',
  '(UNC’s Commitment to Diversity and Inclusivity)[16.0]',
  '(Policy on Prohibited Discrimination, Harassment and Related Misconduct Including Sexual and Gender-Based Harassment, Sexual Violence, Interpersonal Violence and Stalking ( https://policies.unc.edu/TDClient/2833/ Portal/Shared/Search/?c=all&s=Policy +on+Prohibited+Discrimination%2C +Harassment+and+Related+Misconduct ))[16.0]',
  '(Policy Statement on Nondiscrimination: Educational and Employment Decisions)[16.0]',
  '(Resources for Information and Assistance)[16.0]',
  '(Reporting Options)[16.0]',
  '(Conﬁdential Resources)[16.0]',
  '(Graduation Rate)[16.0]'],
 'texts_chunk_10_13.pdf': ['(Ofﬁce of the Chancellor)[16.0]',
  '(Ofﬁce of the Provost)[16.0]',
  '(College of Arts and Sciences)[16.0]',
  '(Ofﬁce of Undergraduate Education)

In [211]:
import fitz

def extract_and_mark_text_between_markers_finalized(pdf_path, target_font_size, tolerance=0.5):
    """
    Extracts the text that lies between consecutive marker spans, where a marker
    is any span whose size is within ``tolerance`` of ``target_font_size``.

    The document is read once, in page/block/line/span order. Text before the
    first marker is ignored. After a marker, consecutive spans of equal size
    are joined with single spaces into "(<text>)[<size>]" runs, and the runs
    collected up to the next marker (or the end of the document) are
    concatenated into one chunk string. Marker spans themselves are not part of
    any chunk, and a marker followed directly by another marker produces no
    chunk. Blocks without a "lines" entry (for example image blocks) are skipped.

    Markers are recognised by their position in the span stream, so two markers
    that share the same bounding box on different pages are kept apart.

    Args:
    - pdf_path (str): Path to the PDF document.
    - target_font_size (float): The font size to search for as a marker.
    - tolerance (float): The allowed deviation from the target font size.

    Returns:
    - List of concatenated chunks of text between markers, in document order.
    """
    all_marked_chunks = []

    in_chunk = False     # False until the first marker has been seen
    finished_runs = []   # completed "(text)[size]" runs of the open chunk
    run_texts = []       # span texts of the run currently being collected
    run_size = None      # size shared by the spans in run_texts

    def emit_open_chunk():
        # Close the pending run, then store the chunk if it holds any text.
        pieces = list(finished_runs)
        if run_texts:
            pieces.append(f"({' '.join(run_texts).strip()})[{run_size}]")
        if pieces:
            all_marked_chunks.append("".join(pieces))

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        if abs(span['size'] - target_font_size) <= tolerance:
                            # A marker ends the open chunk and starts the next one.
                            if in_chunk:
                                emit_open_chunk()
                            in_chunk = True
                            finished_runs = []
                            run_texts = []
                            run_size = None
                            continue

                        if not in_chunk:
                            continue

                        if run_size is None:  # First span after a marker
                            run_size = span['size']
                        elif run_size != span['size']:  # Change in font size
                            finished_runs.append(f"({' '.join(run_texts).strip()})[{run_size}]")
                            run_texts = []
                            run_size = span['size']
                        run_texts.append(span['text'])

    # The last chunk runs to the end of the document.
    if in_chunk:
        emit_open_chunk()

    return all_marked_chunks

# Run the marker-based extraction on every file in `pdf_paths`.
# Results are keyed "marked_chunks_finalized_<file name>", with the ".pdf"
# extension included.
variables = {}
target_size = 16.0
for pdf_path in pdf_paths:
    variables[f'marked_chunks_finalized_{pdf_path}'] = extract_and_mark_text_between_markers_finalized(pdf_path, target_size)

# pdf_path = "chunk_30_1032.pdf"
# target_size = 18.0
# results_23rtg = extract_and_mark_text_between_markers_finalized(pdf_path, target_size)
# results_23rtg



In [206]:
# combined_list = []
# for i in range(0,len(results_23rtg)):
#     combined_instance = f"{results_23rtgf[i]}\n{results_23rtg[i]}"
#     combined_list.append(combined_instance)



In [212]:
# Maps each PDF file name (with extension) to its list of "<title>\n<text>" strings.
combined_dict = {}

# Loop through all pdf paths
for pdf_path in pdf_paths:
    # Fetch titles for the current PDF
    titles = variables_titles.get(f'texts_{pdf_path}', [])
    # Fetch text between the titles for the current PDF
    texts = variables.get(f'marked_chunks_finalized_{pdf_path}', [])

    # Titles and texts are paired purely by position. Report a length mismatch
    # instead of silently dropping the unpaired tail, because it can mean that
    # titles are attached to the wrong text.
    if len(titles) != len(texts):
        print(
            f"Warning: {pdf_path} has {len(titles)} titles but {len(texts)} text chunks; "
            f"only the first {min(len(titles), len(texts))} pairs are kept."
        )

    # Merge titles with corresponding text (zip stops at the shorter list)
    combined_list = [f"{title}\n{text}" for title, text in zip(titles, texts)]

    # Store the combined list in the dictionary with the pdf_path as the key
    combined_dict[pdf_path] = combined_list

# Now you can access the combined list for any specific PDF using its path:
# Example: 
print(combined_dict["chunk_7_10.pdf"])


['(The University Catalog)[16.0]\n(Although the publisher of this catalog has made every reasonable effort to attain factual accuracy herein, no responsibility is assumed for editorial or clerical errors or errors occasioned by mistakes. The publisher has attempted to present information which, at the time of preparation for publication, most accurately describes the course offerings, faculty listings, policies, procedures, regulations, and requirements of the University. However, it does not establish contractual relationships. The University reserves the right to alter or change any statement contained herein without prior notice. Published by the University of North Carolina at Chapel Hill, Chapel Hill, N.C.)[8.0]', '(Accreditation)[16.0]\n(The University of North Carolina at Chapel Hill is accredited by the Southern Association of Colleges and Schools Commission on Colleges (SACSCOC) to award baccalaureate, masters, educational specialist, and doctorate degrees. Degree-granting ins

In [213]:
# Keep only the PDFs that produced at least one combined entry.
combined_dict = {
    pdf_name: entries
    for pdf_name, entries in combined_dict.items()
    if entries
}

In [207]:
import re
from collections import defaultdict
import json

pattern = r"\((.*?)\)\[([\d.]+)\]"

def create_hierarchy(matches):
    """Fold ``(content, font_size)`` pairs into a nested heading structure.

    ``font_size`` is compared as a string:

    - "18.0" starts a new top-level section (resetting any content already
      stored under that name) and clears the current subsection and key.
    - "16.0" becomes the top-level section when none is set yet; otherwise it
      opens a subsection ``{content: []}`` inside the current section. In both
      cases the current key is cleared.
    - "12.0" opens a key ``{content: ""}`` in the current subsection, or in the
      section when no subsection is open.
    - "8.0", "6.400000095367432" and "7.0" are body text: appended (with no
      separator) to the open key's string, or stored as a plain list item when
      no key is open.
    - Any other size is ignored.

    Returns the ``defaultdict(list)`` that maps section names to their item lists.
    """
    SECTION_SIZE = "18.0"
    SUBSECTION_SIZE = "16.0"
    KEY_SIZE = "12.0"
    BODY_SIZES = ("8.0", "6.400000095367432", "7.0")

    tree = defaultdict(list)
    section = None
    subsection = None
    open_key = None

    def active_items():
        # List that currently receives keys and body text.
        if subsection:
            return tree[section][-1][subsection]
        return tree[section]

    for text, size in matches:
        if size == SECTION_SIZE:
            section = text
            tree[section] = []
            subsection = None
            open_key = None
            continue

        if size == SUBSECTION_SIZE:
            if section:
                subsection = text
                tree[section].append({subsection: []})
            else:
                section = text
                tree[section] = []
            open_key = None
            continue

        if size == KEY_SIZE:
            open_key = text
            active_items().append({open_key: ""})
            continue

        if size in BODY_SIZES:
            items = active_items()
            if open_key:
                items[-1][open_key] += text
            else:
                items.append(text)

    return tree

def process_text_list(text_list):
    """Build one hierarchy from all "(text)[size]" runs found in ``text_list``.

    Every string is scanned with the module-level ``pattern`` (``re.DOTALL``),
    the ``(content, font_size)`` pairs of all strings are gathered in order,
    and the combined sequence is handed to ``create_hierarchy``.
    """
    tagged_runs = [
        run
        for text_block in text_list
        for run in re.findall(pattern, text_block, re.DOTALL)
    ]
    return create_hierarchy(tagged_runs)

# hierarchy_from_list = process_text_list(combined_list)


# json_output = json.dumps(hierarchy_from_list, indent=4)

# file_path = "chunk_30_1032.json"

# with open(file_path, 'w') as json_file:
#     json.dump(hierarchy_from_list, json_file, indent=4)



In [216]:
def process_pdfs_to_json(pdf_paths, pdf_data_dict):
    """
    Process the text chunks from the provided PDF paths and save the resulting hierarchies as JSON.

    For every path, the text chunks are looked up in ``pdf_data_dict`` first
    under the path without its ".pdf" suffix and then under the full path.
    Paths with no entry under either key are skipped. The hierarchy returned by
    ``process_text_list`` is written to "<path without .pdf>.json".

    Args:
    - pdf_paths (list): List of paths to the PDF files.
    - pdf_data_dict (dict): Dictionary mapping PDF names (with or without the
      .pdf extension) to lists of text chunks.

    Returns:
    - None
    """
    suffix = ".pdf"

    # Process each PDF's data
    for pdf_path in pdf_paths:
        # str.rstrip(".pdf") would strip any trailing '.', 'p', 'd' or 'f'
        # characters (e.g. "ugrad.pdf" -> "ugra"), so cut the suffix explicitly.
        pdf_name = pdf_path[:-len(suffix)] if pdf_path.endswith(suffix) else pdf_path

        if pdf_name in pdf_data_dict:
            text_chunks = pdf_data_dict[pdf_name]
        elif pdf_path in pdf_data_dict:
            text_chunks = pdf_data_dict[pdf_path]
        else:
            continue

        hierarchy = process_text_list(text_chunks)

        # Save the hierarchy next to the PDF name
        json_file_path = pdf_name + ".json"
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(hierarchy, json_file, indent=4)

# Write the JSON hierarchies for the files in `pdf_paths`.
# `combined_dict` is keyed by those file names, with the ".pdf" extension included.
process_pdfs_to_json(pdf_paths, combined_dict)





In [208]:
import json
import ftfy

def clean_json_data(data):
    """Recursively clean JSON-like data using ftfy.

    Strings are passed through ``ftfy.fix_text``. Lists and dicts are rebuilt
    with cleaned contents, and dict keys are cleaned as well as values. Any
    other value is returned unchanged. The input is not modified.

    Note: if two keys of one dict clean to the same string, the later entry
    replaces the earlier one in the result.
    """
    if isinstance(data, str):
        return ftfy.fix_text(data)
    if isinstance(data, list):
        return [clean_json_data(item) for item in data]
    if isinstance(data, dict):
        return {clean_json_data(key): clean_json_data(value) for key, value in data.items()}
    return data

def clean_json_file_with_ftfy(file_path):
    """Rewrite the JSON file at ``file_path`` in place with cleaned content.

    The file is read as UTF-8, passed through ``clean_json_data``, and written
    back as UTF-8 with a 4-space indent and non-ASCII characters left unescaped.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        loaded = json.load(source)

    # Clean before reopening for writing, so a cleaning error leaves the file intact.
    fixed = clean_json_data(loaded)

    with open(file_path, 'w', encoding='utf-8') as sink:
        json.dump(fixed, sink, ensure_ascii=False, indent=4)

# JSON file to clean in place.
# NOTE(review): no active cell in this notebook writes "chunk_30_1032.json"
# (the code that did is commented out) - confirm the file exists before running.
file_path = "chunk_30_1032.json"
clean_json_file_with_ftfy(file_path)


In [150]:
from reportlab.lib.pagesizes import letter, landscape
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.units import inch

def create_pdf_from_text(text, filename):
    """Render ``text`` into a landscape letter-size PDF saved as ``filename``.

    The first line of ``text`` is set as an 18 pt heading (12 pt space after);
    every following line becomes its own 8 pt body paragraph.
    """
    document = SimpleDocTemplate(filename, pagesize=landscape(letter))

    # Heading and body styles derived from the sample style sheet.
    sheet = getSampleStyleSheet()
    heading_style = ParagraphStyle(
        "TitleStyle", parent=sheet["Heading1"], fontSize=18, spaceAfter=12
    )
    paragraph_style = ParagraphStyle("BodyStyle", parent=sheet["BodyText"], fontSize=8)

    # First line is the heading, the rest are body paragraphs.
    first_line, *remaining_lines = text.split("\n")
    story = [Paragraph(first_line, heading_style)]
    for body_line in remaining_lines:
        story.append(Paragraph(body_line, paragraph_style))

    document.build(story)

# For each chunk in combined_list, create a PDF named "undergraduate_chunk_<idx>.pdf".
# NOTE(review): `combined_list` is not defined in this cell - confirm which
# earlier cell is expected to have populated it.
for idx, chunk in enumerate(combined_list):
    create_pdf_from_text(chunk, f"undergraduate_chunk_{idx}.pdf")


In [12]:
# Prefix for the output files. No path separator is added, so the files are
# written to the working directory as "txt_files_chunk_<idx>.txt".
# NOTE(review): if a "txt_files/" directory was intended, confirm and adjust.
directory = "txt_files_"

for idx, chunk in enumerate(combined_list):
    # Create a filename for each chunk
    filename = f"chunk_{idx}.txt"
    filepath = directory + filename

    # The extracted text contains non-ASCII characters, so set the encoding
    # explicitly instead of relying on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        lines = chunk.split("\n")
        # Format the first line (e.g., make it uppercase to represent larger font size)
        f.write(lines[0].upper() + "\n")
        # Write the rest of the lines, followed by a blank line
        f.write("\n".join(lines[1:]) + "\n\n")
