In [1]:

# Search All Numbered PDFs for "length:" Context and Compare Numbers (2-word context, no progress bar)

# This cell will:
# - Search all PDFs in the Downloads folder with a filename format like 180535.pdf (number.pdf only).
# - For each PDF, find all occurrences of "length:" and extract the 2 words before and after.
# - Extract numbers from each match.
# - Compare the numbers in match 1 and match 2 for each PDF.
# - Print status updates for visual feedback.
# - Output a DataFrame with project number, match details, and useful summary columns.

import os
import re
import PyPDF2
import pandas as pd

# Directory containing the PDFs to scan.
# NOTE(review): absolute, machine-specific Windows path -- adjust it before
# running the notebook on another machine.
pdf_dir = r"C:\Users\clint\Desktop\RA Task\Downloads"

# Regex to match files like 180535.pdf (number.pdf only): the name must consist
# of digits followed by a lowercase ".pdf". The match is case-sensitive, so
# names such as "180535.PDF" or "180535_v2.pdf" are not selected.
pdf_pattern = re.compile(r"^\d+\.pdf$")

# Function to extract numbers from a string
def extract_numbers(s):
    """Return every numeric token found in ``s`` as a list of strings.

    A token is a run of digits optionally followed by a dot and more digits
    (for example ``"12"`` or ``"10.15"``). Tokens are returned in order of
    appearance and are not converted to numbers. Signs, thousands separators
    and a leading dot are not treated as part of a token.
    """
    number_re = re.compile(r"\d+(?:\.\d+)?")
    return [hit.group(0) for hit in number_re.finditer(s)]

# Store results: one dict per PDF that could be read
results = []
# (filename, error message) for every PDF that could not be read, so that
# failures remain visible after the long per-file log below
failed_files = []

# Only include files that match the number.pdf pattern; sorted so the
# processing order does not depend on the order os.listdir happens to return
pdf_files = sorted(f for f in os.listdir(pdf_dir) if pdf_pattern.match(f))

print(f"Selected {len(pdf_files)} PDF files to process (number.pdf format only).\n")

for filename in pdf_files:
    print(f"\nProcessing file: {filename}")
    pdf_path = os.path.join(pdf_dir, filename)
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Pull the text of every page before the file is closed
            page_texts = [page.extract_text() for page in reader.pages]
        # Skip pages for which no text was extracted, then split on whitespace
        words = " ".join(t for t in page_texts if t).split()
        # For each word starting with "length:" (case-insensitive) keep a
        # window of up to two words before and two words after it; slicing
        # clamps the window at the end of the word list
        matches = [
            " ".join(words[max(i - 2, 0):i + 3])
            for i, word in enumerate(words)
            if word.lower().startswith("length:")
        ]
        # Extract numbers from each match
        match_numbers = [extract_numbers(m) for m in matches]
        # Compare numbers in match 1 and match 2 (None when there are fewer
        # than two matches). The numbers are compared as lists of strings.
        comparison = None
        if len(match_numbers) >= 2:
            comparison = match_numbers[0] == match_numbers[1]
        results.append({
            "filename": filename,
            "matches": matches,
            "match_numbers": match_numbers,
            "comparison": comparison
        })
        print(f"  Found {len(matches)} 'length:' matches.")
    except Exception as e:
        # Best effort: keep going, but remember the failure for the recap
        failed_files.append((filename, str(e)))
        print(f"Error reading {filename}: {e}")

# Recap of the files that are absent from `results` because reading failed
if failed_files:
    print(f"\n{len(failed_files)} file(s) could not be read and are missing from the results:")
    for failed_name, reason in failed_files:
        print(f"  {failed_name}: {reason}")

# Print results: every match with its extracted numbers, per file
print("\nSummary of Results:\n" + "="*40)
for res in results:
    print(f"File: {res['filename']}")
    for idx, (match, nums) in enumerate(zip(res['matches'], res['match_numbers']), 1):
        print(f"  Match {idx}: {match}")
        print(f"    Numbers: {nums}")
    # 'comparison' is None when the file had fewer than two matches
    if res['comparison'] is not None:
        print(f"  Match 1 and Match 2 numbers are the same: {res['comparison']}")
    print("-" * 40)

# `results` only holds files that were read without an error, so report that
# count against the number of selected files instead of claiming that every
# selected file was processed.
files_with_two_matches = sum(1 for r in results if len(r['matches']) >= 2)
print(f"\nProcessed {len(results)} of {len(pdf_files)} files. {files_with_two_matches} files had at least two 'length:' matches.")

# Prepare data for DataFrame: one row per entry of `results`
rows = []
for res in results:
    # Project number is the part of the file name before the first dot
    project_number = res['filename'].split('.')[0]
    match_numbers = res['match_numbers']
    num_matches = len(match_numbers)
    # Flatten matches for columns: match_1, match_2, ...
    # At least 2 columns for consistency; missing matches are padded with []
    match_cols = {
        f'match_{i+1}': match_numbers[i] if i < num_matches else []
        for i in range(max(num_matches, 2))
    }
    # Comparisons only make sense with at least two matches. Use None (not
    # False) otherwise, so "fewer than two matches" is never reported as
    # "the matches differ" and both comparison columns agree.
    if num_matches >= 2:
        all_same = all(match_numbers[0] == m for m in match_numbers[1:])
        comparison_1_2 = match_numbers[0] == match_numbers[1]
    else:
        all_same = None
        comparison_1_2 = None
    # Add number of matches and number of numbers found in each match
    row = {
        'project_number': project_number,
        'num_matches': num_matches,
        **match_cols,
        'all_matches_same': all_same,
        'comparison_1_2': comparison_1_2,
        'numbers_in_match_1': len(match_numbers[0]) if num_matches > 0 else 0,
        'numbers_in_match_2': len(match_numbers[1]) if num_matches > 1 else 0
    }
    rows.append(row)

# Create DataFrame
pdf_df = pd.DataFrame(rows)

print("\nDataFrame of extracted length numbers and comparisons:")
display(pdf_df)

Selected 202 PDF files to process (number.pdf format only).


Processing file: 170561.pdf
  Found 2 'length:' matches.

Processing file: 170626.pdf
  Found 2 'length:' matches.

Processing file: 170634.pdf
  Found 2 'length:' matches.

Processing file: 174015.pdf
  Found 2 'length:' matches.

Processing file: 174016.pdf
  Found 2 'length:' matches.

Processing file: 180003.pdf
  Found 2 'length:' matches.

Processing file: 180004.pdf
  Found 2 'length:' matches.

Processing file: 180006.pdf
  Found 2 'length:' matches.

Processing file: 180012.pdf
  Found 2 'length:' matches.

Processing file: 180014.pdf
  Found 2 'length:' matches.

Processing file: 180020.pdf
  Found 2 'length:' matches.

Processing file: 180024.pdf
  Found 2 'length:' matches.

Processing file: 180025.pdf
  Found 2 'length:' matches.

Processing file: 180028.pdf
  Found 2 'length:' matches.

Processing file: 180029.pdf
  Found 2 'length:' matches.

Processing file: 180031.pdf
  Found 2 'length:' matches.

Processing

Unnamed: 0,project_number,num_matches,match_1,match_2,all_matches_same,comparison_1_2,numbers_in_match_1,numbers_in_match_2
0,170561,2,[10.15],[10.15],True,True,1,1
1,170626,2,[],[],True,True,0,0
2,170634,2,[13.93],[13.93],True,True,1,1
3,174015,2,[5.45],[5.45],True,True,1,1
4,174016,2,[1.22],[1.22],True,True,1,1
...,...,...,...,...,...,...,...,...
197,180611,2,[],[],True,True,0,0
198,180621,2,[13.06],[13.06],True,True,1,1
199,180622,2,[24.61],[24.58],False,False,1,1
200,184002,2,[1.67],[1.67],True,True,1,1


We check whether the Project Length and the Work Length reported in each PDF are the same. They are not always the same (the rows where they differ are listed below), so we need to decide whether to use Project Length or Work Length.

In [2]:
# Show all rows in pdf_df whose 'length:' matches disagree.
# Rows with fewer than two matches are excluded: for them 'all_matches_same'
# is not the outcome of an actual comparison, so they must not be listed as
# differing.
pdf_df[(pdf_df['num_matches'] >= 2) & (pdf_df['all_matches_same'] == False)]

Unnamed: 0,project_number,num_matches,match_1,match_2,all_matches_same,comparison_1_2,numbers_in_match_1,numbers_in_match_2
5,180003,2,[9.19],[9.13],False,False,1,1
30,180090,2,[8.85],[8.76],False,False,1,1
49,180139,2,[4.56],[4.18],False,False,1,1
82,180232,2,[1.19],[1.20],False,False,1,1
117,180385,2,[2.85],[2.76],False,False,1,1
150,180475,2,[18.97],[18.92],False,False,1,1
156,180490,2,[1.18],[1.25],False,False,1,1
162,180509,2,[12.47],[12.31],False,False,1,1
165,180520,2,[19.40],[19.20],False,False,1,1
167,180522,2,[16.98],[12.27],False,False,1,1


In [3]:
# Write the comparison table to a CSV file in the working directory, leaving out the row index
pdf_df.to_csv(path_or_buf="pdf_length_comparison.csv", index=False)
print("Saved pdf_df to pdf_length_comparison.csv")

Saved pdf_df to pdf_length_comparison.csv


It is not obvious whether to use Project Length or Work Length. Because the variable is defined as "mileage: length of the project", we use Project Length.

In [9]:
# Extract numbers associated with 'Project Length: NA Miles' or 'Project Length: NA MI Miles' from each PDF
import os
import re
import PyPDF2
import pandas as pd

# Folder with the PDFs and the filter that keeps only number.pdf names (e.g. 180535.pdf)
pdf_dir = r"C:\Users\clint\Desktop\RA Task\Downloads"
pdf_pattern = re.compile(r"^\d+\.pdf$")

# Finds 'Project Length: NA Miles' or 'Project Length: NA MI Miles', where NA
# stands for the number. Handles:
# - "Project Length: 7.21 Miles"
# - "Project Length: 7.21 MI Miles"
# - Various whitespace combinations
# Matching ignores case; the number is the only captured group.
project_length_re = re.compile(r"Project Length:\s*([\d\.]+)\s*(?:MI\s+)?Miles", re.IGNORECASE)

project_length_results = []
pdf_files = [name for name in os.listdir(pdf_dir) if pdf_pattern.match(name)]

for filename in pdf_files:
    # Start from the "nothing found" record; it is completed below with either
    # the extracted lengths or the error message.
    record = {"filename": filename, "project_lengths": []}
    try:
        with open(os.path.join(pdf_dir, filename), "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Concatenate the text of every page that yielded any, each followed by a space
            text = "".join(
                page_text + " "
                for page_text in (page.extract_text() for page in reader.pages)
                if page_text
            )
            record["project_lengths"] = project_length_re.findall(text)
    except Exception as e:
        # Keep the file in the table and note why it could not be processed
        record["error"] = str(e)
    project_length_results.append(record)

# Create DataFrame
project_length_df = pd.DataFrame(project_length_results)
print("Project Lengths extracted from PDFs (handling both 'Miles' and 'MI Miles' formats):")
display(project_length_df)

Project Lengths extracted from PDFs (handling both 'Miles' and 'MI Miles' formats):


Unnamed: 0,filename,project_lengths
0,170561.pdf,[10.15]
1,170626.pdf,[]
2,170634.pdf,[13.93]
3,174015.pdf,[5.45]
4,174016.pdf,[1.22]
...,...,...
206,180622.pdf,[24.61]
207,184002.pdf,[1.67]
208,187023.pdf,[17.59]
209,188000.pdf,[5.45]


In [10]:
# Persist the extracted project lengths as a CSV file (row index left out)
project_length_df.to_csv(path_or_buf="project_length_extracted.csv", index=False)
print("Saved project_length_df to project_length_extracted.csv")

Saved project_length_df to project_length_extracted.csv


In [8]:
# Test the improved regex pattern with sample text containing both formats
import re

# Test strings with different formats
test_texts = [
    "Project Length: 7.21 Miles",
    "Project Length: 7.21 MI Miles",
    "Project Length:    5.45   Miles",
    "Project Length:   3.22 MI    Miles",
    "project length: 10.15 mi miles",  # Test case-insensitive
    "Work Length: 2.50 Miles",  # Should not match
    "Project Length: 15.33 KM"   # Should not match
]

# Captures expected for each entry of test_texts, in the same order
expected_matches = [
    ["7.21"],
    ["7.21"],
    ["5.45"],
    ["3.22"],
    ["10.15"],
    [],
    [],
]

# The improved regex pattern
pattern = r"Project Length:\s*([\d\.]+)\s*(?:MI\s+)?Miles"

print("Testing improved regex pattern:")
print("Pattern:", pattern)
print("\nTest results:")
# Guard against the two lists drifting apart when cases are added
assert len(test_texts) == len(expected_matches), "every test text needs an expected result"
for text, expected in zip(test_texts, expected_matches):
    matches = re.findall(pattern, text, re.IGNORECASE)
    print(f"Text: '{text}' -> Matches: {matches}")
    # Fail loudly instead of relying on someone reading the printed output
    assert matches == expected, f"Pattern returned {matches} for '{text}', expected {expected}"

# Only reached when every case above produced its expected captures
print("\nThe pattern successfully captures numbers from both 'Miles' and 'MI Miles' formats!")

Testing improved regex pattern:
Pattern: Project Length:\s*([\d\.]+)\s*(?:MI\s+)?Miles

Test results:
Text: 'Project Length: 7.21 Miles' -> Matches: ['7.21']
Text: 'Project Length: 7.21 MI Miles' -> Matches: ['7.21']
Text: 'Project Length:    5.45   Miles' -> Matches: ['5.45']
Text: 'Project Length:   3.22 MI    Miles' -> Matches: ['3.22']
Text: 'project length: 10.15 mi miles' -> Matches: ['10.15']
Text: 'Work Length: 2.50 Miles' -> Matches: []
Text: 'Project Length: 15.33 KM' -> Matches: []

The pattern successfully captures numbers from both 'Miles' and 'MI Miles' formats!


In [None]:

# Write project_length_df to a separate CSV file, leaving out the row index
project_length_df.to_csv(path_or_buf="project_length_extracted_improved.csv", index=False)
print("\nSaved improved project_length_df to project_length_extracted_improved.csv")