In [1]:
!pip install pypdf2
!pip install pdf2image
!pip install pytesseract
!pip install tqdm
!pip install tabula-py
!apt install ghostscript python3-tk
!pip install opencv-python
!pip install pdfplumber
!pip install --upgrade pymupdf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ghostscript is already the newest version (9.55.0~dfsg1-0ubuntu5.10).
python3-tk is already the newest version (3.10.8-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
import os
import json
from PIL import Image
import re
from tqdm import tqdm
from IPython.display import display
import pandas as pd
import shutil

In [3]:
# Load the PDF
pdf_path = 'Understanding-NEPA-Litigation_v4.pdf'
reader = PdfReader(pdf_path)

In [4]:
print("no. of pages = ", len(reader.pages))

no. of pages =  20


In [5]:
raw_text = ""
page_text = {}
c = 0
for page in tqdm(reader.pages):
    raw_text += page.extract_text() + "\n ---------- NEW PAGE ---------- \n"
    page_text[c+1] = page.extract_text()
    c+=1

100%|██████████| 20/20 [00:00<00:00, 26.16it/s]


### Just check the content

In [6]:
print(page_text[3])

3
Key findings:  
•  Between 2013 and 2022, circuit courts heard approximately 39 NEPA appeals cases per year,  
a 56% increase over the rate from 2001 to 2015 .2
•  Agencies won about 80% of the 2013-2022 appeals cases, 11% more per year than from 2001 
to 2004 , 8% more than from 2001 to 2008 , and 4% less than from 2009 to 2015 .3 The rate at 
which agencies’ reviews are upheld is high, meaning these environmental reviews are seldom 
changed as a result of litigation.
•  On average, 4.2 years elapsed between publication of an environmental impact statement  
or environmental assessment and conclusion of the corresponding legal challenge at the 
appellate level. Of these appealed cases, 84% were closed less than six years after the contested 
permit was published, and 39% were closed in less than three.
•  Among the challenges, 42% contested environmental impact statements, and 36% contested 
environmental assessments. Agencies won about 80% of challenges to both.
•  NGOs instigated 

In [7]:
cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?@#$%^&*()+=:;\'"-]', '', raw_text)

In [8]:
clean_pagewise_text = {}
for i in page_text:
    clean_pagewise_text[i] = re.sub(r'[^a-zA-Z0-9\s.,!?@#$%^&*()+=:;\'"-]', '', page_text[i])

In [9]:
print(clean_pagewise_text[3])

3
Key findings:  
  Between 2013 and 2022, circuit courts heard approximately 39 NEPA appeals cases per year,  
a 56% increase over the rate from 2001 to 2015 .2
  Agencies won about 80% of the 2013-2022 appeals cases, 11% more per year than from 2001 
to 2004 , 8% more than from 2001 to 2008 , and 4% less than from 2009 to 2015 .3 The rate at 
which agencies reviews are upheld is high, meaning these environmental reviews are seldom 
changed as a result of litigation.
  On average, 4.2 years elapsed between publication of an environmental impact statement  
or environmental assessment and conclusion of the corresponding legal challenge at the 
appellate level. Of these appealed cases, 84% were closed less than six years after the contested 
permit was published, and 39% were closed in less than three.
  Among the challenges, 42% contested environmental impact statements, and 36% contested 
environmental assessments. Agencies won about 80% of challenges to both.
  NGOs instigated 72% of

In [10]:
base_dir = '/Processed_NEPA_Litigation'
os.makedirs(base_dir, exist_ok=True)

In [11]:
text_dir = os.path.join(base_dir, 'Text')
if os.path.exists(text_dir):
  shutil.rmtree(text_dir)
os.makedirs(text_dir, exist_ok=True)

In [12]:
file_path = os.path.join(text_dir, 'full_text.txt')
with open(file_path, 'w') as file:
    file.write(cleaned_text)

In [13]:
for i in clean_pagewise_text:
    file_path = os.path.join(text_dir, f'page_{i}.txt')
    with open(file_path, 'w') as file:
        file.write(clean_pagewise_text[i])

### Tables

In [14]:
from tabula import read_pdf

In [15]:
tables_new = {}
for i in tqdm(range(len(reader.pages))):
    tables = read_pdf(pdf_path, pages=i+1,
                      multiple_tables=True, stream=True)
    tables_new[i+1] = []
    if len(tables) > 0:
      for df in tqdm(tables):
          # Check if the first column's first row is NaN
          if pd.isna(df.iloc[1, 0]):
              # Create new column headers
              new_columns = []
              for col, (val1, val2) in zip(df.columns, zip(df.iloc[0], df.iloc[1])):
                  # Include original column name if not "Unnamed"
                  col_header = (col if "Unnamed" not in col else "") + \
                              (str(val1) if not pd.isna(val1) else "") + \
                              (str(val2) if not pd.isna(val2) else "")
                  new_columns.append(col_header.strip())

              # Set the new column headers and drop the first two rows
              df.columns = new_columns
              df = df.iloc[2:].reset_index(drop=True)

              tables_new[i+1].append(df)
          else:
              tables_new[i+1].append(df)

 25%|██▌       | 5/20 [00:25<01:05,  4.38s/it]
100%|██████████| 2/2 [00:00<00:00, 5577.53it/s]
 30%|███       | 6/20 [00:28<00:54,  3.91s/it]
100%|██████████| 1/1 [00:00<00:00, 1919.59it/s]
 35%|███▌      | 7/20 [00:31<00:48,  3.70s/it]
100%|██████████| 1/1 [00:00<00:00, 2631.31it/s]
 45%|████▌     | 9/20 [00:38<00:38,  3.53s/it]
100%|██████████| 2/2 [00:00<00:00, 1362.45it/s]
 50%|█████     | 10/20 [00:41<00:33,  3.38s/it]
100%|██████████| 1/1 [00:00<00:00, 299.36it/s]
 60%|██████    | 12/20 [00:46<00:23,  2.93s/it]
100%|██████████| 1/1 [00:00<00:00, 1210.13it/s]
 70%|███████   | 14/20 [00:52<00:18,  3.03s/it]
100%|██████████| 1/1 [00:00<00:00, 797.40it/s]
100%|██████████| 20/20 [01:07<00:00,  3.39s/it]


In [16]:
display(tables_new[6][0])

Unnamed: 0,Project category,Number of cases,Minimum days,Maximum days,Average days,Median days
0,Energy,70,110,5032,1415,1159
1,Infrastructure,45,91,3456,1250,1127
2,Other,37,210,3648,1531,1511
3,Public lands,106,98,6942,1744,1486
4,,,Minimum,Maximum,Average,Median
5,Total categories in days,–,91,6942,1538,1365
6,Total categories in years,–,0.2,19.0,4.2,3.7


In [17]:
tables_dir = os.path.join(base_dir, 'Tables')
if os.path.exists(tables_dir):
    shutil.rmtree(tables_dir)
os.makedirs(tables_dir, exist_ok=True)

In [18]:
c = 0
for i in range(len(tables_new)):
    if len(tables_new[i+1]) == 0:
        continue
    for j in range(len(tables_new[i+1])):
      file_path = os.path.join(tables_dir, f'page_{i+1}_table_{c+1}.csv')
      c+=1
      tables_new[i+1][j].to_csv(file_path, index=False)

### Images

In [19]:
import pymupdf

In [20]:
pdf_document = pymupdf.open(pdf_path)
image_count = 0

In [21]:
images_dir = os.path.join(base_dir, 'Images')

if os.path.exists(images_dir):
    shutil.rmtree(images_dir)
os.makedirs(images_dir, exist_ok=True)

In [22]:
# Iterate through each page
for page_number in range(len(pdf_document)):
    page = pdf_document[page_number]
    images = page.get_images(full=True)

    for img_index, img in enumerate(images):
        xref = img[0]  # Get the image reference
        base_image = pdf_document.extract_image(xref)

        # Get image data
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # Save the image
        image_filename = f"page_{page_number+1}_img{img_index+1}.{image_ext}"
        file_path = os.path.join(images_dir, image_filename)
        with open(file_path, "wb") as image_file:
            image_file.write(image_bytes)

### PDF metadata

In [23]:
# Define the folder paths
tables_folder = tables_dir
images_folder = images_dir
text_folder = text_dir

# Initialize metadata dictionary
metadata = {}
metadata['PDF_Name'] = pdf_path
metadata['Page_Count'] = len(reader.pages)

# Extract tables
page_tables = {}
for filename in os.listdir(tables_folder):
    match = re.match(r"page_(\d+)_table_\d+\.csv", filename)
    if match:
        page_number = int(match.group(1))
        page_tables.setdefault(page_number, []).append(filename)

# Extract images
page_images = {}
for filename in os.listdir(images_folder):
    match = re.match(r"page_(\d+)_img\d+\.(jpeg|jpg)", filename)
    if match:
        page_number = int(match.group(1))
        page_images.setdefault(page_number, []).append(filename)

# Extract text files and link elements
for filename in os.listdir(text_folder):
    match = re.match(r"page_(\d+)\.txt", filename)
    if match:
        page_number = int(match.group(1))
        linked_tables = page_tables.get(page_number, [])
        linked_images = page_images.get(page_number, [])

        metadata[f"Page {page_number}"] = {
            "Text": filename,
            "Tables": linked_tables,
            "Images": linked_images
        }

# Print metadata
import json
print(json.dumps(metadata, indent=2))


{
  "PDF_Name": "Understanding-NEPA-Litigation_v4.pdf",
  "Page_Count": 20,
  "Page 15": {
    "Text": "page_15.txt",
    "Tables": [
      "page_15_table_9.csv"
    ],
    "Images": []
  },
  "Page 4": {
    "Text": "page_4.txt",
    "Tables": [],
    "Images": []
  },
  "Page 16": {
    "Text": "page_16.txt",
    "Tables": [],
    "Images": []
  },
  "Page 20": {
    "Text": "page_20.txt",
    "Tables": [],
    "Images": []
  },
  "Page 12": {
    "Text": "page_12.txt",
    "Tables": [],
    "Images": []
  },
  "Page 19": {
    "Text": "page_19.txt",
    "Tables": [],
    "Images": []
  },
  "Page 7": {
    "Text": "page_7.txt",
    "Tables": [
      "page_7_table_3.csv"
    ],
    "Images": []
  },
  "Page 9": {
    "Text": "page_9.txt",
    "Tables": [],
    "Images": []
  },
  "Page 11": {
    "Text": "page_11.txt",
    "Tables": [
      "page_11_table_7.csv"
    ],
    "Images": []
  },
  "Page 13": {
    "Text": "page_13.txt",
    "Tables": [
      "page_13_table_8.csv"
    ],
 

In [24]:

metadata_dir = os.path.join(base_dir, 'Metadata')
if os.path.exists(metadata_dir):
    shutil.rmtree(metadata_dir)
os.makedirs(metadata_dir, exist_ok=True)

file_name = "metadata.json"
file_path = os.path.join(metadata_dir, file_name)

In [25]:
with open(file_path, "w") as json_file:
    json.dump(metadata, json_file, indent=4)