In [1]:
import pdfplumber
import pandas as pd
from pypdf import PdfReader

# extract tables:
def extract_text_and_tables(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        all_data = []
        
        for page in pdf.pages:
            text = page.extract_text()
            tables = page.extract_tables()

            if text:
                # Table settings.
                ts = {
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                }

                # Get the bounding boxes of the tables on the page.
                bboxes = [table.bbox for table in page.find_tables(table_settings=ts)]

                def not_within_bboxes(obj):
                    # """Check if the object is in any of the table's bbox."""
                    def obj_in_bbox(_bbox):
                        v_mid = (obj["top"] + obj["bottom"]) / 2
                        h_mid = (obj["x0"] + obj["x1"]) / 2
                        x0, top, x1, bottom = _bbox
                        return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
                    return not any(obj_in_bbox(__bbox) for __bbox in bboxes)
                
                text = page.filter(not_within_bboxes).extract_text()
                
                all_data.append({'page_number': page.page_number, 'text': text})

            if tables:
                for table in tables:
                    all_data.append({'page_number': page.page_number, 'table': table})

    return all_data

#code to extract images
def extract_images(pdf_file):
    reader = PdfReader(pdf_file)
    for i in range(len(reader.pages)):
        page = reader.pages[i]
        if page:
            for im in page.images:
                with open(im.name, "wb") as f:
                    f.write(im.data)

# read PDF file
pdf_file = "CSMVol2_PaxService (1).pdf"
data = extract_text_and_tables(pdf_file)

images = extract_images(pdf_file)

# Create Vecor store
for entry in data:
    page_number = entry['page_number']
    if 'text' in entry:
        print(f"Text from page {page_number}:\n{entry['text']}\n")
    if 'table' in entry:
        print(f"Table from page {page_number}:\n")
        df = pd.DataFrame(entry['table'][1:], columns=entry['table'][0])
        df = df.dropna(axis=1, how='all')
        df = df.reset_index(drop=True)
        print(df.head(1))


Text from page 1:
Alaska Airlines
Horizon Air(cid:13)
Customer Service Manual
Volume 2 - Customer Services
Printed: 10/6/2023

Text from page 2:
The Customer Service Manual is an electronic manual. This is a printed version of the electronic manual.
The page numbers in this printed version are for reference only, and have no correlation in the electronic
version.
Use the path shown at the beginning of each topic to reference the location of the information within the
electronic version.

Text from page 3:
Contents
Table of Contents
Volume 2 - Passenger Service 1
History. .o..f.. .C..h..a..n..g..e..s. ............................................................................................................... 1
Sect.i.o..n.. .1. .-.. .In..t..r.o..d..u..c..t.i.o..n.. .................................................................................................... 100
100 - Passenger Service... .C...S..M.... .V..o...l.u..m....e.. ......................................................

Empty DataFrame
Columns: []
Index: []
Text from page 872:
Customer Service Manual
· Add this information in the Flight STAR, Remarks area.
· Make sure to include any special service information.
Notify the following if necessary:
Make a final diversion announcement on the aircraft.
Add any additional information to the Flight STAR.
Inbound Flight
Verify the Flight STAR for pertinent information, including SSRs.
Notify the following if necessary:
· Order bus transportation. Enter the name and telephone number of the bus company in the Flight
STAR, Remarks.
Example: Greyhound (206) 555-1212, SJC-
SFO
· The meeting agent shall make an onboard announcement stating where the baggage can be
claimed and the bus location.
· Verify all baggage has been claimed prior to the bus’ departure for the destination city.
· An agent must be present at the bus for boarding.
· Update the Flight STAR mask with the total number of busses and the departure time of the last
bus.
· Add any additional informati

In [None]:
extract_images(pdf_file)