In [13]:
import re
import PyPDF2
import pandas as pd

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to clean and structure the extracted text
def extract_clinic_data(text):
    # Split the text into entries based on the clinic numbering pattern
    entries = re.split(r'\d+\)\s+', text)
    
    clinics = []
    
    for entry in entries:
        lines = entry.strip().split('\n')
        if len(lines) < 2:
            continue

        # Extract clinic name (first line)
        clinic_name = lines[0].strip()

        # Initialize variables
        address_lines = []
        phone = ''
        city_state_zip = ''

        for line in lines[1:]:
            line = line.strip()
            # Attempt to find a phone number
            phone_match = re.search(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', line)
            if phone_match:
                phone = phone_match.group()
            # Attempt to find city, state, and ZIP code (contains "NY")
            elif 'NY' in line:
                city_state_zip = line
            else:
                address_lines.append(line)
        
        address = ', '.join(address_lines).replace(' ,', ',')
        
        clinics.append({
            'Clinic Name': clinic_name,
            'Address': address,
            'City, State, Zip': city_state_zip,
            'Phone': phone
        })

    return clinics

# Path to the PDF file
pdf_file = 'dental-ma.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)

# Extract structured clinic data
clinic_data = extract_clinic_data(text)

# Convert to a DataFrame
df_ma = pd.DataFrame(clinic_data)

# Drop the first row of the DataFrame
df_ma = df_ma.drop(index=0)

# Display the DataFrame
print(df_ma)

                                          Clinic Name  \
1                              Betances Health Center   
2                       Lower East Side Health Center   
3                                    Ryan Health NENA   
4            New York University College of Dentistry   
5    NYU Dentistry Oral Health Center for People with   
6      The Institute for Family Health at 17th Street   
7                          Gotham Health - Gouverneur   
8             Charles B. Wang Community Health Center   
9   Bellevue Department of Oral and Maxillofacial ...   
10      Bellevue Department of Oral and Maxillofacial   
11  New York Presbyterian/Weill Cornell Dental Pra...   
12  New York Presbyterian/Weill Cornell Dental Pra...   
13             Mount Sinai St. Luke’s - Dental Clinic   
14                       Ryan Health West 97th Street   
15    Children’s Aid Society - Dunlevy Milbank Center   
16                               Harlem Health Center   
17                        Renai

In [14]:
# Add a "Borough" column and set it to "Manhattan" for all rows
df_ma['Borough'] = 'Manhattan'

In [15]:
df_ma.columns

Index(['Clinic Name', 'Address', 'City, State, Zip', 'Phone', 'Borough'], dtype='object')

In [16]:
# Path to the PDF file
pdf_file = 'dental-bk.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)

# Extract structured clinic data
clinic_data = extract_clinic_data(text)

# Convert to a DataFrame
df_bk = pd.DataFrame(clinic_data)

# Add a "Borough" column and set it to "Brooklyn" for all rows
df_bk['Borough'] = 'Brooklyn'

# Display the DataFrame
print(df_bk)

                                          Clinic Name  \
0                          DENT AL CLINICS – BROOKLYN   
1        The Brooklyn Hospital Center - Dentistry and   
2           NYU Langone Flatbush Family Health Center   
3                 Kings County Hospital Dental Clinic   
4                    HealthCare Choices - Bensonhurst   
5                     Cumberland Gotham Health Center   
6    Whitman Ingersoll Farragut Health Center of BPMC   
7     Woodhull Medical Center Department of Dentistry   
8                          Williamsburg Health Center   
9        Healthcare Choices - Community Health Center   
10       Brookdale Family Care Center New Lots Avenue   
11    Brownsville Multi -Service Family Health Center   
12                      Gotham Health - East New York   
13             MediSys Health Network - East New York   
14                        East New York Health Center   
15        Brookdale University Hospital Dental Clinic   
16       Brownsville Multi -Ser

In [17]:
# Drop the first row of the DataFrame
df_bk = df_bk.drop(index=0)

In [18]:
# Path to the PDF file
pdf_file = 'dental-bx.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)

# Extract structured clinic data
clinic_data = extract_clinic_data(text)

# Convert to a DataFrame
df_bx = pd.DataFrame(clinic_data)

# Add a "Borough" column and set it to "Bronx" for all rows
df_bx['Borough'] = 'Bronx'

# Display the DataFrame
print(df_bx)

                                          Clinic Name  \
0                             DENTAL CLINICS –  BRONX   
1             161st Street Dental Clinic (Montefiore)   
2             BronxCare Ogden Family Medical & Dental   
3                           Sun River Health - Inwood   
4        Morrisania Diagnostic and Treatment Center *   
5   BronxCare Dr. Martin Luther King, Jr. Health C...   
6            Morris Heights Health Center at Burnside   
7       Morris Heights Health Center at Walton Avenue   
8                       Union Community Health Center   
9     Walton Family Health Practice Dental Department   
10                 Third Avenue Fa mily Health Center   
11             Adapt Community Network - 137th Street   
12  BronxCare Dr. Martin Luther King, Jr. Health C...   
13      Union Community Health Center Dental Services   
14                  BronxCare Medical & Dental at Poe   
15                      Union Community Health Center   
16     951 Prospect Avenue Dent

In [19]:
# Drop the first row of the DataFrame
df_bx = df_bx.drop(index=0)

In [20]:
# Path to the PDF file
pdf_file = 'dental-qu.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)

# Extract structured clinic data
clinic_data = extract_clinic_data(text)

# Convert to a DataFrame
df_qu = pd.DataFrame(clinic_data)

# Add a "Borough" column and set it to "Bronx" for all rows
df_qu['Borough'] = 'Queens'

# Display the DataFrame
print(df_qu)

                                          Clinic Name  \
0                            DENT AL CLINICS – QUEENS   
1                   Long Island Jewish Medical Center   
2                  The Floating Hospital: Main Clinic   
3    Flushing Hospital Medical Center - Dental Clinic   
4    New York -Presbyterian Queens - Dental Emergency   
5   New York Presbyterian  Queens - Center for Dental   
6   Urban Health Pl an- Plaza Del Sol Family Healt...   
7                      Apicha Community Health Center   
8         MediSys  Family Care Center - Hollis Tudors   
9     MediSys Health Network - Ozone Park (Clocktower   
10           MediSys - Jamaica Hospital Dental Clinic   
11        Firehouse Health Center (Damian Family Care   
12                        Health and Hospitals Queens   
13                MediSys Health Network - St. Albans   
14              Joseph P Addabbo Family Health Center   
15              Joseph P Addabbo Family Health Center   
16                          Dam

In [21]:
# Drop the first row of the DataFrame
df_qu = df_qu.drop(index=0)

In [22]:
# Path to the PDF file
pdf_file = 'dental-si.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)

# Extract structured clinic data
clinic_data = extract_clinic_data(text)

# Convert to a DataFrame
df_si = pd.DataFrame(clinic_data)

# Add a "Borough" column and set it to "Bronx" for all rows
df_si['Borough'] = 'Staten Island'

# Display the DataFrame
print(df_si)

                                       Clinic Name  \
0                  DENT AL CLINICS – STATEN ISLAND   
1                    Sun River Health - Bay Street   
2              Community Health Center of Richmond   
3    Metro Community Health Center - Staten Island   
4  Community Health Center of Richmond Stapleton -   
5      Staten Island University Hospital Northwell   

                                             Address  \
0  , August  2023,, Listed below are dental clini...   
1                                          57 Bay St   
2                              439 Port Richmond Ave   
3                                    2324 Forest Ave   
4                St. George, 135 Canal St, Suite 200   
5                                    475 Seaview Ave   

           City, State, Zip         Phone        Borough  
0                                          Staten Island  
1   Staten Island, NY 10301  855-681-8700  Staten Island  
2   Staten Island, NY 10302  917-830-0838  Staten I

In [23]:
# Drop the first row of the DataFrame
df_si = df_si.drop(index=0)

Jam it all together and save it

In [25]:
# List all the DataFrames
dataframes = [df_si, df_bx, df_ma, df_qu, df_bk]

# Concatenate all the DataFrames into one
df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(df)

df.to_csv('dental.csv', index=False)

                                           Clinic Name  \
0                        Sun River Health - Bay Street   
1                  Community Health Center of Richmond   
2        Metro Community Health Center - Staten Island   
3      Community Health Center of Richmond Stapleton -   
4          Staten Island University Hospital Northwell   
..                                                 ...   
108  Sunset Terrace Family Health Center, NYU Lango...   
109                            Adapt Community Network   
110             Joseph P. Addabbo Family Health Center   
111       Wycoff Heights Medical Center, Department of   
112      ODA Primary Health Care Network Dental Center   

                                 Address          City, State, Zip  \
0                              57 Bay St   Staten Island, NY 10301   
1                  439 Port Richmond Ave   Staten Island, NY 10302   
2                        2324 Forest Ave  Staten Island, NY 1030 3   
3    St. George, 135 Ca

In [26]:
import pandas as pd

# Set the display options to show all rows
pd.set_option('display.max_rows', None)

# Display the DataFrame
df

Unnamed: 0,Clinic Name,Address,"City, State, Zip",Phone,Borough
0,Sun River Health - Bay Street,57 Bay St,"Staten Island, NY 10301",855-681-8700,Staten Island
1,Community Health Center of Richmond,439 Port Richmond Ave,"Staten Island, NY 10302",917-830-0838,Staten Island
2,Metro Community Health Center - Staten Island,2324 Forest Ave,"Staten Island, NY 1030 3",718-447-0200,Staten Island
3,Community Health Center of Richmond Stapleton -,"St. George, 135 Canal St, Suite 200","Staten Island, NY 10304",917-830-1950,Staten Island
4,Staten Island University Hospital Northwell,475 Seaview Ave,"Staten Island, NY 10305",718-226-9080,Staten Island
5,161st Street Dental Clinic (Montefiore),"305 East 161st St, Lower Level","Bronx, NY 10451",718-579-2535,Bronx
6,BronxCare Ogden Family Medical & Dental,1067 Ogden Ave,,718-466-3222,Bronx
7,Sun River Health - Inwood,1543- 45 Inwood Ave,"Bronx, NY 10452",855-681-8700,Bronx
8,Morrisania Diagnostic and Treatment Center *,"1225 Gerard Ave, 3rd Floor","Bronx, NY 10452",718-960-2911,Bronx
9,"BronxCare Dr. Martin Luther King, Jr. Health C...",1775 Grand Concourse,,718-901-8400,Bronx
