In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Send a GET request to the website
url = "https://www.nyc.gov/site/doh/services/sexual-health-clinics.page"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Initialize an empty list to store the data
    data = []
    
    # Find all the clinic names and their associated information
    collapsible_articles = soup.find_all('div', class_='collapsible-articles')
    collapsible_contents = soup.find_all('div', class_='collapsible-contents')
    
    # Loop through the matched elements
    for article, content in zip(collapsible_articles, collapsible_contents):
        # Extract the clinic name
        clinic_name = article.find('p').get_text(strip=True)
        
        # Extract the clinic information
        clinic_info = content.get_text(separator='\n', strip=True)
        
        # Append the data as a list
        data.append([clinic_name, clinic_info])
    
    # Convert the list into a DataFrame
    df = pd.DataFrame(data, columns=['Clinic Name', 'Information'])
    
    # Display the DataFrame
    print(df)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

                            Clinic Name  \
0            Service Times and Closures   
1                          Telemedicine   
2              Clinic Services Provided   
3       Morrisania Sexual Health Clinic   
4          Jamaica Sexual Health Clinic   
5           Corona Sexual Health Clinic   
6      Fort Greene Sexual Health Clinic   
7            Fort Greene Express Clinic   
8          Chelsea Sexual Health Clinic   
9                Chelsea Express Clinic   
10  Central Harlem Sexual Health Clinic   
11                       Closed Clinics   
12                         Test Results   
13         Health Insurance and Billing   
14              Medical Records Request   

                                          Information  
0   Regular Hours\nSee below for hours for each cl...  
1   For faster service, you can speak to a NYC Sex...  
2   The following services are available on a walk...  
3   1309 Fulton Avenue, Second Floor, Bronx\nMonda...  
4   90-37 Parsons Boulevard, Fi

In [28]:
df = df.drop([0, 1, 2])

In [29]:
clinic_names = ['Morrisania Sexual Health Clinic', 'Jamaica Sexual Health Clinic', 'Corona Sexual Health Clinic', 'Fort Greene Sexual Health Clinic', 'Fort Greene Express Clinic', 'Chelsea Sexual Health Clinic', 'Chelsea Express Clinic', 'Central Harlem Sexual Health Clinic']
df_filtered = df[df['Clinic Name'].isin(clinic_names)]
df_filtered

Unnamed: 0,Clinic Name,Information
3,Morrisania Sexual Health Clinic,"1309 Fulton Avenue, Second Floor, Bronx\nMonda..."
4,Jamaica Sexual Health Clinic,"90-37 Parsons Boulevard, First Floor, Queens\n..."
5,Corona Sexual Health Clinic,"34-33 Junction Boulevard, First Floor, Queens\..."
6,Fort Greene Sexual Health Clinic,"295 Flatbush Avenue Extension, Second Floor, B..."
7,Fort Greene Express Clinic,"295 Flatbush Avenue Extension, First Floor, Br..."
8,Chelsea Sexual Health Clinic,"303 9th Avenue, First Floor, Manhattan\nMonday..."
9,Chelsea Express Clinic,"303 9th Avenue, First Floor, Manhattan\nMonday..."
10,Central Harlem Sexual Health Clinic,"2238 Fifth Avenue, First Floor, Manhattan\nMon..."


In [30]:
df_filtered['Address'] = df_filtered['Information'].str.split('\n').str[0]
df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Address'] = df_filtered['Information'].str.split('\n').str[0]


Unnamed: 0,Clinic Name,Information,Address
3,Morrisania Sexual Health Clinic,"1309 Fulton Avenue, Second Floor, Bronx\nMonda...","1309 Fulton Avenue, Second Floor, Bronx"
4,Jamaica Sexual Health Clinic,"90-37 Parsons Boulevard, First Floor, Queens\n...","90-37 Parsons Boulevard, First Floor, Queens"
5,Corona Sexual Health Clinic,"34-33 Junction Boulevard, First Floor, Queens\...","34-33 Junction Boulevard, First Floor, Queens"
6,Fort Greene Sexual Health Clinic,"295 Flatbush Avenue Extension, Second Floor, B...","295 Flatbush Avenue Extension, Second Floor, B..."
7,Fort Greene Express Clinic,"295 Flatbush Avenue Extension, First Floor, Br...","295 Flatbush Avenue Extension, First Floor, Br..."
8,Chelsea Sexual Health Clinic,"303 9th Avenue, First Floor, Manhattan\nMonday...","303 9th Avenue, First Floor, Manhattan"
9,Chelsea Express Clinic,"303 9th Avenue, First Floor, Manhattan\nMonday...","303 9th Avenue, First Floor, Manhattan"
10,Central Harlem Sexual Health Clinic,"2238 Fifth Avenue, First Floor, Manhattan\nMon...","2238 Fifth Avenue, First Floor, Manhattan"


In [31]:
def remove_address(info):
    # Split by the first newline character and return the rest
    return info.split('\n', 1)[1] if '\n' in info else info

# Apply the function to the Information column
df_filtered['Information'] = df_filtered['Information'].apply(remove_address)

# Display the cleaned DataFrame
df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Information'] = df_filtered['Information'].apply(remove_address)


Unnamed: 0,Clinic Name,Information,Address
3,Morrisania Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","1309 Fulton Avenue, Second Floor, Bronx"
4,Jamaica Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","90-37 Parsons Boulevard, First Floor, Queens"
5,Corona Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","34-33 Junction Boulevard, First Floor, Queens"
6,Fort Greene Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","295 Flatbush Avenue Extension, Second Floor, B..."
7,Fort Greene Express Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","295 Flatbush Avenue Extension, First Floor, Br..."
8,Chelsea Sexual Health Clinic,"Monday to Friday, 8:30 a.m. to 3:30 p.m.\nThe ...","303 9th Avenue, First Floor, Manhattan"
9,Chelsea Express Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","303 9th Avenue, First Floor, Manhattan"
10,Central Harlem Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.\...","2238 Fifth Avenue, First Floor, Manhattan"


In [32]:
# Function to split Information and extract times
def split_information(info):
    # Split by the first newline character
    parts = info.split('\n', 1)
    # Return a tuple: (first part, rest of the string or empty string)
    return parts[0], parts[1] if len(parts) > 1 else ''

# Apply the function and create two new columns
df_filtered[['Information', 'Info']] = df_filtered['Information'].apply(split_information).apply(pd.Series)

# Display the updated DataFrame
print(df_filtered)

                            Clinic Name  \
3       Morrisania Sexual Health Clinic   
4          Jamaica Sexual Health Clinic   
5           Corona Sexual Health Clinic   
6      Fort Greene Sexual Health Clinic   
7            Fort Greene Express Clinic   
8          Chelsea Sexual Health Clinic   
9                Chelsea Express Clinic   
10  Central Harlem Sexual Health Clinic   

                                      Information  \
3   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
4   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
5   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
6   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
7   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
8        Monday to Friday, 8:30 a.m. to 3:30 p.m.   
9   Monday through Friday, 8:30 a.m. to 3:30 p.m.   
10  Monday through Friday, 8:30 a.m. to 3:30 p.m.   

                                              Address  \
3             1309 Fulton Avenue, Second Floor, Bronx   
4        90-37 Parso

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[['Information', 'Info']] = df_filtered['Information'].apply(split_information).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[['Information', 'Info']] = df_filtered['Information'].apply(split_information).apply(pd.Series)


In [33]:
df_filtered = df_filtered.rename(columns={'Information': 'Times'})
df_filtered

Unnamed: 0,Clinic Name,Times,Address,Info
3,Morrisania Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","1309 Fulton Avenue, Second Floor, Bronx",The clinic may close early once capacity is re...
4,Jamaica Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","90-37 Parsons Boulevard, First Floor, Queens",The clinic may close early once capacity is re...
5,Corona Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","34-33 Junction Boulevard, First Floor, Queens",The clinic may close early once capacity is re...
6,Fort Greene Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","295 Flatbush Avenue Extension, Second Floor, B...",The clinic may close early once capacity is re...
7,Fort Greene Express Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","295 Flatbush Avenue Extension, First Floor, Br...",Also open Tuesdays from 5 p.m. to 7 p.m.\nThe ...
8,Chelsea Sexual Health Clinic,"Monday to Friday, 8:30 a.m. to 3:30 p.m.","303 9th Avenue, First Floor, Manhattan",The clinic may close earlier if capacity is re...
9,Chelsea Express Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","303 9th Avenue, First Floor, Manhattan",Also open Tuesdays from 5 p.m. to 7 p.m. for S...
10,Central Harlem Sexual Health Clinic,"Monday through Friday, 8:30 a.m. to 3:30 p.m.","2238 Fifth Avenue, First Floor, Manhattan",The clinic may close early once capacity is re...


In [34]:
df_filtered.to_csv('health.csv', index=False)