## Final Code

In [None]:
#final code
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

def scrapeWestchester():
    """Scrape faculty names from the Westchester anesthesiology residency page.

    Downloads the page with ``requests``, locates the faculty tab by its
    element id, and reads every ``<strong>`` entry whose text contains a
    comma. The text before the first comma is treated as the person's name
    (the remainder is presumably credentials such as "MD" -- confirm against
    the live page). The last word of the name becomes the last name and
    everything before it the first name.

    Returns:
        pandas.DataFrame: one row per faculty entry with the columns
        "First Name" and "Last Name".

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an error status.
        ValueError: if the faculty container is not found in the page.
    """
    url = "https://www.westchestermedicalcenter.org/anesthesiology-residency-program"
    faculty_div_id = "cpsys_DynamicTab_20cd7230-3562-4f30-a2f9-54b2b3ddb29f_4"

    # Without a timeout an unresponsive host would block the script forever.
    response = requests.get(url, timeout=30)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    faculty_div = soup.find("div", {"id": faculty_div_id})
    if faculty_div is None:
        raise ValueError(
            f"Faculty section (id={faculty_div_id!r}) not found at {url}"
        )

    first_names = []
    last_names = []

    for strong_tag in faculty_div.find_all("strong"):
        full_name = strong_tag.text.strip()
        # Entries without a comma are not treated as faculty names.
        if "," not in full_name:
            continue

        # Only the text before the first comma is the name itself.
        name_parts = full_name.split(",", 1)[0].split()
        if not name_parts:
            # Nothing before the comma: no usable name, skip the entry.
            continue

        # Derive both parts from the name alone, so the result does not
        # depend on how long the text after the comma happens to be.
        first_names.append(" ".join(name_parts[:-1]))
        last_names.append(name_parts[-1])

    return pd.DataFrame({"First Name": first_names, "Last Name": last_names})

def scrapeUpstate():
    """Scrape faculty names from the Upstate anesthesiology faculty page.

    Loads the page in a Chrome WebDriver session, parses the rendered HTML
    with BeautifulSoup, and walks every ``<h2>`` heading inside the
    ``col-md-9`` div. When the element right after a heading is a ``<div>``,
    each link inside it is read as one faculty member: the first
    whitespace-separated word of the link text is the first name and the
    second word (with any comma removed) is the last name. Links with fewer
    than two words are ignored.

    NOTE(review): taking the second word as the last name misreads names
    with a middle name or initial -- verify against the live page.

    Returns:
        pandas.DataFrame: unique (first, last) pairs in page order with the
        columns "First Name" and "Last Name"; the columns are present even
        when no names are found.

    Raises:
        ValueError: if the ``col-md-9`` container is not found in the page.
    """
    url = 'https://www.upstate.edu/anesthesiology/about-us/faculty.php'

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        html_content = driver.page_source
    finally:
        # Always close the browser, even if loading the page fails.
        driver.quit()

    soup = BeautifulSoup(html_content, 'html.parser')

    faculty_section = soup.find('div', class_="col-md-9")
    if faculty_section is None:
        raise ValueError(f"Faculty section (div.col-md-9) not found at {url}")

    faculty_names = []
    seen = set()  # (first, last) pairs already recorded

    for heading in faculty_section.find_all('h2'):
        listing = heading.find_next_sibling()
        # Only headings directly followed by a div carry a faculty list.
        if listing is None or listing.name != 'div':
            continue

        for link in listing.find_all('a'):
            name_parts = link.text.strip().split()
            if len(name_parts) < 2:
                continue

            first_name = name_parts[0]
            # Drop the comma that separates the name from what follows it.
            last_name = name_parts[1].replace(',', '')

            # De-duplicate while keeping first-seen order, so the output is
            # the same from run to run.
            key = (first_name, last_name)
            if key in seen:
                continue
            seen.add(key)
            faculty_names.append(
                {'First Name': first_name, 'Last Name': last_name}
            )

    # Explicit columns keep the frame's shape stable when nothing was found.
    return pd.DataFrame(faculty_names, columns=['First Name', 'Last Name'])

def scrapeNewMexico():
    """Scrape anesthesiology names from the UNM Health Sciences directory.

    Opens the directory in a Chrome WebDriver session, selects every
    element carrying the full set of directory-item CSS classes including
    ``somanesthesiology``, and reads its ``data-first-name`` and
    ``data-last-name`` attributes.

    NOTE(review): the elements are queried right after ``driver.get``
    returns; if the directory is filled in by script after load, an
    explicit wait may be needed -- confirm against the live page.

    Returns:
        pandas.DataFrame: one row per matched element with the columns
        "First Name" and "Last Name".
    """
    url = "https://hsc.unm.edu/directory/"
    selector = (
        ".col-sm-6.col-md-4.col-lg-3.dirItem.blockImage.all.somanesthesiology"
    )

    first_names = []
    last_names = []

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Attributes must be read while the session is still open.
        for element in driver.find_elements(By.CSS_SELECTOR, selector):
            first_names.append(element.get_attribute("data-first-name"))
            last_names.append(element.get_attribute("data-last-name"))
    finally:
        # Always close the browser, even if the lookup above fails.
        driver.quit()

    return pd.DataFrame({"First Name": first_names, "Last Name": last_names})


def main(output_path="scraped_data_summed.xlsx"):
    """Run all three scrapers and write the results to one Excel workbook.

    Each scraper's DataFrame is written to its own sheet. Before writing,
    every frame is given the same four columns -- "First Name",
    "Last Name", "Email", "Error" -- so that "Email" and "Error" exist as
    empty columns in each sheet.

    Args:
        output_path: Path of the workbook to create. Defaults to
            "scraped_data_summed.xlsx".
    """
    columns = ["First Name", "Last Name", "Email", "Error"]

    # Insertion order fixes both the scraping order and the sheet order.
    sheets = {
        "Westchester Medical Center": scrapeWestchester(),
        "Upstate Medical University": scrapeUpstate(),
        "University of New Mexico": scrapeNewMexico(),
    }

    with pd.ExcelWriter(output_path) as writer:
        for sheet_name, frame in sheets.items():
            # reindex adds the missing columns (as empty cells) without
            # concatenating against an empty DataFrame.
            frame.reindex(columns=columns).to_excel(
                writer, sheet_name=sheet_name, index=False
            )

# Run the scrape only when executed directly (as a script or notebook cell),
# so that importing this module does not launch browsers or hit the network.
if __name__ == "__main__":
    main()
