In [39]:
import csv
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
#---------------scrapes washington state bar members contact info ------------------------------
# Two passes:
#   1. walk the paginated directory listing and collect every profile URL;
#   2. fetch each profile and read its labelled <span> fields into one CSV row.

WSBA_BASE_URL = 'https://www.mywsba.org/PersonifyEbusiness/'
# Every profile field is a <span> whose id is this prefix followed by a suffix.
WSBA_SPAN_ID_PREFIX = 'dnn_ctr2977_DNNWebControlContainer_ctl00_lbl'
# Span-id suffixes, in the same order as WSBA_CSV_HEADER.
WSBA_SPAN_SUFFIXES = [
    'MemberName', 'MemberNo', 'LicenseType', 'EligibleToPractice', 'Status',
    'WaAdmitDate', 'Email', 'Address', 'Phone', 'PracticeAreas',
]
WSBA_CSV_HEADER = ['Full_Name','Member_No.', 'License_Type', 'Eligible_to_Practice', 'Status', 'WSBA_Admit_Date', 'Email','address','phone','legal_speciality']
# Without a timeout a single stalled connection would hang the cell forever.
WSBA_REQUEST_TIMEOUT = 30

# create an empty list to store the LegalProfile URLs
legal_profile_urls = []

# specify the range of pages to scrape there are 1734 pages
start_page = 1
end_page = 3

# loop through the pages in the specified range
for page_number in range(start_page, end_page+1):
    page_url = f'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory.aspx?ShowSearchResults=TRUE&EligibleToPractice=Y&Country=USA&Page={page_number}'
    try:
        response = requests.get(page_url, timeout=WSBA_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        # Report and move on so one bad listing page does not abort the run.
        print(f"Error fetching {page_url}: {e}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # each result row is a <tr class="grid-row"> whose onclick attribute
    # carries the relative profile URL between single quotes
    for row in soup.find_all('tr', {'class': 'grid-row'}):
        onclick = row.get('onclick')
        if not onclick:
            continue
        onclick_parts = onclick.split("'")
        if len(onclick_parts) < 2:
            # no quoted URL in this handler; nothing to extract
            continue
        legal_profile_urls.append(WSBA_BASE_URL + str(onclick_parts[1]))

    print(page_number)

# initialize empty list to store scraped data
data = []

# loop through each URL and scrape the data
for url in legal_profile_urls:
    try:
        response = requests.get(url, timeout=WSBA_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # A profile that lacks one of the spans gets an empty cell for that
    # field instead of raising AttributeError on None and losing all rows.
    record = []
    for suffix in WSBA_SPAN_SUFFIXES:
        span = soup.find('span', {'id': WSBA_SPAN_ID_PREFIX + suffix})
        record.append(span.text if span is not None else '')

    data.append(record)
    print(url)

# write the data to a CSV file
#with open('Washington_Bar_Association.csv', 'w', newline='') as file:
with open('test.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # The file is opened in append mode, so only emit the header when the
    # file is still empty; otherwise every run would add a header row in
    # the middle of the data.
    if file.tell() == 0:
        writer.writerow(WSBA_CSV_HEADER)
    writer.writerows(data)

In [None]:
#---------add a state_registered column to the current Washington csv-----

# Load the Washington contacts and stamp every row with state_registered='WA'.
df = pd.read_csv('Washington_Bar_Association.csv').assign(state_registered='WA')

# Save under a new file name so the input file is left as it was.
df.to_csv('Washington_Bar_Association1.csv', index=False)

In [None]:
# Clean the Washington contacts for upload to Sendy.
# NOTE(review): this cell reads and then overwrites the same file. After one
# run the file no longer has a "Full_Name" column, so running the cell a
# second time fails at the first rename (errors="raise").
df = pd.read_csv('Washington_Bar_Association1.csv')

# rename columns; errors="raise" makes a missing source column fail loudly
df = df.rename(columns={"Full_Name": "Name", "legal_speciality": "legal_specialty"}, errors="raise")

# Reformat the bar admit date from m/d/yyyy to dd-mm-yyyy.
# '%m' and '%d' accept both padded and unpadded numbers, so the values can be
# parsed directly; missing dates become NaT and stay missing after strftime.
df['WSBA_Admit_Date'] = pd.to_datetime(df['WSBA_Admit_Date'], format='%m/%d/%Y').dt.strftime('%d-%m-%Y')

# drop rows with no email; copy so the column assignment below acts on an
# independent frame
df = df.dropna(subset=['Email']).copy()

# replace " with ' in the name column to prevent sendy upload error
df['Name'] = df['Name'].str.replace('"', "'", regex=False)

# rename the remaining columns to the shared contact schema and fix the order
df = df.rename(columns={"WSBA_Admit_Date": "bar_admittance_date","address": "mailing_address" })
df = df[['Name',"Email", "Status","bar_admittance_date", "mailing_address", "phone", "state_registered","License_Type","Member_No.", "legal_specialty"]]

display(df)
# Write the updated dataframe back to the CSV file
df.to_csv('Washington_Bar_Association1.csv', index=False)

In [None]:
# Split the Washington contacts CSV into smaller files to be uploaded into sendy.
# Maximum number of rows written to each output file.
chunk_size = 5000

# The chunk files go into this directory; create it if it is missing so
# to_csv does not fail on a fresh checkout.
washington_chunk_dir = Path('./washington_contacts')
washington_chunk_dir.mkdir(parents=True, exist_ok=True)

# read the CSV file into a pandas dataframe
df = pd.read_csv('Washington_Bar_Association1.csv')

# slice the dataframe into consecutive pieces of at most chunk_size rows
chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

# save each chunk to a separate CSV file
for i, chunk in enumerate(chunks):
    chunk.to_csv(washington_chunk_dir / f'Washington_Bar_Association_chunk_{i}.csv', index=False)

In [None]:
#---------------------scrapes idaho state bar members contact info-----------------------
# Three stages:
#   1. drive a browser to submit the roster form and capture the result HTML;
#   2. collect the profile links from the roster table;
#   3. fetch each profile, read its <dt>/<dd> pairs and write them to data.csv.

# Launch a new Chrome browser instance
driver = webdriver.Chrome()
try:
    # Navigate to the website
    url = 'https://apps.isb.idaho.gov/licensing/attorney_roster.cfm'
    driver.get(url)

    # Find the submit button via its CSS selector
    button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="submit"]')))

    # Click the button
    button.click()

    # Wait for the page to load
    time.sleep(3)

    # Get the page source after clicking the button
    page_source = driver.page_source
finally:
    # Always close the browser, even if the wait times out or the click fails
    driver.quit()

#list of idaho lawyers profile urls
lawyer_profile_urls = []

# Scrape all hrefs from the table using Beautiful Soup
soup = BeautifulSoup(page_source, 'html.parser')
table = soup.find('table', {'class': 'table table-striped dataTable no-footer table-hover'})
if table is None:
    # Fail with a clear message instead of AttributeError on None below.
    raise RuntimeError('Idaho roster table not found in the page source')

for link in table.find_all('a'):
    href = link.get('href')
    if href:
        lawyer_profile_urls.append('https://apps.isb.idaho.gov/licensing/' + str(href))
# Report the number of URLs actually collected (anchors without href are skipped)
count = len(lawyer_profile_urls)
print(f"added {count} lawyers profile urls to lawyer profile url list")

contact_info = []

for url in lawyer_profile_urls:
    try:
        # The request lives inside the try so a network error skips this one
        # profile instead of aborting the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        data = {}

        # Extract the full name from the h4 element with class 'panel-title'
        full_name = soup.find('h4', {'class': 'panel-title'}).text.strip()
        data['full_name'] = full_name

        for dt in soup.find_all('dt'):
            key = dt.text.strip()
            if key == 'Mailing Address':
                # If the dt text is 'Mailing Address', use the text of the next 2 siblings
                value = dt.find_next_sibling('dd').text.strip() + ", " + dt.find_next_sibling('dd').find_next_sibling('dd').text.strip()
            else:
                value = dt.find_next_sibling('dd').text.strip()

            data[key] = value

        contact_info.append(data)

    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue
    # Progress counter reflects successfully scraped profiles only
    count1 = len(contact_info)
    print(f"added {count1} lawyers contact information to contact info list")


if contact_info:
    # Profiles do not necessarily expose the same set of <dt> labels, so build
    # the header from the union of all keys (first-seen order) and let
    # DictWriter place each value under its own column. Writing row.values()
    # under the first row's keys would shift values into the wrong columns.
    fieldnames = list(dict.fromkeys(key for row in contact_info for key in row))
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(contact_info)

    print('sucessfully wrote Idaho lawyer contact info to csv')
else:
    # Nothing scraped: leave any existing data.csv untouched
    print('no Idaho lawyer contact info scraped; data.csv not written')

In [None]:
#---------add a 'state_registered' column to the current Idaho csv-----

# Load the scraped Idaho contacts and stamp every row with state_registered='ID'.
df = pd.read_csv('data.csv').assign(state_registered='ID')

# Save under a new file name so data.csv is left as it was.
df.to_csv('Idaho_Bar_Association1.csv', index=False)

In [None]:
# Normalise the Idaho contacts: rename columns, reformat the admittance date
# and keep only the columns of interest in a fixed order.

# scraped column label -> output column name
idaho_column_renames = {
    "full_name": "Name",
    "Admittance Date": "bar_admittance_date",
    "Firm": "firm",
    "Mailing Address": "mailing_address",
    "Phone": "phone",
    "Phone Ext": "phone_ext",
    "Bar Email Address": "Email",
    "Website Address": "website_address",
    "Court eService Email": "court_eService_email",
}
idaho_output_columns = ['Name',"Email","Status","bar_admittance_date",'firm','mailing_address','phone','website_address','court_eService_email','state_registered','phone_ext']

# errors="raise" makes a missing source column fail here rather than later
df = pd.read_csv('Idaho_Bar_Association1.csv').rename(columns=idaho_column_renames, errors="raise")

# parse the mm/dd/yyyy admittance dates, then render them as dd-mm-yyyy text
idaho_admit_dates = pd.to_datetime(df['bar_admittance_date'], format='%m/%d/%Y')
df = df.assign(bar_admittance_date=idaho_admit_dates.dt.strftime('%d-%m-%Y'))[idaho_output_columns]

# Writing the result back is currently disabled:
# df.to_csv('Idaho_Bar_Association1.csv', index=False)

display(df)

In [None]:
# Put the Idaho contact columns in upload order, drop contacts without an
# email address and save the result.
idaho_contact_columns = ['Name',"Email","Status","bar_admittance_date",'firm','mailing_address','phone','phone_ext','website_address','court_eService_email','state_registered']

df = pd.read_csv('Idaho_Bar_Association_contacts.csv') #source file deleted
df = df[idaho_contact_columns].dropna(subset=['Email'])

# swap double quotes in names for single quotes
df['Name'] = df['Name'].str.replace('"', "'")

df.to_csv('Idaho_Bar_Association_contacts1.csv', index=False)
display(df)

In [None]:
# Split the Idaho contacts CSV into smaller files to be uploaded into sendy.
# Maximum number of rows written to each output file.
chunk_size = 968

# The chunk files go into this directory; create it if it is missing so
# to_csv does not fail on a fresh checkout.
idaho_chunk_dir = Path('./idaho_contacts')
idaho_chunk_dir.mkdir(parents=True, exist_ok=True)

# read the CSV file into a pandas dataframe
df = pd.read_csv('Idaho_Bar_Association_contacts1.csv')

# slice the dataframe into consecutive pieces of at most chunk_size rows
chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

# save each chunk to a separate CSV file
for i, chunk in enumerate(chunks):
    chunk.to_csv(idaho_chunk_dir / f'Idaho_Bar_Association_chunk_{i}.csv', index=False)

In [None]:
#---------------scrape state of Mississippi bar directory------------------------------
# Load the directory search page in a browser, click the search button, then
# parse the resulting HTML into one dict of label -> value per lawyer section.

# Launch a new Chrome browser instance
driver = webdriver.Chrome()
try:
    # Navigate to the website
    # NOTE(review): the `response=` query parameter looks like a captured
    # one-time token and has presumably expired; confirm before relying on it.
    url = 'https://www.msbar.org/lawyer-directory.aspx?type=7&term=A&response=03AKH6MRH19wTOAKE1f1ufR2dIeNH3AWn3nk0p90DQVE1uNvqGKh7F3cwNrLUkk0wxqcg3p9NYERDDSavMoaorNFMWzNFHzQnQxYrzww9Qa3wpDbeq-rF4STgHT75NuZsq1rR9LsfavuG-4uymLQMxsSfUe3SnXhjr50fmDr9P1AxbTx7BT47Kxy6f93IUrfK9a2c9ptmu6LMKJLZ0jeGoIh4oydnyzpljghoG95Kju6fP6QXox-uYBhP9D4F-6BtkEKTXstXn__em503Hngv7ubN7DwjN5RlQjE7gri9NSyfQLxuRzzCWE55rkfElTs-itIEeku51-t7vABggRK-JE7BFwBzswtPZXD7lMJSpIHnNreFdh560cNP7ABCDghcmo2Fs4cqf_ABYY0U4fxWQU3p9opYFv1pfkxd8Nt88V8EzoRNy4RFjgTc0IpQA4ZtXEG-IckSd7XyFm1a0hPBd1wuQT-mHpPFppVMfsVW_BECn8jSVjgfr7FutCKtbnb-ar3mif20eTWk_yry70LGGhP9wehy_ZRvRDGwlpcb3iSxZMPGPTxHKdeEAKWxa9Gd4H5BQ6RzNlhQyluDUvR0idKlvm_3vO35Dfw'
    driver.get(url)

    # Find the button using its element id
    button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'searchbutton')))

    # Click the button
    button.click()

    # Wait for the page to load
    time.sleep(3)

    # Get the page source after clicking the button
    page_source = driver.page_source
finally:
    # Always close the browser, even if the wait times out or the click fails
    driver.quit()

# Parse the captured HTML using Beautiful Soup
soup = BeautifulSoup(page_source, 'html.parser')

# Find all the sections with class 'LawyerInformation cf'
sections = soup.find_all('section', {'class': 'LawyerInformation cf'})

# Initialize a list to store the dictionaries
results = []

# Loop through each section and extract the label-value pairs
for section in sections:
    data_dict = {}
    label_holders = section.find_all('div', {'class': 'LabelHolder'})
    data_holders = section.find_all('div', {'class': 'DataHolder'})
    # Labels and values are paired by position; zip stops at the shorter list
    for label_holder, data_holder in zip(label_holders, data_holders):
        label = label_holder.get_text(strip=True)
        data = data_holder.get_text(strip=True)
        data_dict[label] = data
    results.append(data_dict)

# Print the results
print(results)

In [None]:
# Fetch a single ISBA (reliaguide) lawyer profile and print the first
# 'gx-link' anchor together with the full name found in the tooltip markup
# that precedes it.
url = "https://isba.reliaguide.com/lawyer/60601-IL-Amy-Richards-138552"
# A timeout keeps a stalled connection from hanging the cell forever.
response = requests.get(url, timeout=30)

soup = BeautifulSoup(response.content, "html.parser")
link = soup.find("a", {"class": "gx-link"})


if link:
    href = link.get("href")
    print(f"Link: {href}")

    # Each lookup can return None when the expected markup is absent, so walk
    # the chain step by step instead of calling .find() on a None result.
    tooltip_div = link.find_previous_sibling("div", {"class": "ant-tooltip-inner"})
    full_name_div = None
    if tooltip_div is not None:
        full_name_div = tooltip_div.find("div", {"class": "gx-text-center gx-mb-0"})
    full_name_p = None
    if full_name_div is not None:
        full_name_p = full_name_div.find("p", {"class": "gx-mb-0 gx-text-none gx-fs-xl gx-font-weight-light"})

    if full_name_p is not None:
        full_name = full_name_p.text.strip()
        print(f"Full name: {full_name}")
    else:
        print("Full name: not found")

print(link)

In [None]:
#-----------------currently bypasses recaptcha for maryland state bar but does not scrape due to ToS----------------
# NOTE(review): automating the reCAPTCHA checkbox may itself conflict with
# the site's terms of service; confirm before running this cell.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize the WebDriver and navigate to the website
driver = webdriver.Chrome()
try:
    driver.get("https://www.msba.org/about/member-directory/?fwp_member_directory_bar_admission_state=maryland")

    # Wait for the reCaptcha checkbox to appear and click it
    wait = WebDriverWait(driver, 10)
    recaptcha_frame = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "iframe[title='reCAPTCHA']")))
    driver.switch_to.frame(recaptcha_frame)
    recaptcha_checkbox = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.recaptcha-checkbox-border")))
    ActionChains(driver).move_to_element(recaptcha_checkbox).click().perform()

    # Switch back to the main frame and wait for the submit button to appear and click it
    driver.switch_to.default_content()
    submit_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button#submitBtn[type='submit']")))
    submit_button.click()
finally:
    # Close the WebDriver, even when one of the waits above times out, so no
    # browser process is left running
    driver.quit()

In [None]:


#-----------------wisconsin------------------------
# Walk the Wisconsin bar search results and collect the profile link of every
# result into legal_profile_links.
import requests
from bs4 import BeautifulSoup

# create an empty list to store the LegalProfile URLs
legal_profile_links = []

# StartRow offsets to request; the loop advances in steps of 10
# (presumably one page holds 10 results; confirm against the site)
start_number = 0
end_number = 10

count = 0
for page_num in range(start_number, end_number+1,10):
    # URL of the search-result page starting at row page_num
    url = f'https://www.wisbar.org/Pages/AdvancedLawyerSearch-Updated.aspx?refinementfilters=%27SBW-ProfileState%3a(%22wi%22)%27&state=wi&sourceid=%279f1b1dca-5f9d-406e-9067-6b5597699787%27&querytemplatepropertiesurl=%27spfile%3a%2f%2fwebroot%2fqueryparametertemplate.xml%27&selectproperties=%27path%2csbw-profilelastname%2csbw-profilefullname%2csbw-profilefirstname%2csbw-profilestate%2csbw-profilemiddlename%2csbw-profilecompany%2csbw-profilecity%2csbw-profileprefixname%27&StartRow={page_num}'

    # Send GET request to the website and parse the HTML using Beautiful Soup.
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all div elements with class SearchResult
    search_results = soup.find_all('div', class_='SearchResult')

    # Take the first link of each result; skip results that have no anchor or
    # no href instead of failing on a None lookup
    for result in search_results:
        anchor = result.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        legal_profile_links.append(anchor['href'])

    # Report how many links were really collected, not an assumed 10 per page
    count = len(legal_profile_links)
    print(f"appended {count} contact profiles to legal_profile_links")

print(legal_profile_links)

In [None]:
# Visit every URL in legal_profile_links with a browser, click the reCAPTCHA
# checkbox on the page, then print the text of each <li class="Name2">.
# NOTE(review): automating the reCAPTCHA checkbox may conflict with the
# site's terms of service; confirm before running this cell.
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Set up the webdriver
options = webdriver.ChromeOptions()
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3')
driver = webdriver.Chrome(options=options)

try:
    # Loop over each URL
    for url in legal_profile_links:
        # Open the URL in the webdriver
        driver.get(url)

        # Add a delay before clicking the checkbox
        time.sleep(random.uniform(2.0, 4.0))

        # Find the checkbox with iframe tag and title='reCAPTCHA'
        iframe = driver.find_element(By.XPATH, '//iframe[@title="reCAPTCHA"]')
        driver.switch_to.frame(iframe)

        # Wait for the checkbox to become clickable
        checkbox = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[@class="recaptcha-checkbox goog-inline-block recaptcha-checkbox-unchecked rc-anchor-checkbox"]')))

        # Add a delay before clicking the checkbox
        time.sleep(random.uniform(2.0, 4.0))

        # Move the mouse over the checkbox before clicking
        action = webdriver.ActionChains(driver)
        action.move_to_element(checkbox).perform()

        # Click on the checkbox using JavaScript
        driver.execute_script("arguments[0].click();", checkbox)

        # Add a delay after clicking the checkbox
        time.sleep(random.uniform(2.0, 4.0))

        # Switch back to the main frame and use BeautifulSoup to scrape the text from the li tag with class=Name2
        driver.switch_to.default_content()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        name_list = []
        for name in soup.find_all('li', class_='Name2'):
            name_list.append(name.text)

        # Print the scraped text
        print("Here are the names we found on the page:")
        for name in name_list:
            print(name)
        print()

        # Add a delay before moving on to the next URL
        time.sleep(random.uniform(2.0, 4.0))
finally:
    # Close the webdriver even if a page raises (missing iframe, wait
    # timeout), so no browser process is left running
    driver.quit()
    
