In [2]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Links sheet; its "agent_url" column holds the profile pages visited below.
df = pd.read_csv("kw_rerun_links.csv")

# Each distinct URL once, so a profile listed several times is scraped once.
url_list = df.loc[:, "agent_url"].unique()
def get_driver():
    """Start a new Firefox WebDriver session and return it.

    The browser is launched with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` command-line arguments.
    """
    firefox_options = webdriver.FirefoxOptions()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        firefox_options.add_argument(flag)
    return webdriver.Firefox(options=firefox_options)


def collect_agent_contact_details(driver):
    """Scrape the agent's contact details from the currently loaded page.

    Every lookup is best-effort: a field whose element is missing or cannot
    be read comes back as an empty string instead of raising.

    Args:
        driver: Selenium WebDriver already navigated to an agent page.

    Returns:
        dict with the keys ``agent_email``, ``mobile_number``,
        ``office_number`` and ``agent_website``.
    """

    def text_by_class(class_name):
        # Text of the first element carrying class_name, or "" if unavailable.
        try:
            return driver.find_element(By.CLASS_NAME, class_name).text
        except WebDriverException:
            return ""

    def fact_text(aria_label):
        # Text of the first fact body whose aria-label matches, or "" if none.
        try:
            for fact in facts:
                if fact.get_attribute("aria-label") == aria_label:
                    return fact.text
        except WebDriverException:
            # e.g. the element went stale while the page re-rendered.
            return ""
        return ""

    # E-mail and website share the same "factBody" class and can only be told
    # apart by their aria-label, so collect all fact bodies once up front.
    try:
        agent_info = driver.find_element(By.CLASS_NAME, "AgentInformation")
        facts = agent_info.find_elements(
            By.CLASS_NAME, "AgentInformation__factBody"
        )
    except WebDriverException:
        facts = []

    return {
        "agent_email": fact_text("Agent E-mail"),
        "mobile_number": text_by_class("AgentInformation__phoneMobileNumber"),
        "office_number": text_by_class("AgentInformation__phoneOfficeNumber"),
        "agent_website": fact_text("Agent Website"),
    }


def get_agent_details(driver):
    """Scrape licence, bio, service-area and team details from the page.

    Every lookup is best-effort: a field whose element is missing or cannot
    be read comes back as an empty string instead of raising.

    Args:
        driver: Selenium WebDriver already navigated to an agent page.

    Returns:
        dict with the keys ``agent_licenses``, ``agent_bio``,
        ``serviceAreas``, ``agent_team_name``, ``agent_team_info`` and
        ``logo_url``.
    """

    def text_by_class(class_name):
        # Text of the first element carrying class_name, or "" if unavailable.
        try:
            return driver.find_element(By.CLASS_NAME, class_name).text
        except WebDriverException:
            return ""

    # The team logo is the image nested inside the team avatar element.
    try:
        avatar = driver.find_element(By.CLASS_NAME, "AgentContent__teamAvatar")
        image = avatar.find_element(By.CLASS_NAME, "KWImage__image")
        # get_attribute gives None when the attribute is absent; keep the
        # "empty string means missing" convention used by the other fields.
        logo_url = image.get_attribute("src") or ""
    except WebDriverException:
        logo_url = ""

    return {
        "agent_licenses": text_by_class("AgentContent__licenses"),
        "agent_bio": text_by_class("AgentContent__bio"),
        "serviceAreas": text_by_class("AgentContent__serviceAreas"),
        "agent_team_name": text_by_class("AgentContent__teamName"),
        # The team text spans several lines; flatten it to one
        # comma-separated line.
        "agent_team_info": text_by_class("AgentContent__teamText").replace(
            "\n", ", "
        ),
        "logo_url": logo_url,
    }


def social_media_details(driver):
    """Collect the agent's social-media links from the currently loaded page.

    Args:
        driver: Selenium WebDriver already navigated to an agent page.

    Returns:
        A 2-tuple ``(social_media_dict, social_media_links_str)``:

        * ``social_media_dict`` maps ``facebook``, ``instagram``, ``twitter``
          and ``linkedin`` to the first link containing that word, or ``""``
          when there is none.
        * ``social_media_links_str`` is every collected link joined by ``|``
          (``""`` when no links were found).
    """
    try:
        container = driver.find_element(
            By.CLASS_NAME, "AgentInformation__socialMedia"
        )
        anchors = container.find_elements(By.CLASS_NAME, "link")
        hrefs = [anchor.get_attribute("href") for anchor in anchors]
    except WebDriverException:
        hrefs = []

    # get_attribute returns None for an element without an href; such entries
    # would make both the substring tests and the "|".join below raise
    # TypeError, so drop them here.
    social_media_links = [href for href in hrefs if href is not None]

    def first_link_containing(word):
        # First collected link that mentions word, or "" when none does.
        return next((link for link in social_media_links if word in link), "")

    social_media_dict = {
        network: first_link_containing(network)
        for network in ("facebook", "instagram", "twitter", "linkedin")
    }
    social_media_links_str = "|".join(social_media_links)
    return social_media_dict, social_media_links_str


def collect_other_info(driver):
    """Scrape the labelled content sections of the currently loaded page.

    For each wanted label, the first ``AgentContent__section`` whose text
    contains that label is selected and the text of its
    ``AgentContent__sectionText`` child is returned. A label with no matching
    section, or whose section cannot be read, yields an empty string.

    Args:
        driver: Selenium WebDriver already navigated to an agent page.

    Returns:
        dict with the keys ``Market_Cente``, ``Languages`` and
        ``Specialties_Designations``.
    """
    # Result key -> label text that identifies the section on the page.
    wanted = {
        "Market_Cente": "Market Center",
        "Languages": "Languages",
        "Specialties_Designations": "Specialties and Designations",
    }
    other_info = dict.fromkeys(wanted, "")
    unresolved = dict(wanted)

    # Single pass: every .text read is a WebDriver round trip, so fetch each
    # section's text once instead of once per wanted label.
    for section in driver.find_elements(By.CLASS_NAME, "AgentContent__section"):
        if not unresolved:
            break
        try:
            section_text = section.text
        except WebDriverException:
            continue
        for key, label in list(unresolved.items()):
            if label not in section_text:
                continue
            # Only the first matching section counts, even if reading its
            # body fails.
            del unresolved[key]
            try:
                other_info[key] = section.find_element(
                    By.CLASS_NAME, "AgentContent__sectionText"
                ).text
            except WebDriverException:
                other_info[key] = ""
    return other_info

def wait_to_page_load(driver, timeout=3):
    """Wait until the agent name element is present, reporting the outcome.

    The function never raises for WebDriver errors; it prints what happened
    and lets the caller carry on with whatever did load.

    Args:
        driver: Selenium WebDriver that has just been sent to an agent page.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        The same ``driver`` that was passed in.
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "AgentContent__name"))
        )
    except TimeoutException:
        print("Timeout happened no page load")
    except WebDriverException as error:
        # Not a timeout (e.g. the session broke); say so instead of reporting
        # a misleading "timeout".
        print("Page load check failed:", error)
    else:
        print("Page load happened")
    return driver

# --- Scrape run --------------------------------------------------------------
# Visits url_list[url_count:END_INDEX] one page at a time and appends one CSV
# row per page. url_count is printed after each row has been written.

END_INDEX = 36000        # exclusive upper bound of the url_list slice
PAGES_PER_DRIVER = 100   # pages loaded by one browser before it is replaced
OUTPUT_CSV = "kw_data-emailSelenium_35k_36k.csv"
OUTPUT_COLUMNS = [
    'agent_url', 'agent_name', 'agent_role', 'location', 'agent_dp', 'tag',
    'agent_email', 'mobile_number', 'office_number', 'agent_license',
    'agent_bio', 'serviceAreas', 'office_name', 'office_address', 'logo_url',
    'facebook', 'instagram', 'twitter', 'linkedin', 'Market_Cente',
    'agent_language', 'Specialties_Designations', 'social_media_links_str',
    'agent_website',
]


def _text_by_class(driver, class_name):
    """Return the text of the first element with class_name, or "" if unavailable."""
    try:
        return driver.find_element(By.CLASS_NAME, class_name).text
    except WebDriverException:
        return ""


url_count = 35321
driver_count = 0
driver = get_driver()
try:
    for url in url_list[url_count:END_INDEX]:
        print(url)
        if driver_count >= PAGES_PER_DRIVER:
            # Periodically replace the browser with a fresh session.
            print("need to close driver")
            driver.quit()
            driver_count = 0
            driver = get_driver()
            print("new driver initialize****************")
        driver.get(url)
        driver = wait_to_page_load(driver)

        agent_name = _text_by_class(driver, "AgentContent__name")

        # Short pause before reading the avatar, whose URL is taken from the
        # element's inline background-image style.
        time.sleep(0.5)
        try:
            avatar_style = driver.find_element(
                By.CLASS_NAME, "AvatarImage__bg"
            ).get_attribute("style") or ""
        except WebDriverException:
            avatar_style = ""
        profile_url = avatar_style.replace('background-image: url("', '').replace('");', '')

        tag = _text_by_class(driver, "pill")
        role = _text_by_class(driver, "AgentContent__team")
        location = _text_by_class(driver, "AgentContent__location")

        contact_dict = collect_agent_contact_details(driver)
        agent_details = get_agent_details(driver)
        social_media_dict, social_media_links_str = social_media_details(driver)
        other_info = collect_other_info(driver)

        data_dict = {
            'agent_url': url,
            'agent_name': agent_name,
            'agent_role': role,
            'location': location,
            'agent_dp': profile_url,
            'tag': tag,
            'agent_email': contact_dict['agent_email'],
            'mobile_number': contact_dict['mobile_number'],
            'office_number': contact_dict['office_number'],
            'agent_license': agent_details['agent_licenses'],
            'agent_bio': agent_details['agent_bio'],
            'serviceAreas': agent_details['serviceAreas'],
            'office_name': agent_details['agent_team_name'],
            'office_address': agent_details['agent_team_info'],
            'logo_url': agent_details['logo_url'],
            'facebook': social_media_dict['facebook'],
            'instagram': social_media_dict['instagram'],
            'twitter': social_media_dict['twitter'],
            'linkedin': social_media_dict['linkedin'],
            'Market_Cente': other_info['Market_Cente'],
            'agent_language': other_info['Languages'],
            'Specialties_Designations': other_info['Specialties_Designations'],
            'social_media_links_str': social_media_links_str,
            'agent_website': contact_dict['agent_website'],
        }
        data_df = pd.DataFrame(data_dict, index=[0], columns=OUTPUT_COLUMNS)

        # Append immediately so finished rows survive a crash; the header is
        # written only when the file is still empty.
        with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as f:
            data_df.to_csv(f, mode='a', header=f.tell()==0)

        print(url_count, url)
        url_count += 1
        driver_count += 1
        print("***********************************")
finally:
    # Always shut the browser down, including when a page load raises or the
    # run is interrupted, so no headless Firefox process is left behind.
    driver.quit()


https://kw.com/agent/UPA-6587385240579362818-2
Page load happened
35321 https://kw.com/agent/UPA-6587385240579362818-2
***********************************
https://kw.com/agent/UPA-6587385206672347138-1
Page load happened
35322 https://kw.com/agent/UPA-6587385206672347138-1
***********************************
https://kw.com/agent/UPA-6587385227768643585-1
Page load happened
35323 https://kw.com/agent/UPA-6587385227768643585-1
***********************************
https://kw.com/agent/UPA-6779818420050759680-4
Page load happened
35324 https://kw.com/agent/UPA-6779818420050759680-4
***********************************
https://kw.com/agent/UPA-6587385099161206786-9
Page load happened
35325 https://kw.com/agent/UPA-6587385099161206786-9
***********************************
https://kw.com/agent/UPA-6694747584536612864-6
Page load happened
35326 https://kw.com/agent/UPA-6694747584536612864-6
***********************************
https://kw.com/agent/UPA-6709469866950262784-1
Timeout happened no pag

Page load happened
35374 https://kw.com/agent/UPA-6592942852812247042-0
***********************************
https://kw.com/agent/UPA-6587385215011950592-4
Page load happened
35375 https://kw.com/agent/UPA-6587385215011950592-4
***********************************
https://kw.com/agent/UPA-6773341265498030080-7
Page load happened
35376 https://kw.com/agent/UPA-6773341265498030080-7
***********************************
https://kw.com/agent/UPA-6839657467795927040-4
Page load happened
35377 https://kw.com/agent/UPA-6839657467795927040-4
***********************************
https://kw.com/agent/UPA-6834598352654434304-3
Page load happened
35378 https://kw.com/agent/UPA-6834598352654434304-3
***********************************
https://kw.com/agent/UPA-6587385209197694976-9
Page load happened
35379 https://kw.com/agent/UPA-6587385209197694976-9
***********************************
https://kw.com/agent/UPA-6587385325586472963-4
Page load happened
35380 https://kw.com/agent/UPA-6587385325586472963-

Page load happened
35427 https://kw.com/agent/UPA-6587385194619371525-0
***********************************
https://kw.com/agent/UPA-6752337614438875136-0
Page load happened
35428 https://kw.com/agent/UPA-6752337614438875136-0
***********************************
https://kw.com/agent/UPA-6826912303257247744-1
Page load happened
35429 https://kw.com/agent/UPA-6826912303257247744-1
***********************************
https://kw.com/agent/UPA-6587385317406228483-9
Page load happened
35430 https://kw.com/agent/UPA-6587385317406228483-9
***********************************
https://kw.com/agent/UPA-6587385254716645379-9
Page load happened
35431 https://kw.com/agent/UPA-6587385254716645379-9
***********************************
https://kw.com/agent/UPA-6587385388364218375-3
Page load happened
35432 https://kw.com/agent/UPA-6587385388364218375-3
***********************************
https://kw.com/agent/UPA-6802980405136650240-2
Timeout happened no page load
35433 https://kw.com/agent/UPA-680298040

Page load happened
35480 https://kw.com/agent/UPA-6587385075855822855-2
***********************************
https://kw.com/agent/UPA-6587385163971756032-7
Page load happened
35481 https://kw.com/agent/UPA-6587385163971756032-7
***********************************
https://kw.com/agent/UPA-6785239313955692544-7
Page load happened
35482 https://kw.com/agent/UPA-6785239313955692544-7
***********************************
https://kw.com/agent/UPA-6587385109094031362-0
Page load happened
35483 https://kw.com/agent/UPA-6587385109094031362-0
***********************************
https://kw.com/agent/UPA-6592263194613387265-3
Page load happened
35484 https://kw.com/agent/UPA-6592263194613387265-3
***********************************
https://kw.com/agent/UPA-6587385245366767621-4
Page load happened
35485 https://kw.com/agent/UPA-6587385245366767621-4
***********************************
https://kw.com/agent/UPA-6587385181154463745-9
Page load happened
35486 https://kw.com/agent/UPA-6587385181154463745-

Page load happened
35533 https://kw.com/agent/UPA-6587385410418577413-9
***********************************
https://kw.com/agent/UPA-6841408270704152576-0
Page load happened
35534 https://kw.com/agent/UPA-6841408270704152576-0
***********************************
https://kw.com/agent/UPA-6721896639208304640-5
Page load happened
35535 https://kw.com/agent/UPA-6721896639208304640-5
***********************************
https://kw.com/agent/UPA-6587385204330627074-6
Page load happened
35536 https://kw.com/agent/UPA-6587385204330627074-6
***********************************
https://kw.com/agent/UPA-6722575547460886528-9
Page load happened
35537 https://kw.com/agent/UPA-6722575547460886528-9
***********************************
https://kw.com/agent/UPA-6587385317448171524-8
Page load happened
35538 https://kw.com/agent/UPA-6587385317448171524-8
***********************************
https://kw.com/agent/UPA-6587385230398488578-3
Page load happened
35539 https://kw.com/agent/UPA-6587385230398488578-

Page load happened
35586 https://kw.com/agent/UPA-6587385168545071104-4
***********************************
https://kw.com/agent/UPA-6716793037611839488-1
Page load happened
35587 https://kw.com/agent/UPA-6716793037611839488-1
***********************************
https://kw.com/agent/UPA-6587385407063195650-6
Page load happened
35588 https://kw.com/agent/UPA-6587385407063195650-6
***********************************
https://kw.com/agent/UPA-6587385288136515588-9
Page load happened
35589 https://kw.com/agent/UPA-6587385288136515588-9
***********************************
https://kw.com/agent/UPA-6587385101852282887-4
Page load happened
35590 https://kw.com/agent/UPA-6587385101852282887-4
***********************************
https://kw.com/agent/UPA-6629058433711697920-4
Page load happened
35591 https://kw.com/agent/UPA-6629058433711697920-4
***********************************
https://kw.com/agent/UPA-6587385223426015236-1
Timeout happened no page load
35592 https://kw.com/agent/UPA-658738522

Page load happened
35639 https://kw.com/agent/UPA-6587385236226633730-8
***********************************
https://kw.com/agent/UPA-6705242000375304192-4
Page load happened
35640 https://kw.com/agent/UPA-6705242000375304192-4
***********************************
https://kw.com/agent/UPA-6587385192177422337-5
Page load happened
35641 https://kw.com/agent/UPA-6587385192177422337-5
***********************************
https://kw.com/agent/UPA-6587385174227779587-7
Page load happened
35642 https://kw.com/agent/UPA-6587385174227779587-7
***********************************
https://kw.com/agent/UPA-6587384984188207106-4
Timeout happened no page load
35643 https://kw.com/agent/UPA-6587384984188207106-4
***********************************
https://kw.com/agent/UPA-6587385202874494981-4
Page load happened
35644 https://kw.com/agent/UPA-6587385202874494981-4
***********************************
https://kw.com/agent/UPA-6587385179328737287-3
Page load happened
35645 https://kw.com/agent/UPA-658738517

Page load happened
35692 https://kw.com/agent/UPA-6587384989864742913-6
***********************************
https://kw.com/agent/UPA-6587385266618753032-6
Page load happened
35693 https://kw.com/agent/UPA-6587385266618753032-6
***********************************
https://kw.com/agent/UPA-6773023916239884288-3
Page load happened
35694 https://kw.com/agent/UPA-6773023916239884288-3
***********************************
https://kw.com/agent/UPA-6587385212861956099-9
Page load happened
35695 https://kw.com/agent/UPA-6587385212861956099-9
***********************************
https://kw.com/agent/UPA-6587385305417957383-1
Page load happened
35696 https://kw.com/agent/UPA-6587385305417957383-1
***********************************
https://kw.com/agent/UPA-6587385295981285378-6
Page load happened
35697 https://kw.com/agent/UPA-6587385295981285378-6
***********************************
https://kw.com/agent/UPA-6587385291375087616-9
Page load happened
35698 https://kw.com/agent/UPA-6587385291375087616-

Page load happened
35745 https://kw.com/agent/UPA-6587385185253363716-4
***********************************
https://kw.com/agent/UPA-6587385184077795333-3
Page load happened
35746 https://kw.com/agent/UPA-6587385184077795333-3
***********************************
https://kw.com/agent/UPA-6587385197979557892-3
Page load happened
35747 https://kw.com/agent/UPA-6587385197979557892-3
***********************************
https://kw.com/agent/UPA-6587384978511818753-0
Page load happened
35748 https://kw.com/agent/UPA-6587384978511818753-0
***********************************
https://kw.com/agent/UPA-6587385146666627072-3
Page load happened
35749 https://kw.com/agent/UPA-6587385146666627072-3
***********************************
https://kw.com/agent/UPA-6592978207335022597-9
Page load happened
35750 https://kw.com/agent/UPA-6592978207335022597-9
***********************************
https://kw.com/agent/UPA-6587385212916482048-0
Page load happened
35751 https://kw.com/agent/UPA-6587385212916482048-

Page load happened
35798 https://kw.com/agent/UPA-6587385403300265984-2
***********************************
https://kw.com/agent/UPA-6587384991421992963-5
Timeout happened no page load
35799 https://kw.com/agent/UPA-6587384991421992963-5
***********************************
https://kw.com/agent/UPA-6587385327773687808-6
Page load happened
35800 https://kw.com/agent/UPA-6587385327773687808-6
***********************************
https://kw.com/agent/UPA-6587385279569981448-5
Page load happened
35801 https://kw.com/agent/UPA-6587385279569981448-5
***********************************
https://kw.com/agent/UPA-6587385274763321352-7
Page load happened
35802 https://kw.com/agent/UPA-6587385274763321352-7
***********************************
https://kw.com/agent/UPA-6833511068578009088-8
Page load happened
35803 https://kw.com/agent/UPA-6833511068578009088-8
***********************************
https://kw.com/agent/UPA-6701949209433821184-9
Page load happened
35804 https://kw.com/agent/UPA-670194920

35850 https://kw.com/agent/UPA-6587385294997221377-5
***********************************
https://kw.com/agent/UPA-6587385427553374212-4
Page load happened
35851 https://kw.com/agent/UPA-6587385427553374212-4
***********************************
https://kw.com/agent/UPA-6587385266635530241-5
Page load happened
35852 https://kw.com/agent/UPA-6587385266635530241-5
***********************************
https://kw.com/agent/UPA-6824028600493154304-0
Page load happened
35853 https://kw.com/agent/UPA-6824028600493154304-0
***********************************
https://kw.com/agent/UPA-6587385302248116224-7
Page load happened
35854 https://kw.com/agent/UPA-6587385302248116224-7
***********************************
https://kw.com/agent/UPA-6587385412005720066-8
Page load happened
35855 https://kw.com/agent/UPA-6587385412005720066-8
***********************************
https://kw.com/agent/UPA-6843190274002575360-1
Page load happened
35856 https://kw.com/agent/UPA-6843190274002575360-1
*****************

Page load happened
35903 https://kw.com/agent/UPA-6587385230331379720-0
***********************************
https://kw.com/agent/UPA-6645350529910263810-4
Page load happened
35904 https://kw.com/agent/UPA-6645350529910263810-4
***********************************
https://kw.com/agent/UPA-6587385194518708230-9
Page load happened
35905 https://kw.com/agent/UPA-6587385194518708230-9
***********************************
https://kw.com/agent/UPA-6587385260413251584-8
Page load happened
35906 https://kw.com/agent/UPA-6587385260413251584-8
***********************************
https://kw.com/agent/UPA-6587385194564845568-2
Page load happened
35907 https://kw.com/agent/UPA-6587385194564845568-2
***********************************
https://kw.com/agent/UPA-6587385194665508864-1
Page load happened
35908 https://kw.com/agent/UPA-6587385194665508864-1
***********************************
https://kw.com/agent/UPA-6587385260400668678-1
Page load happened
35909 https://kw.com/agent/UPA-6587385260400668678-

Page load happened
35956 https://kw.com/agent/UPA-6666264313618309120-8
***********************************
https://kw.com/agent/UPA-6724387298944815104-0
Page load happened
35957 https://kw.com/agent/UPA-6724387298944815104-0
***********************************
https://kw.com/agent/UPA-6587385347117580292-0
Page load happened
35958 https://kw.com/agent/UPA-6587385347117580292-0
***********************************
https://kw.com/agent/UPA-6587385206785593350-1
Page load happened
35959 https://kw.com/agent/UPA-6587385206785593350-1
***********************************
https://kw.com/agent/UPA-6587385266601975814-2
Page load happened
35960 https://kw.com/agent/UPA-6587385266601975814-2
***********************************
https://kw.com/agent/UPA-6587385086934683653-2
Page load happened
35961 https://kw.com/agent/UPA-6587385086934683653-2
***********************************
https://kw.com/agent/UPA-6841513850151411712-2
Page load happened
35962 https://kw.com/agent/UPA-6841513850151411712-