In [None]:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import date, datetime
import json
from selenium.webdriver.support.ui import Select
import re
from time import sleep
# driver = webdriver.Chrome()


In [None]:
# Load the company index into `company_list`.
# The crawl loop further down reads the 'source_url' and 'company_name' keys of
# every entry. JSON text is UTF-8 by specification, so the file is decoded
# explicitly instead of relying on the platform's default encoding.

with open('company_index.json', encoding='utf-8') as json_file:
    company_list = json.load(json_file)

# Preview the first two entries as the cell output.
company_list[:2]

In [None]:
from urllib.parse import urlparse

def _webdomain_from_url(website):
    """Return the host part of ``website`` without a leading ``www.``.

    ``urlparse`` only fills ``netloc`` when the text carries a scheme or starts
    with ``//``; a bare ``www.example.com/about`` is parsed as a path and would
    yield an empty host. Such text is re-parsed with ``//`` prepended.
    """
    website = website.strip()
    netloc = urlparse(website).netloc
    if not netloc and website:
        netloc = urlparse("//" + website).netloc
    # Strip only a leading "www." so hosts that merely contain it stay intact.
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc


def get_company_profile_info(company_url, company_name, info_collection_completed, driver):
    """Collect the profile fields of the company page currently loaded in ``driver``.

    Args:
        company_url: URL of the company page; used for log messages and passed
            on to ``get_contact_details``.
        company_name: Company name; used for log messages and passed on to
            ``get_contact_details``.
        info_collection_completed: Progress counter; used for log messages and
            passed on to ``get_contact_details``.
        driver: Selenium WebDriver that has already loaded ``company_url``.

    Returns:
        A tuple ``(company_info_dict, fetch_url_again)``.
        ``company_info_dict`` holds whatever was collected out of
        ``company_revenue``, ``company_employee_size``, ``company_industry``,
        ``company_location``, ``company_name``, ``company_website``,
        ``company_webdomain`` and ``contact_details``; it can be partial or
        empty. ``fetch_url_again`` is True when the profile or details section
        was not found, i.e. the caller should load the page again.

    A pass that raises, or that ends with fewer fields than expected, is
    logged and retried; the page is refreshed after the second failed pass and
    the function gives up after the third.
    """
    # Label shown on the page -> key used in the result dictionary.
    detail_keys = {
        "Revenue": "company_revenue",
        "Head Count": "company_employee_size",
        "Industry": "company_industry",
        "Location": "company_location",
    }
    # The four detail fields plus name, website, webdomain and contact_details.
    expected_field_count = len(detail_keys) + 4

    attempt = 0
    company_info_dict = {}
    contact_details = None
    fetch_url_again = False

    while True:
        attempt += 1

        try:
            sleep(3)

            company_profile = driver.find_elements(By.CLASS_NAME, 'CompanyTopInfo_leftContentWrap__3gIch')
            company_details = driver.find_elements(By.CLASS_NAME, 'CompanyTopInfo_contentWrapper__2Jkic')

            # If the page is not loaded properly, ask the caller to get the page again.
            if not company_details or not company_profile:
                fetch_url_again = True
                break

            for detail in company_details:
                spans = detail.find_elements(By.TAG_NAME, "span")
                # A label/value pair needs two spans; skip wrappers that lack
                # them instead of failing the whole pass with an IndexError.
                if len(spans) < 2:
                    continue
                key = detail_keys.get(spans[0].text)
                if key is not None:
                    company_info_dict[key] = spans[1].text

            # The website is optional: any problem reading it yields None for both fields.
            company_website = None
            company_webdomain = None
            try:
                website_elements = company_profile[0].find_elements(
                    By.CLASS_NAME, 'CompanyTopInfo_websiteUrl__13kpn')
                if website_elements:
                    company_website = website_elements[0].text
                    company_webdomain = _webdomain_from_url(company_website)
            except Exception:
                company_website = None
                company_webdomain = None

            # Collect the contacts only until a call has returned a list;
            # get_contact_details returns None when it finds no contact cards.
            if contact_details is None:
                contact_details = get_contact_details(company_url, company_name,
                                                      info_collection_completed, driver)

            company_info_dict.update({
                # A missing <h1> raises IndexError here and triggers a retry.
                "company_name": company_profile[0].find_elements(By.TAG_NAME, 'h1')[0].text,
                "company_website": company_website,
                "company_webdomain": company_webdomain,
                "contact_details": contact_details,
            })

            # Check if the crawler collected the required information or not.
            if len(company_info_dict) == expected_field_count:
                break
            raise Exception("Could not find 8 diff values of company_info_dict")

        except Exception as exc:
            print("exception for company_profile_info: ", exc)
            print("name: {}, Url: {}, attempt: {}, total completed: {}".format(
                                            company_name, company_url, attempt,
                                            info_collection_completed))

            # Refresh the page after the second failed pass and stop after the third.
            if attempt == 2:
                driver.refresh()
            elif attempt > 2:
                break

    return company_info_dict, fetch_url_again

# get_company_profile_info("example", "example", 50, None)

In [None]:
def _first_text(parent, by, value):
    """Return the text of the first element matching ``value`` under ``parent``, or None."""
    matches = parent.find_elements(by, value)
    return matches[0].text if matches else None


def _read_contact_card(contact):
    """Read the fields shown on one contact card; a field whose element is absent becomes None."""
    email_text = _first_text(contact, By.CLASS_NAME, "emailBtn")
    # Only the part after the "@" is kept.
    email_domain = email_text.split("@")[1] if email_text and "@" in email_text else None
    anchors = contact.find_elements(By.TAG_NAME, "a")
    return {
        "contact_name": _first_text(contact, By.CLASS_NAME, "TopContacts_contactName__3N-_e"),
        "contact_jobtitle": _first_text(contact, By.CLASS_NAME, "TopContacts_jobTitle__3M7A2"),
        "contact_email_domain": email_domain,
        "contact_profile_link": anchors[0].get_attribute('href') if anchors else None,
    }


def _read_department_and_location(contact_driver, contact_dict, max_reads=4):
    """Copy Department/Location from the loaded contact page into ``contact_dict``.

    The page is read up to ``max_reads`` times (with a short pause in between)
    until both fields are present; after that the dictionary is left partial.
    """
    for read_no in range(max_reads):
        if read_no:
            sleep(1)
        blocks = contact_driver.find_elements(By.CLASS_NAME, 'ContactTopInfo_contactDetailItem__2lk1x')
        # blocks[0] raises IndexError when the detail block is absent; the
        # caller counts that as a failed attempt and retries.
        rows = blocks[0].find_elements(By.CLASS_NAME, "ContactTopInfo_contentWrapper__3VEQ2")
        for row in rows:
            spans = row.find_elements(By.TAG_NAME, "span")
            if len(spans) < 2:
                continue
            label = spans[0].text
            if label == "Department":
                contact_dict["contact_department"] = spans[1].text
            elif label == "Location":
                contact_dict["contact_location"] = spans[1].text
        if "contact_department" in contact_dict and "contact_location" in contact_dict:
            return


def _scrape_contact_page(contact_link, contact_dict, company_url, company_name, info_collection_completed):
    """Open ``contact_link`` in its own browser and add department/location to ``contact_dict``.

    Returns True when the page could be read (even if a field stayed missing)
    and False when every attempt raised. The browser is always shut down.
    """
    contact_driver = webdriver.Chrome()
    try:
        contact_driver.get(contact_link)
        attempt = 0
        while True:
            try:
                _read_department_and_location(contact_driver, contact_dict)
                return True
            except Exception as exc:
                print("exception for contact_details: ", exc)
                print("name: {}, Url: {}, attempt: {}, total completed: {}".format(
                                                company_name, company_url, attempt,
                                                info_collection_completed))
                attempt += 1
                if attempt > 2:
                    return False
                contact_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Refresh the page before the last attempt.
                if attempt == 2:
                    contact_driver.refresh()
    finally:
        # quit() ends the whole browser session, not just the window.
        contact_driver.quit()


def get_contact_details(company_url, company_name, info_collection_completed, driver):
    """Collect the contacts listed on the company page loaded in ``driver``.

    Name, job title, e-mail domain and profile link come from the contact card
    on the company page. Department and location are only shown on the
    contact's own page, which is opened in a separate Chrome session.

    Args:
        company_url: URL of the company page; used for log messages only.
        company_name: Company name; used for log messages only.
        info_collection_completed: Progress counter; used for log messages only.
        driver: Selenium WebDriver that has already loaded the company page.

    Returns:
        None when the page shows no contact cards; otherwise a list of
        dictionaries with the keys ``contact_name``, ``contact_jobtitle``,
        ``contact_email_domain``, ``contact_profile_link`` and, when found,
        ``contact_department`` and ``contact_location``. A contact whose own
        page failed on every attempt is left out of the list; a contact
        without a profile link keeps only its card fields.
    """
    contact_cards = driver.find_elements(By.CLASS_NAME, 'TopContacts_roundedBorder__1a3yB')
    if not contact_cards:
        return None

    contact_details_list = []
    for contact in contact_cards:
        sleep(3)
        contact_dict = _read_contact_card(contact)
        contact_link = contact_dict["contact_profile_link"]

        # Each contact gets its own retry budget inside _scrape_contact_page.
        if contact_link and not _scrape_contact_page(contact_link, contact_dict, company_url,
                                                     company_name, info_collection_completed):
            continue

        contact_details_list.append(contact_dict)

    return contact_details_list
    


In [None]:
# company_profiles accumulates one dictionary per scraped company;
# info_collection_completed is the running tally shown in progress messages.
# NOTE(review): the crawl loop resumes at company_list[4240:] while this tally
# starts at 4237 - confirm the offset is intended.
company_profiles, info_collection_completed = [], 4237

In [None]:
# Visit the details page of each company and call get_company_profile_info to
# collect the required information. Every page load uses a fresh Chrome
# session, and that session is always shut down again - whether the attempt
# succeeded, asked for a re-fetch, or raised.

for company in company_list[4240:]:

    attempt_to_crawl = 0          # attempts that ended in an exception
    fetching_same_url_again = 0   # re-fetches because the page did not load properly
    company_url = company['source_url']
    company_name = company['company_name']

    while True:
        driver = None
        try:
            driver = webdriver.Chrome()
            driver.get(company_url)

            company_profile, fetch_url_again = get_company_profile_info(company_url, company_name,
                                                       info_collection_completed, driver)

        except Exception as exc:
            print("exception of main function: ", exc)
            print("name: {}, Url: {}, total completed: {}".format(company_name, company_url, info_collection_completed))

            # Stop searching for this url after the third failed attempt.
            # Nothing is done with the failed browser here: it may not even
            # have started, and the next attempt opens a new one anyway.
            attempt_to_crawl += 1
            if attempt_to_crawl > 2:
                print("Could not find data for - ", company_name)
                break

            continue

        finally:
            # Runs on success, on `continue` and on `break`, so no browser is left open.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_exc:
                    print("could not close the browser: ", quit_exc)

        # If the page was not loaded properly, get it again (at most 3 more times).
        if fetch_url_again and fetching_same_url_again < 3:
            fetching_same_url_again += 1
            continue

        # Keep the record only when something was collected; an empty
        # dictionary would just pollute the output.
        if company_profile:
            company_profiles.append(company_profile)
        else:
            print("Could not find data for - ", company_name)
        break

    info_collection_completed +=1

    if (info_collection_completed % 100) == 0:
        print("\n***    collected information of {} companies   ***\n".format(info_collection_completed))


company_profiles[:2]

In [None]:
# Number of company records gathered so far (displayed as the cell output).
len(company_profiles)
# company_profiles[-1]

In [None]:

def write_json(data, filename='company_profiles.json'):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    An existing file is replaced, not appended to.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the output file.

    Raises:
        TypeError: If ``data`` contains a value that ``json`` cannot
            serialize; in that case ``filename`` is left untouched.
    """
    # Serialize before opening the file: mode 'w' truncates the target right
    # away, so an error raised half-way through json.dump would destroy the
    # previous contents and leave a truncated file behind.
    serialized = json.dumps(data, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)
      
# Save the collected records to write_json's default file, 'company_profiles.json'.
# write_json opens the file in 'w' mode, so an existing file is replaced.
# NOTE(review): company_profiles starts out empty in this notebook while the crawl
# resumes at index 4240 - confirm that results of earlier runs are kept elsewhere.
write_json(company_profiles) 

print("fetched information of {} companies".format(len(company_profiles)))

