In [None]:
!pip install pyppeteer nest_asyncio asyncio pyyaml
!apt install chromium-chromedriver

Reading package lists... Done
Building dependency tree       
Reading state information... Done
chromium-chromedriver is already the newest version (95.0.4638.69-0ubuntu0.18.04.1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
# Request
import requests

# Pandas
import pandas as pd

# nest_asyncio
import nest_asyncio
import asyncio

# PyYaml
import yaml

# Python
from time import sleep

# Pyppeteer
from pyppeteer import launch

# BeautifulSoup
from bs4 import BeautifulSoup

# Google Drive
from google.colab import drive
# Mount Google Drive under /content/gdrive; the scraper reads its config.yaml
# from /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
class ComparablyWebScraping:
    """Scrape company reputation data from comparably.com using pyppeteer.

    The JavaScript expressions evaluated on each page are not hard-coded; they
    are read from a YAML file under ``job_sites -> comparably -> queries``.

    Typical usage::

        scraper = ComparablyWebScraping()
        links = await scraper.get_companies_most_rated()
        rows = await scraper.get_company_reputation(links)

    Args:
        config_path: Location of the YAML file holding the page queries.
        page_load_wait: Seconds to pause after each navigation/click before
            evaluating queries on the page.
    """

    # (output key, key of the query in the YAML config) evaluated on the
    # company's main page, in this order.
    _COMPANY_FIELDS = (
        ('company_name', 'company_name'),
        ('ceo', 'ceo_name'),
        ('ceo_score', 'ceo_score'),
        ('employee_participants', 'employee_participants'),
        ('total_ratings', 'total_ratings'),
        ('culture_score', 'culture_score'),
    )

    # Same layout, evaluated on the company's ``/reviews`` page.
    _REVIEW_FIELDS = (
        ('score_positive_reviews', 'score_positive_reviews'),
        ('score_negative_reviews', 'score_negative_reviews'),
    )

    def __init__(self, config_path: str = '/content/gdrive/MyDrive/config.yaml',
                 page_load_wait: float = 1):
        self.url = 'https://www.comparably.com/companies'
        self.browser = None
        self.page = None
        self.config_path = config_path
        self.page_load_wait = page_load_wait
        self.__config = None

    def config(self):
        """Return the parsed YAML configuration, loading it on first use."""
        if self.__config is None:
            with open(self.config_path, mode='r', encoding='utf-8') as f:
                # safe_load instead of yaml.load(f): calling yaml.load without
                # a Loader warns on PyYAML 5.1+ and raises TypeError on 6.0+,
                # and the full loader can construct arbitrary Python objects.
                self.__config = yaml.safe_load(f)
        return self.__config

    def _queries(self) -> dict:
        """Return the comparably query mapping from the configuration."""
        return self.config()['job_sites']['comparably']['queries']

    async def get_browser(self):
        """Launch and return a new headless Chromium browser."""
        # NOTE(review): --no-sandbox is presumably needed because the Colab
        # runtime runs as root - confirm before reusing elsewhere.
        return await launch(executablePath="/usr/lib/chromium-browser/chromium-browser", args=['--no-sandbox'])

    async def _ensure_browser(self):
        """Launch a browser only if this instance does not hold one yet."""
        if self.browser is None:
            self.browser = await self.get_browser()
        return self.browser

    async def close_browser(self):
        """Close the browser (if any) and forget the browser and page handles."""
        if self.browser is None:
            return None
        # Drop the references first so a failing close() cannot leave stale
        # handles behind on the instance.
        browser, self.browser, self.page = self.browser, None, None
        return await browser.close()

    async def _page_evaluate(self, query: str):
        """Evaluate ``query`` as a JavaScript expression on the current page."""
        query_result = await self.page.evaluate(
            pageFunction=query,
            force_expr=True
        )
        return query_result

    async def _evaluate_fields(self, queries: dict, fields) -> dict:
        """Evaluate each configured query in order and key results by output name."""
        results = {}
        for output_key, query_key in fields:
            results[output_key] = await self._page_evaluate(query=queries[query_key])
        return results

    async def get_companies_most_rated(self) -> list:
        """Return the links of the companies listed under the "most rated" tab.

        The browser is left open on success so ``get_company_reputation`` can
        reuse it; it is closed if anything fails along the way.

        Returns:
            The ``href`` of every element with class ``companyLink`` found in
            the markup produced by the ``companies_links`` query.
        """
        config_yml = self._queries()
        await self._ensure_browser()
        try:
            self.page = await self.browser.newPage()
            await self.page.goto(self.url)
            await self.page.click('a.mostRated')
            await asyncio.sleep(self.page_load_wait)
            company_list = await self._page_evaluate(query=config_yml['companies_links'])
        except Exception:
            # Do not leak a Chromium process when navigation or evaluation fails.
            await self.close_browser()
            raise

        # Explicit parser: the result no longer depends on which parsers happen
        # to be installed, and bs4 does not warn about guessing one.
        soup = BeautifulSoup(company_list or '', 'html.parser')
        # href=True skips anchors without an href instead of raising KeyError.
        return [link['href'] for link in soup.find_all(class_='companyLink', href=True)]

    async def search_data(self, url: str) -> dict:
        """Collect the reputation fields of one company.

        Expects ``self.page`` to already be showing ``url``; afterwards the
        page is left on ``<url>/reviews``.

        Args:
            url: Base URL of the company page.

        Returns:
            One flat dict with the company fields followed by the review scores.
        """
        config_yml = self._queries()
        company_info = await self._evaluate_fields(config_yml, self._COMPANY_FIELDS)

        # rstrip avoids a doubled slash when the link already ends with '/'.
        await self.page.goto(f"{url.rstrip('/')}/reviews")
        await asyncio.sleep(self.page_load_wait)
        score_info = await self._evaluate_fields(config_yml, self._REVIEW_FIELDS)

        cleaned_data = {**company_info, **score_info}
        print(cleaned_data)
        return cleaned_data

    async def get_company_reputation(self, companies_urls: list) -> list:
        """Scrape every company URL and return one dict per company.

        A browser is launched if none is open yet. The browser is always
        closed when this method finishes, including when a page fails.

        Args:
            companies_urls: Company page URLs to visit, in order.

        Returns:
            The ``search_data`` result for each URL, in input order.
        """
        await self._ensure_browser()
        data = []
        try:
            self.page = await self.browser.newPage()
            for url in companies_urls:
                await self.page.goto(url)
                await asyncio.sleep(self.page_load_wait)
                data.append(await self.search_data(url))
        finally:
            await self.close_browser()
        return data

In [None]:
async def main() -> pd.DataFrame:
    """Scrape the most-rated companies and return their data as a DataFrame.

    Returns:
        One row per company, built from the dicts produced by
        ``ComparablyWebScraping.get_company_reputation``.
    """
    scraping_comparably = ComparablyWebScraping()

    links = await scraping_comparably.get_companies_most_rated()
    data = await scraping_comparably.get_company_reputation(links)
    # Return the frame: a bare `df` expression inside a function is a no-op,
    # so the result was previously neither displayed nor reachable by callers.
    return pd.DataFrame(data)

# Patch asyncio before driving the coroutine: run_until_complete() is called
# from inside a notebook kernel, which presumably already has an event loop
# running - nest_asyncio permits the nested call.
nest_asyncio.apply()
# Run the scraper to completion; as the last expression of the cell, whatever
# main() returns becomes the cell's displayed output.
asyncio.get_event_loop().run_until_complete(main())

{'company_name': 'Amazon', 'ceo': 'Andy Jassy', 'ceo_score': '74 /100', 'employee_participants': '7953', 'total_ratings': '102767', 'culture_score': '4.5/5', 'score_positive_reviews': '78%', 'score_negative_reviews': '22%'}
{'company_name': 'Walmart', 'ceo': 'Doug McMillon', 'ceo_score': '62 /100', 'employee_participants': '6011', 'total_ratings': '97797', 'culture_score': '3.2/5', 'score_positive_reviews': '67%', 'score_negative_reviews': '33%'}
{'company_name': 'IBM', 'ceo': 'Arvind Krishna', 'ceo_score': '86 /100', 'employee_participants': '3055', 'total_ratings': '83489', 'culture_score': '4.9/5', 'score_positive_reviews': '89%', 'score_negative_reviews': '11%'}
{'company_name': 'Google', 'ceo': 'Sundar Pichai', 'ceo_score': '81 /100', 'employee_participants': '4123', 'total_ratings': '61725', 'culture_score': '4.7/5', 'score_positive_reviews': '85%', 'score_negative_reviews': '15%'}
{'company_name': 'XPO Logistics', 'ceo': 'Bradley Jacobs', 'ceo_score': '82 /100', 'employee_partic