In [1]:
# Aim: Obtain ESG Ratings for Biggest Companies in the World automatically

# Reason: ESG data is a form of alternative data that can be used to analyse companies.
# In a world constantly facing environmental challenges and social issues, companies with promising ESG data
# are rewarding to support and, indeed, to invest in. The produced datasheet therefore aims to reflect how different
# companies around the world score on ESG, and to provide another layer of analysis on which companies are worth investing in.

###
import pandas as pd
import requests  
from bs4 import BeautifulSoup
import re
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
import time
###

In [3]:
# Page window to scrape: `index` is the first page and `end` the last one (both inclusive).
index = 1  # first page; may be changed, but never to more than 20
end = 2  # last page; a page takes approx 10 minutes, so keeping end - index within 2 is advised

# The website lists the top 1000 companies by market cap at 50 names per page,
# which makes page 20 the highest valid page for both bounds.
if index > 20:
    print('Highest Page number is 20')
    index = 20
end = min(end, 20)

# The last page may never come before the first page.
if index > end:
    print('End is less than starting index.\nEnd has been set to the same as index.')
    end = index

# Base URL of the ranking; the page number is appended to it.
start = 'https://disfold.com/world/companies/?page='

# WHY Disfold?
# > webpage that provides extensive data on the biggest companies (by market cap) around the world
# > readily supplied with good information, including Market Cap, Stock of Company and the Country in which it originates and/or primarily operates
# > data is also updated regularly


# Scrapes pages `index`..`end` of the Disfold ranking (URL prefix in `start`) and, for every
# company row found, looks up the ESG risk value and risk category on Morningstar.
#
# Result: `og`, a DataFrame with one row per company (index starting at 1) and the columns
# listed in `og_columns`. Companies whose ESG data cannot be found get None in the last 2 columns.
# `out` is True when the last requested page answered with HTTP 200, False otherwise.

# WHY Morningstar?
# > financial firm service that provides investment research and management services
# > having a 40% ownership stake in sustainalytics; has access to ESG data of more than 16,000 companies across the market
# > based on a reliable industry standard
# > in terms of webscraping; relatively easy to parse through multiple companies
# > is also free to use (no need for pay for view)
#
# NOTE: an older, faster version located each company's Sustainalytics page through a Google search;
# it was dropped because a large number of requests in a short period of time stopped being served by Google.

og_columns=['Company','Stock','Market_Cap','Country','Sector','Industry','ESG_Risk_Value','ESG_Risk_Rating']
morningstar='https://www.morningstar.com'
request_timeout=30 # seconds; without a timeout requests.get can wait forever on a stalled connection

rows=[] # one list per company; converted into a DataFrame once, after the loop
row_n=1 # number of the next row (the final DataFrame index starts at 1)
out=False # becomes True once a page has been fetched successfully

# A single browser is shared by all lookups and is always closed in the `finally` below;
# launching one browser per company and never quitting it leaks a Chrome process per row.
dr=webdriver.Chrome() # needed for the dynamically loaded ESG info (the main reason for why the code runs long)
try:
    while index<=end: # paginates from the first requested page to the last one
        url=start+str(index) # goes to specific page of url in start
        try:
            r=requests.get(url,timeout=request_timeout)
        except requests.RequestException as err: # connection problem or timeout
            out=False
            print(f'Code exited. Request to {url} failed: {err}')
            break

        if r.status_code != 200: # if page does not give an ok response, stops scraping
            # notably error 429 (when too many requests are given to the website)
            out=False
            print(f'Code exited. HTTP status code {r.status_code} detected.')
            break

        out=True
        soup=BeautifulSoup(r.text,'html.parser') # extracts HTML source code

        for table_row in soup.find_all('tr')[1:]: # all rows in the table of the website (header row skipped)
            # for each row; text information is separated into a list of non-empty, stripped fields
            fields=[part.strip() for part in table_row.text.split('\n') if part.strip()]
            if len(fields)<7:
                continue # not a complete company row; skipping it avoids an IndexError that would abort the run

            # fields[0] is not used; the next six are name, market cap, stock, country, sector and industry
            company,market_cap,stock,country,sector,industry=fields[1:7]
            stock=stock.replace('.','')

            # the company is queried on Morningstar based on its location and its stock name
            if country=='United States':
                search_url=morningstar+'/search/us-securities'
            else:
                search_url=morningstar+'/search/foreign-securities'

            # Reset per company: otherwise a search without any '/stocks/' link would reuse the
            # previous company's link (or raise NameError for the very first company).
            stock_url=None
            try:
                # `params` URL-encodes the ticker instead of pasting it into the URL
                search_q=requests.get(search_url,params={'query':stock},timeout=request_timeout)
            except requests.RequestException:
                search_q=None # treated like a search without results
            if search_q is not None:
                q_soup=BeautifulSoup(search_q.text,'html.parser') # extracts HTML source code
                for link in q_soup.find_all('a',href=True):
                    if '/stocks/' in str(link):
                        stock_url=link['href'][:-5] # link of the particular stock without its last 5 characters
                        break

            esg_number=None
            esg_category=None
            if stock_url is not None:
                # sustainability aspect of the site for the specific stock; shows the ESG rating info
                esg_query=morningstar+stock_url+'sustainability'
                try:
                    dr.get(esg_query)
                    time.sleep(2) # allows for webpage to load properly (2s is safest bet btw running code quickly and getting optimal results)
                    number=dr.find_element(by=By.CLASS_NAME, value="text-value") # extracts risk value
                    category=dr.find_element(by=By.CLASS_NAME, value="text-category") # extracts risk group
                    esg_number=float(str(number.get_attribute('innerHTML')).strip())
                    esg_category=str(category.get_attribute('innerHTML')).strip()
                except Exception: # not a bare except: Ctrl+C (KeyboardInterrupt) must still stop the run
                    # if anything fails while extracting the info, both ESG columns are given None values
                    esg_number=None
                    esg_category=None

            rows.append([company,stock,market_cap,country,sector,industry,esg_number,esg_category])
            row_n+=1

        index+=1
finally:
    dr.quit() # closes the browser even when the run is interrupted or fails

# Built once and always labelled, so rows collected before a failing page keep their column names.
og=pd.DataFrame(rows,columns=og_columns,index=range(1,len(rows)+1))

In [4]:
og # displays what the resulting dataframe looks like in pandas

Unnamed: 0,Company,Stock,Market_Cap,Country,Sector,Industry,ESG_Risk_Value,ESG_Risk_Rating
1,Apple Inc.,AAPL,$2.067 T,United States,Technology,Consumer Electronics,16.68,Low
2,Saudi Arabian Oil Company,2222,$1.883 T,Saudi Arabia,Energy,Oil & Gas Integrated,45.38,Severe
3,Microsoft Corporation,MSFT,$1.788 T,United States,Technology,Software—Infrastructure,15.01,Low
4,Alphabet Inc.,GOOG,$1.158 T,United States,Communication Services,Internet Content & Information,24.60,Medium
5,"Amazon.com, Inc.",AMZN,$875.51 B,United States,Consumer Discretionary,Internet Retail,30.28,High
...,...,...,...,...,...,...,...,...
96,Caterpillar Inc.,CAT,$124.41 B,United States,Industrials,Farm & Heavy Construction Machinery,34.30,High
97,Sanofi,SAN,$124.09 B,France,Healthcare,Drug Manufacturers—General,21.46,Medium
98,Qualcomm Incorporated,QCOM,$123.24 B,United States,Technology,Semiconductors,14.48,Low
99,China Life Insurance Company Limited,601628,$123.01 B,China,Financials,Insurance—Life,25.41,Medium


In [5]:
og.to_excel('Company_ESG_Info.xlsx') # exports the collated data to an Excel sheet named Company_ESG_Info.xlsx