# Imports

In [1]:
!pip install pypdf2



In [2]:
pip install pycryptodome==3.15.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install PyPDF2==2.11.2

Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader
import io
import seaborn as sns
import matplotlib.pyplot as plt
from threading import Thread

# Initialization

In [5]:
df_stb = pd.read_csv('sustainability_pdf.csv')
df_wrd = pd.read_csv('keywords.csv')
df_stb_news = pd.read_csv('sustainability_news.csv')

In [6]:
df_stb = df_stb[df_stb["Sustainability report : "] == True]
df_stb.drop(["No: ", "Sustainability report : ", "Unnamed: 4", "Unnamed: 5"], axis=1, inplace=True)
df_stb["Links/Pdf : "] = df_stb["Links/Pdf : "].str.replace(" ", "")

df_stb_news.drop(["No", "Purpose"], axis=1, inplace=True)
df_stb_news.rename(columns={"Link":"Links/Pdf : ", "Company":"Companies :"}, inplace=True)

In [7]:
df_stb_news.head()

Unnamed: 0,Companies :,Links/Pdf :
0,Google Cloud,https://cloud.google.com/blog/topics/sustainab...
1,Schneider Electric,https://www.eco-business.com/press-releases/sc...
2,GoTo,https://www.techinasia.com/decarbonization-goto
3,Grab,https://greennetwork.asia/news/grab-declares-a...
4,RDA & STACS,https://finance.yahoo.com/news/singapores-data...


In [8]:
df_stb.head()

Unnamed: 0,Companies :,Links/Pdf :
0,Apple,https://www.apple.com/environment/pdf/Apple_En...
1,PwC,https://www.pwc.com/sg/en/publications/assets/...
2,DBS,https://www.dbs.com/iwov-resources/images/sust...
3,Grab,https://sustainability2021.grab.com/sg/docs/gr...
4,Singtel,https://www.singtel.com/content/dam/singtel/ab...


In [9]:
df_wrd = df_wrd[["KEYWORD", "TIER"]]
df_wrd.KEYWORD = df_wrd.KEYWORD.str.lower()

In [10]:
df_wrd.head(5)

Unnamed: 0,KEYWORD,TIER
0,carbon offset,3
1,digital twin,3
2,energy used,3
3,environmental footprint,3
4,it operations,3


# Scraping

In [11]:
def scrape(url, dict, key):
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    
    response = requests.get(url, headers=headers)
    content_type = response.headers.get('content-type')
    if response.status_code != 200:
        print("error in", url)
        return

    # Scraping PDF
    if 'application/pdf' in content_type or 'application/xml' in content_type:
        with io.BytesIO(response.content) as f:
            pdf = PdfFileReader(f, strict=False)
            words = []
            for i in range(pdf.getNumPages()):
                words.extend(pdf.getPage(i).extract_text().lower().split())
            dict[key] = ' '.join(words)
    
    # Scraping html
    elif 'text/html' in content_type:
        soup = BeautifulSoup(response.content, "html.parser")
        dict[key] = ' '.join(soup.get_text().split())

In [12]:
def process_keywords(words, df_kywrd, tier):
    tier_count = 0
    kywrd = df_kywrd[df_kywrd.TIER == tier]["KEYWORD"]
    for wrd in kywrd:
        tier_count += words.count(wrd)
    return tier_count
    

In [13]:
# Score
# Tier 1: 3
# Tier 2: 2
# Tier 3: 1
def process_web(df_main, df_keyword):
    new_df = df_main.copy(deep=True)
    # Using multiThreading
    temp_dict = {}
    all_threads = []
    for i, j in new_df.iterrows():
        t = Thread(target=scrape, args=(df_main["Links/Pdf : "][i], temp_dict, i))
        t.start()
        all_threads.append(t)

    # Waiting for all threads to finish
    for thread in all_threads:
        thread.join()    

    try:
        new_df["Words"] = list(i[1] for i in sorted(temp_dict.items(), key=lambda x: x[0]))
        for i in range(1, 4):
            new_df[f"Tier {i} Count"] = new_df["Words"].apply(process_keywords, df_kywrd=df_keyword, tier=i)
        
        new_df["Score"] =  new_df.apply(
            lambda x: x['Tier 1 Count']*3+x['Tier 2 Count']*2+x['Tier 3 Count']*1, axis=1)
    except:
        print("check if the link is working")
        return
    
    return new_df
        

In [14]:
data = process_web(df_stb[:10], df_wrd)

In [15]:
# Scraping pdf

data.sort_values('Score', ascending=False)
data.groupby(['Companies :']).agg(lambda x : x.sum() if x.dtype=='int64' else ' '.join(x))

Unnamed: 0_level_0,Links/Pdf :,Words,Tier 1 Count,Tier 2 Count,Tier 3 Count,Score
Companies :,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Apple,https://www.apple.com/environment/pdf/Apple_En...,covering fiscal year 2021environmental progres...,445,206,249,1996
Changi Airport Group,https://www.changiairport.com/content/dam/caco...,forging a sustainable changi sustainability re...,215,60,9,774
DBS,https://www.dbs.com/iwov-resources/images/sust...,empowering a sustainable futuredbs group holdi...,552,113,31,1913
Foodpanda,https://www.foodpanda.com/wp-content/uploads/2...,foodpanda social impact report 2021 19contents...,13,4,0,47
GovTech,https://www.tech.gov.sg/files/media/corporate-...,annualreport20212022annualreport20212022 advan...,2,4,1,15
Grab,https://sustainability2021.grab.com/sg/docs/gr...,millions of new beginnings esg report 2021 tab...,199,53,17,720
PwC,https://www.pwc.com/sg/en/publications/assets/...,sustainability report pwc singapore – financia...,72,11,0,238
Sea,https://cdn.sea.com/webmain/static/resource/se...,2021 sea sustainability report what’s inside 2...,66,7,1,213
Singapore Airlines,https://www.singaporeair.com/saar5/pdf/Investo...,singapore airlines is a global company dedicat...,97,42,12,387
Singtel,https://www.singtel.com/content/dam/singtel/ab...,sustainability report 2022empower every genera...,367,86,23,1296


In [16]:
# Scraping Web
data_2 = process_web(df_stb_news, df_wrd)
data_2.sort_values('Score', ascending=False)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Unnamed: 0,Companies :,Links/Pdf :,Words,Tier 1 Count,Tier 2 Count,Tier 3 Count,Score
6,Microsoft,https://blogs.microsoft.com/blog/2020/01/16/mi...,Microsoft will be carbon negative by 2030 - Th...,29,25,0,137
10,Apple,https://www.apple.com/newsroom/2022/10/apple-c...,Apple calls on global supply chain to decarbon...,18,18,0,90
1,Schneider Electric,https://www.eco-business.com/press-releases/sc...,Schneider Electric launches SME kickstarter de...,25,4,1,84
15,OCBC,https://www.ocbc.com/group/media/release/2022/...,ocbc-to-invest-25million-in-decarbonisation Ab...,16,13,3,77
3,Grab,https://greennetwork.asia/news/grab-declares-a...,Grab Declares a Commitment to Sustainability w...,9,9,2,47
9,Sony Group,https://www.sony.com/en/SonyInfo/News/Press/20...,Sony Group Portal - Sony accelerates target to...,6,11,6,46
12,Temasek Holdings,https://www.temasek.com.sg/en/news-and-resourc...,"Temasek launches GenZero, an investment platfo...",12,5,0,46
7,IBM,https://newsroom.ibm.com/2021-02-16-IBM-Commit...,IBM Commits To Net Zero Greenhouse Gas Emissio...,12,4,0,44
8,JPMorgan,https://www.jpmorganchase.com/news-stories/jpm...,JPMorgan Chase Releases Carbon Reduction Targe...,8,9,0,42
13,Foodpanda,https://www.eco-business.com/press-releases/fo...,Foodpanda launches 'Green Label' restaurant ce...,12,2,1,41


In [17]:
combined_data = pd.concat([data, data_2])
final_data = combined_data.groupby(['Companies :'], as_index=False).agg(lambda x : x.sum() if x.dtype=='int64' else ' '.join(x)).sort_values('Score', ascending=False)
final_data

Unnamed: 0,Companies :,Links/Pdf :,Words,Tier 1 Count,Tier 2 Count,Tier 3 Count,Score
0,Apple,https://www.apple.com/environment/pdf/Apple_En...,covering fiscal year 2021environmental progres...,463,224,249,2086
2,DBS,https://www.dbs.com/iwov-resources/images/sust...,empowering a sustainable futuredbs group holdi...,559,117,31,1942
17,Singtel,https://www.singtel.com/content/dam/singtel/ab...,sustainability report 2022empower every genera...,367,86,23,1296
1,Changi Airport Group,https://www.changiairport.com/content/dam/caco...,forging a sustainable changi sustainability re...,215,60,9,774
7,Grab,https://sustainability2021.grab.com/sg/docs/gr...,millions of new beginnings esg report 2021 tab...,208,62,19,767
16,Singapore Airlines,https://www.singaporeair.com/saar5/pdf/Investo...,singapore airlines is a global company dedicat...,97,42,12,387
12,PwC,https://www.pwc.com/sg/en/publications/assets/...,sustainability report pwc singapore – financia...,72,11,0,238
15,Sea,https://cdn.sea.com/webmain/static/resource/se...,2021 sea sustainability report what’s inside 2...,66,7,1,213
10,Microsoft,https://blogs.microsoft.com/blog/2020/01/16/mi...,Microsoft will be carbon negative by 2030 - Th...,29,25,0,137
3,Foodpanda,https://www.foodpanda.com/wp-content/uploads/2...,foodpanda social impact report 2021 19contents...,25,6,1,88


# Vizualisation

In [18]:
fig, ax = plt.subplots(figsize=(40, 10))
sns.set_context("notebook", font_scale=1.5)
processed_data = pd.melt(final_data[:10].drop(["Links/Pdf : ", "Words", "Score"], axis=1), id_vars="Companies :")
sns.barplot(data=processed_data, x="Companies :", y="value", hue="variable", ax=ax)
plt.title("Word count in each company")
plt.show()

NameError: name 'plt' is not defined

In [None]:
fig, ax = plt.subplots(figsize=(40, 8))
sns.set_context("notebook", font_scale=2.5)
sns.barplot(data=final_data[:10].sort_values('Score', ascending=False), 
            x="Companies :", y="Score", ax=ax)
plt.title("Companies Score")
plt.show()