In [1]:
# Importing necessary libraries
import sys, os

# Root directory that holds the report files (raw string so backslashes are kept literally).
root = r'<ROOT LOCATION OF REPORT FOLDER>'

# Full path of the target directory below the root.
# NOTE(review): the walk below scans `root`, not `path` -- confirm which directory is intended.
path = os.path.join(root, "targetdirectory")

# Lists for the full file paths and the file names.
# NOTE(review): `fileNames` is created here but is not filled by this cell.
fullFilePaths = []
fileNames = []

# Walk through the root directory and all of its subdirectories.
# The loop variable is named `dirpath` so that it does not overwrite `path` defined above.
for dirpath, subdirs, files in os.walk(root):
    # Record the full path of every file found in the current directory
    for name in files:
        fullFilePaths.append(os.path.join(dirpath, name))

# fullFilePaths now holds the full path of every file found below the root directory.

In [2]:
# Import the pandas library to work with dataframes
import pandas as pd

# Location of the GRI Excel workbook (raw string so backslashes are kept literally)
griFullFileName = r'<GRI FILE LOCATION>'

# Open the workbook once; every sheet below is parsed from this handle instead of
# re-opening the file by name for each sheet.
xls = pd.ExcelFile(griFullFileName)

# The sheets at positions 1..20 of the workbook are the data sheets that get combined.
# NOTE(review): presumably these are the yearly sheets "1999".."2018" and position 0 is
# "Overview" -- confirm against the workbook.
yearSheetNames = xls.sheet_names[1:21]

# Dictionary mapping sheet name -> dataframe
fullGRI = {}

# Read every data sheet; header=1 takes the column names from the second row of the sheet
for sheet_name in yearSheetNames:
    fullGRI[sheet_name] = pd.read_excel(xls, sheet_name=sheet_name, header=1)

# The "Overview" sheet is read separately, with the default header row
fullGRI["Overview"] = pd.read_excel(xls, sheet_name="Overview")

# Stack all data sheets (not "Overview") into one dataframe with a single concat call;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
total_df = pd.concat([fullGRI[sheet_name] for sheet_name in yearSheetNames], ignore_index=True)

# Remove all spaces from the 'Name' column (literal replacement, no regex)
total_df['Name'] = total_df['Name'].str.replace(' ', '', regex=False)

# total_df now contains the rows of all data sheets of the GRI workbook

In [3]:
# Import the regular expressions library to work with patterns
import re

# Collect one [organization, year] record per file and build the dataframe in a single step,
# instead of enlarging the dataframe row by row.
records = []

# Loop through each file path in the fullFilePaths list
for filePath in fullFilePaths:
    # File name without directory and without extension
    fileName = os.path.splitext(os.path.basename(filePath))[0]

    # Split the name on "_": the first piece is used as the organization,
    # the last piece as the year
    nameParts = fileName.split("_")
    row = [nameParts[0], nameParts[-1]]
    records.append(row)

# Dataframe with the two columns "Organization" and "Year" (empty if no files were found)
df = pd.DataFrame(records, columns=["Organization", "Year"])

# Convert the "Year" column to numeric values; a non-numeric value raises an error here
df['Year'] = pd.to_numeric(df['Year'])

# df now holds one row per file with the organization and year parsed from its file name

In [4]:
# Import the numpy library for numerical operations
import numpy as np

# Left join: every row of df is kept and receives the columns of the total_df rows whose
# 'Name' / 'Publication Year' equal its 'Organization' / 'Year'.
almost_df = df.merge(
    total_df,
    how='left',
    left_on=['Organization', 'Year'],
    right_on=['Name', 'Publication Year'],
)

# Remove the right-hand key columns, keep only the first row of each
# (Organization, Year) pair, and renumber the index from 0.
final_df = (
    almost_df
    .drop(columns=['Name', 'Publication Year'])
    .drop_duplicates(subset=['Organization', 'Year'])
    .reset_index(drop=True)
)

# Add the four topic columns, each initialised to 0
for topic_column in ("Greenhouse_Gas_Emissions", "Diversity", "Employee_Health_Safety", "Customer_Welfare"):
    final_df[topic_column] = 0

# final_df now has one row per (Organization, Year) with the merged GRI columns and four zeroed topic columns

In [1]:
# This part contains all the different words for the multi-keyword counts over the different files
# Keywords for the greenhouse-gas-emissions topic, grouped per language.
# The comprehension flattens the groups, in order, into one flat list of strings;
# repeated terms are kept exactly as listed.
greenhouse_gas_emissions_keywords = [
    term
    for language_terms in (
        # English
        ["greenhouse gas emission","greenhouse emission","greenhouse gas","gas emission","fossil fuels","co2","natural gas","lower emission","lowering emission","nitrous oxide","methane","carbon dioxide","paris agreement"],
        # Spanish
        ["emision de gases de efecto invernadero","emision de gases","gas de efecto invernadero","gases de efecto invernadero","gases","combustibles fosiles","co2","emision","oxido nitroso","metano","carbono dióxido","acuerdo de paris"],
        # German
        ["treibhausgasemission","treibhausgasemission","treibhausgase","gasemission","fossile brennstoffe","co2","erdgas","emission","stickstoffoxid","methan","kohlenstoff kohlendioxid","abkommen von paris"],
        # Chinese
        ["温室气体排放","温室气体排放","温室气体","气体排放","化石燃料","co2","天然气","降低排放","降低排放","一氧化二氮", "甲烷","二氧化碳","巴黎协定"],
        # Portuguese
        ["emissão de gases de efeito estufa","emissão de efeito estufa","gás de efeito estufa","emissão de gás","combustíveis fósseis","co2","gás natural","emissão mais baixa","emissão de redução","óxido nitroso", "metano","dióxido de carbono","acordo de paris"],
        # Greek
        ["εκπομπές αερίων του θερμοκηπίου", "εκπομπές θερμοκηπίου", "αέρια θερμοκηπίου", "εκπομπές αερίων", "ορυκτά καύσιμα", "co2", "φυσικό αέριο", "χαμηλότερες εκπομπές", "μείωση εκπομπών", "οξείδιο του αζώτου", "μεθάνιο", "διοξείδιο του άνθρακα", "συμφωνία του Παρισιού"],
        # Russian
        ["выбросы парниковых газов", "выбросы парниковых газов", "парниковые газы", "выбросы газов", "ископаемое топливо", "co2", "природный газ", "снижение выбросов", "снижение выбросов", "закись азота", "метан", "двуокись углерода", "парижское соглашение"],
        # Japanese
        ["温室効果ガス排出量","温室効果ガス排出量","温室効果ガス","ガス排出量","化石燃料","CO2","天然ガス","排出量の削減","排出量の削減","亜酸化窒素", "メタン","二酸化炭素","パリ協定"],
        # Italian
        ["emissione di gas serra","emissione di gas serra","gas serra","emissione di gas","combustibili fossili","co2","gas naturale","emissione inferiore","riduzione delle emissioni","protossido di azoto", "metano","anidride carbonica","accordo di parigi"],
        # Indonesian
        ["emisi gas rumah kaca", "emisi rumah kaca", "gas rumah kaca", "emisi gas", "bahan bakar fosil", "co2", "gas alam", "emisi lebih rendah", "penurunan emisi", "nitro oksida", "metana", "karbon dioksida", "kesepakatan paris"],
        # Finnish
        ["kasvihuonekaasupäästöt", "kasvihuonepäästöt", "kasvihuonekaasut", "kaasupäästöt", "fossiiliset polttoaineet", "CO2", "maakaasu", "pienempi päästö", "vähentävä päästö", "typpioksiduuli", "metaani", "hiilidioksidi", "pariisin sopimus"],
        # Polish
        ["emisja gazów cieplarnianych","emisja cieplarniana","gaz cieplarniany","emisja gazu","paliwa kopalne","co2","gaz ziemny","niższa emisja","obniżenie emisji","podtlenek azotu", "metan", "dwutlenek węgla", "porozumienie paryskie"],
        # Norwegian
        ["klimagassutslipp","klimagassutslipp","klimagass","gassutslipp","fossile brensler","co2","naturgass","lavere utslipp","senkende utslipp","nitrogenoksid", "metan", "karbondioksid", "parisavtale"],
        # Thai
        ["การปล่อยก๊าซเรือนกระจก","การปล่อยก๊าซเรือนกระจก","ก๊าซเรือนกระจก","การปล่อยก๊าซ","เชื้อเพลิงฟอสซิล","co2","ก๊าซธรรมชาติ","การปล่อยก๊าซที่ต่ำกว่า","การปล่อยก๊าซที่ลดลง","ไนตรัสออกไซด์", "มีเทน", "คาร์บอนไดออกไซด์", "ข้อตกลงปารีส"],
        # Swedish
        ["växthusgasutsläpp","växthusgasutsläpp","växthusgas","gasutsläpp","fossila bränslen","co2","naturgas","lägre utsläpp","minska utsläpp","dikväveoxid", "metan","koldioxid","parisavtal"],
        # Korean
        ["온실 가스 배출","온실 배출","온실 가스","가스 배출","화석 연료","이산화탄소","천연 가스","배출 감소","배출 감소","아산화질소", "메탄","이산화탄소","파리 협정"],
        # French
        ["gaz à effet de serre","gaz à effet de serre","gaz à effet de serre","gaz à effet de serre","combustibles fossiles","co2","gaz naturel","baisse d'émission","baisse d'émission","protoxyde d'azote", "méthane","dioxyde de carbone","accord de paris"],
        # Romanian
        ["emisii de gaze cu efect de seră","emisii cu efect de seră","emisii de gaze cu efect de seră","emisii de gaze","combustibili fosili","co2","gaze naturale","emisii mai mici","reducerea emisiilor","oxid de azot", "metan","dioxid de carbon","acord de la Paris"],
        # Hungarian
        ["üvegházhatású gázok kibocsátása", "üvegházhatású gázok kibocsátása", "üvegházhatású gáz", "gázkibocsátás", "fosszilis tüzelőanyagok", "co2", "földgáz", "alacsonyabb kibocsátás", "kibocsátáscsökkentés", "dinitrogén-oxid", "metán", "szén-dioxid", "párizsi megállapodás"],
        # Turkish
        ["sera gazı emisyonu", "sera emisyonu", "sera gazı", "gaz emisyonu", "fosil yakıtlar", "co2", "doğal gaz", "düşük emisyon", "emisyonun düşürülmesi", "nitröz oksit", "metan","karbondioksit","paris anlaşması"],
        # Vietnamese
        ["khí thải nhà kính","khí thải nhà kính","khí nhà kính","khí thải khí","nhiên liệu hóa thạch","co2","khí tự nhiên","giảm phát thải","giảm phát thải","nitơ oxit", "khí mê-tan","khí cacbonic","thỏa thuận Paris"],
        # Ukrainian
        ["викиди парникових газів", "викиди парникових газів", "парниковий газ", "викиди газів", "викопне паливо", "co2", "природний газ", "низькі викиди", "зниження викидів", "закис азоту", "метан", "вуглекислий газ", "паризька угода"],
        # Albanian
        ["emetimi i gazit serrë", "emetimi i serrës", "gaz serë", "emetim i gazit", "karburantet fosile", "co2", "gaz natyror", "emision më i ulët", "ulës i emetimit", "oksid azoti", "metani", "dioksidi i karbonit", "marrëveshja e Parisit"],
        # Slovak
        ["emisie skleníkových plynov","skleníkové emisie","skleníkový plyn","emisie plynu","fosílne palivá","co2","zemný plyn","nižšie emisie", "zníženie emisií","oxid dusný", "metán", "oxid uhličitý", "parížska dohoda"],
        # Danish
        ["drivhusgasemission","drivhusemission","drivhusgas","gasemission","fossile brændstoffer","co2","naturgas","lavere emission","sænke emission","nitrogenoxid", "methan","kuldioxid","paris-aftale"],
        # Dutch
        ["broeikasgasemissie","broeikasgasemissie","broeikasgas","gasemissie","fossiele brandstoffen","co2","aardgas","lagere uitstoot","verlagende uitstoot","stikstofoxide", "methaan", "kooldioxide", "akkoord van Parijs"],
        # Macedonian
        ["емисии на стакленички гасови", "стакленички гасови", "стакленички гасови", "емисија на гас", "фосилни горива", "co2", "природен гас", "пониска емисија", "намалување на емисијата", "азотен оксид", "метан", "јаглерод диоксид", "париски договор"],
        # Catalan
        ["emissió de gasos d'efecte hivernacle","emissió de gasos d'efecte hivernacle","emissió de gasos","combustibles fòssils","co2","gas natural","menor emissió","reducció d'emissions","òxid nitrós", "metà","diòxid de carboni","acord de París"],
    )
    for term in language_terms
]


# Keywords for the diversity topic: one row per language, starting with English.
# "representation of women" is a single phrase in every row; the Spanish and Italian rows
# used to list it as two separate fragments ("representacion" + " de mujeres",
# "rappresentazione" + " delle donne"), and the Turkish entry carried a stray comma inside
# the string ("temsil kadın,"). Those three entries are corrected here.
diversity_keywords = ["diversity","diversify","diversification","diverseness","diverse effect","increase ethnic representation","increase racial representation","lgbt+","people with disabilities","inclusive environment","representation of women","accessible to customers with physical disabilities","inclusive to customers with physical disabilities",
                      "diversidad","diversificar","diversificacion","diversidad","efecto diverso","aumentar la representacion etnica","aumentar la representacion racial","lgbt+","personas con discapacidad","ambiente inclusivo","representacion de mujeres","accesible a clientes con discapacidad física","inclusivo a clientes con discapacidad física",
                      "多样性","多样化","多样化","多样性","多样化效应","增加种族代表性","增加种族代表性","lgbt +","残疾人","包容性环境","代表性女性","身体残疾的顾客可以使用","包括身体残疾的顾客",
                      "diversidade","diversificar","diversificação","diversidade","efeito diverso","aumentar a representação étnica","aumentar a representação racial","lgbt+","pessoas com deficiência","ambiente inclusivo","representação feminino","acessível a clientes com deficiência física","inclusivo a clientes com deficiência física",
                      "διαφορετικότητα", "διαφοροποίηση", "διαφοροποίηση", "διαφορετικότητα", "διαφορετική επίδραση", "αύξηση εθνικής εκπροσώπησης", "αύξηση φυλετικής εκπροσώπησης", "lgbt+", "άτομα με αναπηρίες", "περιβάλλον χωρίς αποκλεισμούς", "εκπροσώπηση των γυναικών", "προσβάσιμο σε πελάτες με σωματικές αναπηρίες", "συμπεριλαμβανομένων των πελατών με σωματικές αναπηρίες",
                      "разнообразие", "разнообразие", "разнообразие", "разнообразие", "эффект разнообразия", "увеличение этнического представительства", "увеличение расового представительства", "лгбт+", "люди с ограниченными возможностями", "инклюзивная среда", "представление женщин", "доступный для клиентов с ограниченными физическими возможностями", "включая клиентов с ограниченными физическими возможностями",
                      "多様性","多様化","多様化","多様性","多様な効果","民族的代表の増加","人種的代表の増加","lgbt+","障害者","包括的な環境","代表女性専用","お身体の不自由なお客様もご利用いただけます","お身体の不自由なお客様もご利用いただけます",
                      "diversità","diversificare","diversificazione","diversità","effetto diverso","aumentare la rappresentanza etnica","aumentare la rappresentanza razziale","lgbt+","persone con disabilità","ambiente inclusivo","rappresentazione delle donne","accessibile ai clienti con disabilità fisiche","incluso ai clienti con disabilità fisiche",
                      "keragaman", "diversifikasi", "diversifikasi", "keanekaragaman", "efek beragam", "meningkatkan representasi etnis", "meningkatkan representasi rasial", "lgbt+", "penyandang disabilitas", "lingkungan inklusif", "representasi wanita", "dapat diakses oleh pelanggan dengan disabilitas fisik", "termasuk untuk pelanggan dengan disabilitas fisik",
                      "monimuotoisuus", "monimuotoistaa", "monimuotoistaminen", "monimuotoisuus", "monimuotoinen vaikutus", "lisää etnistä edustusta", "lisää rodullista edustusta","lgbt+", "vammaiset", "kattava ympäristö", "edustus naisten","fyysisesti vammaisten asiakkaiden saavutettavissa","fyysisesti vammaisille asiakkaille",
                      "diversity","diversify","diversification","diversity","diversity effect","ethnische Vertretung erhöhen","rassische Vertretung erhöhen","lgbt+","Menschen mit Behinderungen","inklusive Umgebung","repräsentation von Frauen","zugänglich für Kunden mit körperlichen Einschränkungen","inklusive für Kunden mit körperlichen Einschränkungen",
                      "różnorodność","zróżnicowanie","zróżnicowanie","różnorodność","zróżnicowany efekt","zwiększenie reprezentacji etnicznej","zwiększenie reprezentacji rasowej","LGBT+","osób niepełnosprawnych","środowisko integracyjne","reprezentacja kobiet", "dostępny dla klientów z niepełnosprawnością ruchową", "włącznie dla klientów z niepełnosprawnością ruchową",
                      "mangfold","diversifisere","diversifisering","mangfold","mangfoldig effekt","øke etnisk representasjon","øke raserepresentasjon","lgbt+","personer med funksjonshemninger","inkluderende miljø","representasjon av kvinner","tilgjengelig for kunder med fysiske funksjonshemninger","inkludert for kunder med fysiske funksjonshemninger",
                      "ความหลากหลาย","กระจายความหลากหลาย","ความหลากหลาย","ความหลากหลาย","ผลกระทบที่หลากหลาย","เพิ่มการเป็นตัวแทนทางชาติพันธุ์","เพิ่มการเป็นตัวแทนทางเชื้อชาติ","lgbt+","คนที่มีความพิการ","สภาพแวดล้อมที่ครอบคลุม","การเป็นตัวแทน ของผู้หญิง","เข้าถึงลูกค้าที่มีความพิการทางร่างกาย","รวมถึงลูกค้าที่มีความพิการทางร่างกาย",
                      "mångfald","diversifiera","diversifiering","mångfald","mångsidig effekt","öka etnisk representation","öka rasrepresentation","hbt+","personer med funktionsnedsättning","inkluderande miljö","representation av kvinnor","tillgänglig för kunder med fysiska funktionshinder","inklusive för kunder med fysiska funktionshinder",
                      "다양성", "다양화", "다양화", "다양성", "다양성 효과", "인종적 표현 증가", "인종적 표현 증가", "장애인", "포괄적인 환경", '신체장애가 있는 고객도 이용 가능','신체장애가 있는 고객도 이용 가능',
                      "diversité","diversifier","diversification","diversité","effet divers","augmenter la représentation ethnique","augmenter la représentation raciale","lgbt+","personnes handicapées","environnement inclusif","représentation de femmes","accessible aux clients handicapés physiques","inclusif aux clients handicapés physiques",
                      "diversitate","diversificare","diversificare","diversitate","efect divers","creşterea reprezentării etnice","creşterea reprezentării rasiale","lgbt+","persoane cu dizabilităţi","mediu incluziv","reprezentare de femei","accesibil clienților cu dizabilități fizice","inclusiv clienților cu dizabilități fizice",
                      "sokszínűség", "diverzifikáció", "diverzifikáció", "sokszínűség", "sokszínű hatás", "nemzeti reprezentáció növelése", "faji reprezentáció növelése", "lgbt+", "fogyatékos emberek", "befogadó környezet", "reprezentáció nőké","mozgássérült ügyfelek számára hozzáférhető", "testi fogyatékkal élő ügyfelek számára is",
                      "çeşitlilik","çeşitlendirmek","çeşitlilik","çeşitlilik","çeşitlilik etkisi","etnik temsili artırmak","ırksal temsili artırmak","lgbt+","engelli insanlar","kapsayıcı ortam","temsil kadın","fiziksel engelli müşteriler için erişilebilir","fiziksel engelli müşteriler için kapsayıcı",
                      "sự đa dạng","đa dạng hóa","đa dạng hóa","sự đa dạng","hiệu ứng đa dạng","tăng đại diện sắc tộc","tăng đại diện chủng tộc","lgbt+","người khuyết tật","môi trường hòa nhập","sự đại diện của phụ nữ","có thể tiếp cận với khách hàng bị khuyết tật về thể chất","bao gồm khách hàng bị khuyết tật về thể chất",
                      "різноманітність", "урізноманітнити", "диверсифікація", "різноманітність", "ефект різноманітності", "збільшити етнічне представництво", "збільшити расове представництво", "ЛГБТ+", "люди з обмеженими можливостями", "інклюзивне середовище", "представництво жінок","доступний для клієнтів з обмеженими фізичними можливостями","включно для клієнтів з обмеженими фізичними можливостями",
                      "diversitet", "diversifikoj", "diversifikim", "diversitet", "efekt i larmishëm", "rritja e përfaqësimit etnik", "rritja e përfaqësimit racor", "lgbt+", "personat me aftësi të kufizuara", "mjedis gjithëpërfshirës", "përfaqësimi të grave", "të aksesueshëm për klientët me aftësi të kufizuara fizike", "përfshirë klientët me aftësi të kufizuara fizike",
                      "diverzita","diverzifikovať","diverzifikácia","diverzita","diverzný efekt","zvýšiť etnické zastúpenie","zvýšiť rasové zastúpenie","lgbt+","ľudia so zdravotným postihnutím","inkluzívne prostredie","zastúpenie žien","prístupné pre zákazníkov s telesným postihnutím","vrátane zákazníkov s telesným postihnutím",
                      "diversitet","diversificere","diversificering","mangfoldighed","diversitet","øge etnisk repræsentation","øge racerepræsentation","lgbt+","personer med handicap","inklusivt miljø","repræsentation af kvinder","tilgængelig for kunder med fysiske handicap","inklusive for kunder med fysiske handicap",
                      "diversiteit","diversificatie","diversificatie","diversiteit","divers effect","verhogen etnische vertegenwoordiging","verhogen raciale vertegenwoordiging","lhbt+","mensen met een handicap","inclusieve omgeving","vertegenwoordiging van vrouwen","toegankelijk voor klanten met een fysieke handicap","inclusief voor klanten met een fysieke handicap",
                      "различност", "диверзифицира", "диверзификација", "разновидност", "разновиден ефект", "зголемување на етничката застапеност", "зголемување на расната застапеност", "лгбт+", "луѓе со попреченост", "инклузивна средина", "застапеност на жени", "достапни за клиенти со физички инвалидитет", "вклучително и за клиенти со телесен инвалидитет",
                      "diversitat","diversitat","diversificació","diversitat","efecte divers","augmentar la representació ètnica","augmentar la representació racial","lgbt+","persones amb discapacitat","entorn inclusiu","representació de dones","accessible per a clients amb discapacitat física","incloent clients amb discapacitat física"]


# Keywords for the employee health & safety topic: one row per language, starting with English.
# Every entry must be followed by a comma: adjacent string literals without a comma are
# concatenated by Python into one string (this previously merged the last three Japanese
# terms with the first Italian term).
employee_health_safety_keywords = ["employee health","employee safety","health and safety","health & safety","safe working environment","employee well-being","osha","safe work environment","safety measures","health measures","employees' well-being","employee pension",
                                   "salud de los empleados","seguridad de los empleados","salud y seguridad","salud y seguridad","entorno de trabajo seguro","bienestar de los empleados","osha","entorno de trabajo seguro","medidas de seguridad", "medidas de salud","bienestar de los empleados","pensión de los empleados",
                                   "员工健康","员工安全","健康与安全","健康与安全","安全工作环境","员工福利","osha","安全工作环境","安全措施", "健康措施","员工福利","员工养老金",
                                   "saúde do funcionário","segurança do funcionário","saúde e segurança","saúde e segurança","ambiente de trabalho seguro","bem-estar do funcionário","osha","ambiente de trabalho seguro","medidas de segurança", "medidas de saúde","bem-estar dos trabalhadores","pensão dos trabalhadores",
                                   "υγεία των εργαζομένων", "ασφάλεια εργαζομένων", "υγεία και ασφάλεια", "υγεία και ασφάλεια", "ασφαλές εργασιακό περιβάλλον", "ευημερία των εργαζομένων", "osha", "ασφαλές εργασιακό περιβάλλον", "μέτρα ασφαλείας", "μέτρα υγείας", "ευημερία των εργαζομένων", "συνταξιοδότηση εργαζομένων",
                                   "здоровье сотрудников", "безопасность сотрудников", "здоровье и безопасность", "здоровье и безопасность", "безопасная рабочая среда", "благополучие сотрудников", "оша", "безопасная рабочая среда", "меры безопасности", "медицинские меры", "благополучие сотрудников", "пенсии работникам",
                                   "従業員の健康","従業員の安全","健康と安全","健康と安全","安全な職場環境","従業員の福利厚生","osha","安全な職場環境","安全対策", "健康対策","福利厚生","厚生年金",
                                   "salute dei dipendenti","sicurezza dei dipendenti","salute e sicurezza","salute e sicurezza","ambiente di lavoro sicuro","benessere dei dipendenti","osha","ambiente di lavoro sicuro","misure di sicurezza", "misure sanitarie","benessere dei dipendenti","pensione dei dipendenti",
                                   "kesehatan karyawan", "keselamatan karyawan", "kesehatan dan keselamatan", "kesehatan & keselamatan", "lingkungan kerja yang aman", "kesejahteraan karyawan", "osha", "lingkungan kerja yang aman", "tindakan keselamatan", "tindakan kesehatan", "kesejahteraan karyawan", "pensiun karyawan",
                                   "työntekijöiden terveys", "työntekijöiden turvallisuus", "terveys ja turvallisuus", "terveys ja turvallisuus", "turvallinen työympäristö", "työntekijöiden hyvinvointi", "osha", "turvallinen työympäristö", "turvallisuustoimenpiteet", "terveystoimenpiteet", "työntekijöiden hyvinvointi", "työeläke",
                                   "Gesundheit der Mitarbeiter", "Sicherheit der Mitarbeiter", "Gesundheit und Sicherheit", "Gesundheit und Sicherheit", "sicheres Arbeitsumfeld", "Wohlbefinden der Mitarbeiter", "osha", "sicheres Arbeitsumfeld", "Gesundheitsmaßnahmen", "Wohlbefinden der Mitarbeiter", "Mitarbeitervorsorge",
                                   "zdrowie pracownika","bezpieczeństwo pracownika","zdrowie i bezpieczeństwo","zdrowie i bezpieczeństwo","bezpieczne środowisko pracy","dobre samopoczucie pracownika","osha","bezpieczne środowisko pracy","środki bezpieczeństwa", "środki zdrowotne", "dobre samopoczucie pracowników", "emerytura pracownicza",
                                   "ansattes helse","ansattes sikkerhet","helse og sikkerhet","helse og sikkerhet","trygt arbeidsmiljø","ansattes velvære","osha","trygt arbeidsmiljø","sikkerhetstiltak", "helsetiltak","ansattes trivsel","arbeidstakers pensjon",
                                   "สุขภาพของพนักงาน", "ความปลอดภัยของพนักงาน", "สุขภาพและความปลอดภัย", "สุขภาพและความปลอดภัย", "สภาพแวดล้อมการทำงานที่ปลอดภัย", "ความเป็นอยู่ที่ดีของพนักงาน", "osha", "สภาพแวดล้อมการทำงานที่ปลอดภัย", "มาตรการความปลอดภัย", "มาตรการด้านสุขภาพ", "ความเป็นอยู่ที่ดีของพนักงาน", "เงินบำนาญพนักงาน",
                                   "anställdas hälsa","anställdas säkerhet","hälsa och säkerhet","hälsa och säkerhet","säker arbetsmiljö","anställdas välbefinnande","osha","säker arbetsmiljö","säkerhetsåtgärder", "hälsoåtgärder","anställdas välbefinnande","arbetstagares pension",
                                   "직원 건강","직원 안전","건강과 안전","건강과 안전","안전한 작업 환경","직원 복지","osha","안전한 작업 환경","안전 조치", "건강 대책","근로자 복지","근로자 연금",
                                   "santé des employés","sécurité des employés","santé et sécurité","santé et sécurité","environnement de travail sûr","bien-être des employés","osha","environnement de travail sûr","mesures de sécurité", "mesures sanitaires","bien-être des salariés","retraite des salariés",
                                   "sănătatea angajaților","siguranța angajaților","sănătate și siguranță","sănătate și siguranță","mediu de lucru sigur","bunăstarea angajaților","osha","mediu de lucru sigur","măsuri de siguranță", "măsuri de sănătate","bunăstarea angajaților","pensia angajaților",
                                   "munkavállalók egészsége", "munkavállalók biztonsága", "egészség és biztonság", "egészségügy és biztonság", "biztonságos munkakörnyezet", "munkavállalói jólét", "osha", "biztonságos munkakörnyezet", "biztonsági intézkedések", "egészségügyi intézkedések", "munkavállalók jóléte", "munkavállalói nyugdíj",
                                   "çalışan sağlığı", "çalışan güvenliği", "sağlık ve güvenlik", "sağlık ve güvenlik", "güvenli çalışma ortamı", "çalışanların refahı", "osha", "güvenli çalışma ortamı", "güvenlik önlemleri", "sağlık önlemleri", "çalışanların refahı", "çalışan emekli maaşı",
                                   "sức khỏe của nhân viên","an toàn của nhân viên","sức khỏe và an toàn","sức khỏe & an toàn","môi trường làm việc an toàn","hạnh phúc của nhân viên","osha","môi trường làm việc an toàn","các biện pháp an toàn", "các biện pháp sức khỏe","hạnh phúc của nhân viên","lương hưu của nhân viên",
                                   "здоров'я співробітників", "безпека співробітників", "здоров'я та безпека", "здоров'я та безпека", "безпечне робоче середовище", "благополуччя співробітників", "osha", "безпечне робоче середовище", "заходи безпеки", "заходи охорони здоров'я", "благополуччя працівників", "пенсія працівників",
                                   "shëndeti i punonjësve", "siguria e punonjësve", "shëndeti dhe siguria", "shëndeti dhe siguria", "mjedisi i sigurt i punës", "mirëqenia e punonjësve", "osha", "mjedis i sigurt i punës", "masat e sigurisë", "masat shëndetësore", "mirëqenia e punonjësve", "pensioni i punonjësve",
                                   "zdravie zamestnancov", "bezpečnosť zamestnancov", "zdravie a bezpečnosť", "zdravie a bezpečnosť", "bezpečné pracovné prostredie", "pohoda zamestnancov", "osha", "bezpečné pracovné prostredie", "bezpečnostné opatrenia", "zdravotné opatrenia", "blahobyt zamestnancov", "zamestnanecký dôchodok",
                                   "medarbejdernes sundhed","medarbejdernes sikkerhed","sundhed og sikkerhed","sundhed og sikkerhed","sikkert arbejdsmiljø","medarbejdernes trivsel","osha","sikkert arbejdsmiljø","sikkerhedsforanstaltninger", "sundhedsforanstaltninger","medarbejdernes trivsel","medarbejderpension",
                                   "gezondheid van werknemers","veiligheid van werknemers","gezondheid en veiligheid","gezondheid en veiligheid","veilige werkomgeving","welzijn van werknemers","osha","veilige werkomgeving","veiligheidsmaatregelen", "gezondheidsmaatregelen","welzijn van werknemers","pensioen van werknemers",
                                   "здравје на вработените", "безбедност на вработените", "здравје и безбедност", "здравје и безбедност", "безбедна работна средина", "благосостојба на вработените", "оша", "безбедна работна средина", "безбедни мерки", "здравствени мерки", "благосостојба на вработените", "работничка пензија",
                                   "salut dels empleats","seguretat dels empleats","salut i seguretat","salut i seguretat","entorn laboral segur","benestar dels empleats","osha","entorn laboral segur","mesures de seguretat", "mesures sanitàries","benestar dels empleats","pensió dels empleats"]


customer_welfare_keywords = ["customer welfare","consumer welfare","social welfare","human progress","materials usage","material usage","higher privacy","promoting innovation","impact on customers","impact on consumers","impact on customer","impact on consumer",
                             "bienestar del cliente","bienestar del consumidor","bienestar social","progreso humano","uso de materiales","uso de materiales","mayor privacidad","promoción de la innovación","impacto en los clientes","impacto en los consumidores ","impacto en el cliente","impacto en el consumidor",
                             "客户福利","消费者福利","社会福利","人类进步","材料使用","材料使用","更高的隐私","促进创新","对客户的影响","对消费者的影响","对客户的影响","对消费者的影响",
                             "bem-estar do cliente","bem-estar do consumidor","bem-estar social","progresso humano","uso de materiais","uso de materiais","maior privacidade","promoção da inovação","impacto nos clientes","impacto nos consumidores ","impacto no cliente","impacto no consumidor",
                             "ευημερία των πελατών", "ευημερία των καταναλωτών", "κοινωνική ευημερία", "ανθρώπινη πρόοδος", "χρήση υλικών", "χρήση υλικού", "υψηλότερη προστασία της ιδιωτικής ζωής", "προώθηση της καινοτομίας", "επίπτωση στους πελάτες", "επίδραση στους καταναλωτές ""επίδραση στον πελάτη", "επίδραση στον καταναλωτή",
                             "благосостояние клиентов", "благосостояние потребителей", "социальное благосостояние", "человеческий прогресс", "использование материалов", "использование материалов", "повышение конфиденциальности", "продвижение инноваций", "воздействие на клиентов", "воздействие на потребителей","воздействие на клиента","воздействие на потребителя",
                             "顧客福祉""消費者福祉""社会福祉""人類の進歩""資材の使用""資材の使用""プライバシーの向上""イノベーションの促進""顧客への影響""消費者への影響","顧客への影響","消費者への影響",
                             "benessere del cliente","benessere del consumatore","benessere sociale","progresso umano","uso dei materiali","uso dei materiali","maggiore privacy","promozione dell'innovazione","impatto sui clienti","impatto sui consumatori ","impatto sul cliente","impatto sul consumatore",
                             "kesejahteraan pelanggan", "kesejahteraan konsumen", "kesejahteraan sosial", "kemajuan manusia", "penggunaan material", "penggunaan material", "privasi yang lebih tinggi", "mempromosikan inovasi", "dampak pada pelanggan", "dampak pada konsumen ","dampak pada pelanggan","dampak pada konsumen",
                             "asiakkaan hyvinvointi", "kuluttajien hyvinvointi", "sosiaalinen hyvinvointi", "inhimillinen kehitys", "materiaalien käyttö", "materiaalin käyttö", "korkeampi yksityisyys", "innovaatioiden edistäminen", "vaikutus asiakkaisiin", "vaikutus kuluttajiin", "vaikutus asiakkaaseen","vaikutus kuluttajaan",
                             "Kundenwohl","Verbraucherwohl","Soziales Wohlergehen","Menschlicher Fortschritt","Materialverbrauch","Materialverbrauch","höhere Privatsphäre","Förderung von Innovationen","Auswirkungen auf Kunden","Auswirkungen auf Verbraucher ","Auswirkung auf Kunden","Auswirkung auf Verbraucher",
                             "dobro klienta", "dobro konsumenta", "dobro społeczne", "postęp człowieka", "wykorzystanie materiałów", "wykorzystanie materiałów", "wyższa prywatność", "promowanie innowacji", "wpływ na klientów", "wpływ na konsumentów","wpływ na klienta","wpływ na konsumenta",
                             "kundevelferd", "forbrukervelferd", "sosial velferd", "menneskelig fremgang", "materialbruk", "materialbruk", "høyere personvern", "fremme innovasjon", "påvirkning på kunder", "påvirkning på forbrukere","påvirkning på kunde","påvirkning på forbruker",
                             "สวัสดิการลูกค้า" "สวัสดิการผู้บริโภค" "สวัสดิการสังคม" "ความก้าวหน้าของมนุษย์" "การใช้วัสดุ" "การใช้วัสดุ" "ความเป็นส่วนตัวที่สูงขึ้น" "การส่งเสริมนวัตกรรม" "ผลกระทบต่อลูกค้า" "ผลกระทบต่อผู้บริโภค ","ผลกระทบต่อลูกค้า","ผลกระทบต่อผู้บริโภค",
                             "kundvälfärd","konsumentvälfärd","social välfärd","mänskliga framsteg","materialanvändning","materialanvändning","högre integritet","främjar innovation","påverkan på kunder","påverkan på konsumenter ","påverkan på kunden","påverkan på konsumenten",
                             "고객 복지", "소비자 복지", "사회 복지", "인간 진보", "재료 사용", "재료 사용", "높은 프라이버시", "혁신 촉진", "고객에 대한 영향", "소비자에 대한 영향","고객에게 미치는 영향","소비자에게 미치는 영향",
                             "bien-être des clients","bien-être des consommateurs","bien-être social","progrès humain","utilisation des matériaux","utilisation des matériaux","meilleure confidentialité","promotion de l'innovation","impact sur les clients","impact sur les consommateurs ","impact sur le client","impact sur le consommateur",
                             "bunăstarea clienților","bunăstarea consumatorului","bunăstarea socială","progresul uman","utilizarea materialelor","folosirea materialelor","confidențialitate mai ridicată","promovarea inovației","impactul asupra clienților","impactul asupra consumatorilor ","impactul asupra clientului","impactul asupra consumatorului",
                             "ügyféljólét", "fogyasztói jólét", "szociális jólét", "emberi fejlődés", "anyaghasználat", "anyaghasználat", "magasabb adatvédelem", "innováció előmozdítása", "vevőkre gyakorolt hatás", "fogyasztókra gyakorolt hatás","hatás az ügyfélre","hatás a fogyasztóra",
                             "müşteri refahı", "tüketici refahı", "sosyal refah", "insan gelişimi", "malzeme kullanımı", "materyal kullanımı", "yüksek gizlilik", "yeniliğin teşvik edilmesi", "müşteriler üzerindeki etki", "tüketiciler üzerindeki etki","müşteri üzerindeki etki","tüketici üzerindeki etki",
                             "phúc lợi khách hàng","phúc lợi người tiêu dùng","phúc lợi xã hội","tiến bộ con người","sử dụng vật liệu","sử dụng vật liệu","bảo mật cao hơn","thúc đẩy đổi mới","tác động đến khách hàng","tác động đến người tiêu dùng ""tác động đến khách hàng","tác động đến người tiêu dùng",
                             "благополуччя клієнтів", "благополуччя споживачів", "соціальний добробут", "людський прогрес", "використання матеріалів", "використання матеріалів", "вища конфіденційність", "сприяння інноваціям", "вплив на клієнтів", "вплив на споживачів","вплив на клієнта","вплив на споживача",
                             "mirëqenia e klientit", "mirëqenia e konsumatorit", "mirëqenia sociale", "progresi njerëzor", "përdorimi i materialeve", "përdorimi i materialit", "privatësia më e lartë", "promovimi i inovacionit", "ndikimi te klientët", "ndikimi tek konsumatorët ""ndikimi tek klienti","ndikimi tek konsumatori",
                             "blaho zákazníka", "spotrebiteľský blahobyt", "sociálny blahobyt", "ľudský pokrok", "využitie materiálov", "využitie materiálu", "vyššie súkromie", "podpora inovácií", "vplyv na zákazníkov", "vplyv na spotrebiteľov","vplyv na zákazníka", "vplyv na spotrebiteľa",
                             "kundevelfærd","forbrugervelfærd","social velfærd","menneskelige fremskridt","materialeforbrug","materialeforbrug","højere privatliv","fremme innovation","påvirkning af kunder","påvirkning af forbrugere ","påvirkning af kunde","påvirkning på forbruger",
                             "welzijn van klanten","welzijn van consumenten","sociaal welzijn","menselijke vooruitgang","materiaalgebruik","materiaalgebruik","hogere privacy","bevordering van innovatie","impact op klanten","impact op consumenten ","impact op klant","impact op consument",
                             "благосостојба на клиентите", "благосостојба на потрошувачите", "социјална благосостојба", "човечки напредок", "употреба на материјали", "користење материјали", "повисока приватност", "промовирање на иновации", "влијание врз клиентите", "влијание врз потрошувачите","влијание врз клиентот", "влијание врз потрошувачот",
                             "benestar del client","benestar del consumidor","benestar social","progrés humà","ús de materials","ús de materials","més privadesa","foment de la innovació","impacte en els clients","impacte en els consumidors ","impacte en el client","impacte en el consumidor"]

In [None]:
# import the PdfReader class from PyPDF2
from PyPDF2 import PdfReader

# define a function to get the unique words from a list of words
def getUniqueWords(allWords):
    """Return the items of ``allWords`` without duplicates, in first-seen order.

    Parameters
    ----------
    allWords : iterable
        Items to de-duplicate. Items are compared by equality, so two
        strings that differ only in case are both kept.

    Returns
    -------
    list
        A new list holding the first occurrence of every distinct item.
        The input is never modified.
    """
    # Materialise once so single-pass iterables survive the fallback below.
    allWords = list(allWords)
    try:
        # Dict keys are unique and keep insertion order, which gives an
        # order-preserving de-duplication in linear time.
        return list(dict.fromkeys(allWords))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; fall back to a
        # linear membership scan so those inputs keep working.
        uniqueWords = []
        for word in allWords:
            if word not in uniqueWords:
                uniqueWords.append(word)
        return uniqueWords

# Share of each document to scan, expressed as fractions of its page count.
# 0.0 -> 1.0 covers every page; narrow the range to scan only part of a report.
FIRST_PAGE_FRACTION = 0.0
LAST_PAGE_FRACTION = 1.0

# Map each output column of final_df to its keyword list. The lists are
# de-duplicated and lower-cased once here, because they do not change from
# one file to the next. De-duplication happens before lower-casing, so
# keywords that differ only in case are still counted separately.
keywords_by_column = {
    column: [keyword.lower() for keyword in getUniqueWords(keywords)]
    for column, keywords in (
        ("Greenhouse_Gas_Emissions", greenhouse_gas_emissions_keywords),
        ("Diversity", diversity_keywords),
        ("Employee_Health_Safety", employee_health_safety_keywords),
        ("Customer_Welfare", customer_welfare_keywords),
    )
}

# loop through each file path in the fullFilePaths list
for index, filePath in enumerate(fullFilePaths):
    # print the index every 10 iterations (just for progress tracking)
    if index % 10 == 0:
        print(index)

    try:
        # create a PdfReader object for the current file path
        reader = PdfReader(filePath)

        # translate the page fractions into a concrete page range
        page_count = len(reader.pages)
        first_page = round(page_count * FIRST_PAGE_FRACTION)
        last_page = round(page_count * LAST_PAGE_FRACTION)

        # Extract and lower-case the text of every page in the range, joining
        # once instead of growing a string page by page.
        # `or ""` keeps a page without extractable text from failing the whole
        # file -- TODO confirm whether the installed PyPDF2 can return None.
        text = "".join(
            (reader.pages[page_number].extract_text() or "").lower()
            for page_number in range(first_page, last_page)
        )

        # For each category, count how many distinct keywords occur at least
        # once in the text (presence per keyword, not number of occurrences).
        counts = {
            column: sum(keyword in text for keyword in keywords)
            for column, keywords in keywords_by_column.items()
        }
    except Exception as error:
        # Unreadable or corrupt PDF: report which file failed and why, then
        # record NaN for every category. Catching Exception (not a bare
        # except) lets Ctrl-C / kernel interrupt still stop the run.
        print(f"An exception occurred for {filePath}: {error!r}")
        counts = dict.fromkeys(keywords_by_column, float("nan"))

    # update the final_df DataFrame with the counts for each keyword category
    # NOTE(review): assumes final_df rows are labelled 0..n-1 in the same
    # order as fullFilePaths -- confirm against the cell that builds final_df.
    for column, count in counts.items():
        final_df.at[index, column] = count

In [8]:
# Destination path for the exported results (placeholder; set before running)
file_location = r'<EXPORT FILE LOCATION>'

# Write final_df to that path as CSV; index=False leaves the row index out of the file
final_df.to_csv(path_or_buf=file_location, index=False)