In [3]:
# Importing necessary libraries
import sys, os

# Root directory that contains the report files (raw string so backslashes stay literal).
root = r'<ROOT LOCATION OF REPORT FOLDER>'

# Path of the target directory inside the root. The walk below scans `root`, not this
# path; the walk uses its own variable name so this value is no longer overwritten.
path = os.path.join(root, "targetdirectory")

# Full paths and bare names of every file found. The two lists are index-aligned:
# fileNames[i] is the last component of fullFilePaths[i].
fullFilePaths = []
fileNames = []

# Walk every directory below root. os.walk yields nothing (and raises nothing) when
# root does not exist, so an unreplaced placeholder simply leaves both lists empty.
for dirpath, subdirs, files in os.walk(root):
    # os.walk reports entries in arbitrary, platform-dependent order. Sorting the
    # sub-directories in place fixes the traversal order and sorting the file names
    # fixes the order inside each directory, so every run produces the same list.
    subdirs.sort()
    for name in sorted(files):
        fullFilePaths.append(os.path.join(dirpath, name))
        fileNames.append(name)

# Result: fullFilePaths holds the full path of every file below root, in a stable order.

In [4]:
# Import the pandas library to work with dataframes
import pandas as pd

# Location of the GRI Excel workbook (raw string so backslashes stay literal).
griFullFileName = r'<GRI FILE LOCATION>'

# One dataframe per sheet, keyed by sheet name.
fullGRI = {}

# Open the workbook once and reuse the handle for every sheet instead of letting
# pd.read_excel re-open and re-parse the file per sheet. The context manager closes
# the file handle when the block ends.
with pd.ExcelFile(griFullFileName) as xls:
    # Copy the names while the workbook is open; they are needed again below.
    gri_sheet_names = list(xls.sheet_names)

    # Sheets at positions 1..20 are read with the second spreadsheet row as header.
    # Position 0 is skipped here (presumably the "Overview" sheet - verify against
    # the workbook).
    for sheet_name in gri_sheet_names[1:21]:
        fullGRI[sheet_name] = pd.read_excel(xls, sheet_name=sheet_name, header=1)

    # The "Overview" sheet is read by name, with the default header row.
    fullGRI["Overview"] = pd.read_excel(xls, sheet_name="Overview")

# Stack the "1999" sheet and the sheets at positions 2..20 into one dataframe with a
# fresh 0..n-1 index. A single concat over the whole list avoids re-copying the
# accumulated frame on every iteration. Raises KeyError if no sheet named "1999"
# was read above.
total_df = pd.concat(
    [fullGRI["1999"]] + [fullGRI[sheet_name] for sheet_name in gri_sheet_names[2:21]],
    ignore_index=True,
)

# Strip every space character from 'Name' (literal replacement, not a regex).
total_df['Name'] = total_df['Name'].str.replace(' ', '', regex=False)

# Result: total_df holds the rows of all yearly GRI sheets read above.

In [5]:
# Import the regular expressions library to work with patterns
import re

# Parse every file name into an (organization, year) record. The records are collected
# in a plain list and the dataframe is built once afterwards; appending to a dataframe
# row by row re-allocates it on every insert.
records = []
for filePath in fullFilePaths:
    # File name without directory and without extension
    fileName = os.path.splitext(os.path.basename(filePath))[0]

    # Text before the first "_" is the organization, text after the last "_" is the
    # year. A name without "_" yields the whole name for both fields.
    parts = fileName.split("_")
    records.append((parts[0], parts[-1]))

# One row per entry of fullFilePaths, in the same order.
df = pd.DataFrame(records, columns=["Organization", "Year"])

# Convert "Year" to numbers. pd.to_numeric raises ValueError when a file name does not
# end in a numeric token, which points at stray files in the report folder.
df['Year'] = pd.to_numeric(df['Year'])

# Result: df lists the organization and year parsed from each file name.

In [6]:
# INPUT: GRI DATAFRAME and PDF FILES DATAFRAME
import numpy as np

# Left-join the file table with the GRI table: every row of df is kept, and GRI columns
# are attached where (Organization, Year) equals (Name, Publication Year).
almost_df = pd.merge(
    df,
    total_df,
    how='left',
    left_on=['Organization', 'Year'],
    right_on=['Name', 'Publication Year'],
)

# Drop the right-hand key columns (they duplicate Organization / Year on matched rows),
# keep the first row per (Organization, Year), and renumber the rows 0..n-1.
final_df = (
    almost_df
    .drop(['Name', 'Publication Year'], axis=1)
    .drop_duplicates(subset=['Organization', 'Year'])
    .reset_index(drop=True)
)

# Add an empty "Language" column. It is created with object dtype because it is filled
# with strings later; a plain `= np.nan` column would be float64, and writing strings
# into a float64 column is deprecated silent upcasting in recent pandas.
final_df["Language"] = pd.Series(np.nan, index=final_df.index, dtype="object")

# OUTPUT: ONE FULL MERGED DATAFRAME WITH ALL THE PDF FILES AND GRI INFORMATION

In [None]:
# Imports
from PyPDF2 import PdfReader
from langdetect import detect

# langdetect is randomised by default and may return different languages for the same
# text between runs; fixing the seed makes the results reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0


def detect_pdf_language(pdf_path):
    """Return the language detected in a sample of pages from one PDF file.

    The sample runs from the page at 50% of the document up to (not including) the
    page at 75%. Only the lower-cased text of those pages is passed to detect().

    Any exception raised by PdfReader, extract_text() or detect() propagates to the
    caller; this includes the case where the sampled pages contain no usable text.
    """
    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)
    first_page = round(page_count * 0.5)
    # round() rounds halves to the nearest even number, so a 3-page file gives
    # round(1.5) == round(2.25) == 2 and the range would be empty. Always sample at
    # least one page, but never read past the end of the document.
    last_page = min(page_count, max(round(page_count * 0.75), first_page + 1))

    text = "".join(
        (reader.pages[page_number].extract_text() or "").lower()
        for page_number in range(first_page, last_page)
    )
    return detect(text)


# df was built from fullFilePaths, one row per file and in the same order. It supplies
# the (Organization, Year) key of each file, so the lengths must agree.
if len(df) != len(fullFilePaths):
    raise ValueError("df and fullFilePaths differ in length; re-run the cells that build them")

# The column receives strings below, so make sure it is not a float64 NaN column.
final_df["Language"] = final_df["Language"].astype(object)

# final_df holds one row per (Organization, Year), so a file's position in
# fullFilePaths is NOT its row in final_df once two files share a key. Look the row up
# by key instead of counting files.
row_for_key = {
    (organization, year): row_label
    for row_label, organization, year in zip(
        final_df.index, final_df["Organization"], final_df["Year"]
    )
}
processed_rows = set()

# PROCESSING PDF FILES
for index, (filePath, organization, year) in enumerate(
    zip(fullFilePaths, df["Organization"], df["Year"])
):
    if index % 10 == 0:  # print the file counter every 10 files to track progress
        print(index)

    row_label = row_for_key.get((organization, year))
    if row_label is None:
        # No row carries this key, so final_df was not built from the current df.
        print(f"No row in final_df for {filePath}; skipping")
        continue
    if row_label in processed_rows:
        # A previous file with the same organization and year already filled this row;
        # the first file wins, matching the first-row-wins de-duplication of final_df.
        continue
    processed_rows.add(row_label)

    try:
        language = detect_pdf_language(filePath)
    except Exception as exc:
        # Unreadable PDFs and pages without detectable text are recorded as "error".
        # Exception (not a bare except) lets KeyboardInterrupt stop a long run.
        print(f"An exception occurred while processing {filePath}: {exc!r}")
        language = "error"

    final_df.at[row_label, "Language"] = language

#OUTPUT: final_df with language column added.

In [20]:
# Destination of the exported CSV file (raw string so backslashes stay literal)
file_location = r'<EXPORT FILE LOCATION>'

# Write final_df to that destination; index=False keeps the row index out of the file
final_df.to_csv(path_or_buf=file_location, index=False)