In [1]:
# Importing necessary libraries
import sys, os

# Root folder that holds the report files (raw string, so backslashes in a Windows path stay literal).
root = r'<ROOT LOCATION OF REPORT FOLDER>'

# Path of "targetdirectory" below the root. Nothing in this cell reads it; the name is
# kept defined, and the walk below no longer overwrites it with the last visited folder.
path = os.path.join(root, "targetdirectory")

# Parallel lists: fullFilePaths[i] is the full path of the file whose bare name is fileNames[i]
fullFilePaths = []
fileNames = []

# Visit the root folder and every folder below it.
# os.walk yields nothing (and raises nothing) when the root folder cannot be listed,
# so both lists simply stay empty in that case.
for dirPath, _subdirs, files in os.walk(root):
    for name in files:
        # Record the file together with the folder it was found in
        fullFilePaths.append(os.path.join(dirPath, name))
        fileNames.append(name)

# The final output will be the full file paths of all the files within the specified root directory.

In [2]:
# Import the pandas library to work with dataframes
import pandas as pd

# Location of the GRI Excel workbook (raw string, so backslashes are not treated as escapes)
griFullFileName = r'<GRI FILE LOCATION>'

# Dictionary of dataframes, keyed by sheet name
fullGRI = {}

# Open the workbook once and read every sheet from that single handle.
# The with-block closes the file when reading is done, so `xls` is closed afterwards.
with pd.ExcelFile(griFullFileName) as xls:
    # Sheets at positions 1..20 are read as data sheets; their column names are taken
    # from the second row of the sheet (header=1).
    dataSheetNames = xls.sheet_names[1:21]
    for sheet_name in dataSheetNames:
        fullGRI[sheet_name] = xls.parse(sheet_name, header=1)

    # The "Overview" sheet is read separately with the default header row
    fullGRI["Overview"] = xls.parse("Overview")

# Stack all data sheets (everything except "Overview") in one call, in sheet order.
# ignore_index=True gives the combined dataframe a fresh 0..n-1 index.
total_df = pd.concat([fullGRI[sheet_name] for sheet_name in dataSheetNames], ignore_index=True)

# Remove every space from the 'Name' column (plain-text replacement, not a regular expression)
total_df['Name'] = total_df['Name'].str.replace(' ', '', regex=False)

# The resulting concatenated dataframe total_df contains all GRI data from 1999 to 2018

In [3]:
# Import the regular expressions library to work with patterns
import re

# Collect one (organization, year) pair per file, in the same order as fullFilePaths
records = []
for filePath in fullFilePaths:
    # File name without its folder and without its extension
    fileName = os.path.splitext(os.path.basename(filePath))[0]

    # The text before the first "_" is taken as the organization,
    # the text after the last "_" as the year
    nameParts = fileName.split("_")
    records.append((nameParts[0], nameParts[-1]))

# Build the dataframe in a single step instead of appending one row at a time
df = pd.DataFrame(records, columns=["Organization", "Year"])

# Convert the "Year" column to numeric values; pd.to_numeric raises if a value is not a number
df['Year'] = pd.to_numeric(df['Year'])

# The resulting dataframe df contains information about the PDF files, including the organization name and year

In [32]:
import numpy as np

# Left-join the file table (df) with the GRI table (total_df): every row of df is kept,
# matched on organization name and year
almost_df = pd.merge(
    df,
    total_df,
    how='left',
    left_on=['Organization', 'Year'],
    right_on=['Name', 'Publication Year'],
)

# The right-hand key columns duplicate 'Organization' and 'Year', so drop them
final_df = almost_df.drop(['Name', 'Publication Year'], axis=1)

# Keep the first row for each organization/year pair and renumber the rows from 0
final_df = final_df.drop_duplicates(subset=['Organization', 'Year']).reset_index(drop=True)

# Add the two result columns, initially filled with NaN.
# They are created with object dtype because the sentiment cell later stores a text label
# in "Overall sentiment" and a dictionary of scores in "sentiment"; writing those into a
# float column is a dtype-incompatible assignment.
final_df["Overall sentiment"] = pd.Series(np.nan, index=final_df.index, dtype=object)
final_df["sentiment"] = pd.Series(np.nan, index=final_df.index, dtype=object)

# The resulting dataframe 'final_df' holds one row per organization/year pair found among the files,
# with the matching GRI information and two still-empty sentiment columns.

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from PyPDF2 import PdfReader
from langdetect import detect

# Initialize the VADER sentiment analyzer. The constructor raises LookupError when the
# "vader_lexicon" NLTK resource is not installed, so fetch it once and try again.
try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    analyzer = SentimentIntensityAnalyzer()

# Share of a document's page count at which the analysed section starts and ends
FIRST_PAGE_FRACTION = 0.35
LAST_PAGE_FRACTION = 0.65


def _extract_middle_text(pdf_path):
    """Return the concatenated text of the middle pages of the PDF at ``pdf_path``.

    The pages from FIRST_PAGE_FRACTION up to (not including) LAST_PAGE_FRACTION of the
    page count are read. Pages without extractable text contribute an empty string.
    Any error raised while opening or reading the PDF is passed on to the caller.
    """
    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)
    first_page = round(page_count * FIRST_PAGE_FRACTION)
    last_page = round(page_count * LAST_PAGE_FRACTION)
    # For very short documents both bounds can round to the same page, which would
    # select nothing; read at least one page whenever the document has any.
    last_page = min(max(last_page, first_page + 1), page_count)
    return "".join(
        reader.pages[page_number].extract_text() or ""
        for page_number in range(first_page, last_page)
    )


def _label_sentiment(compound):
    """Map a VADER compound score to "Positive", "Negative" or "Neutral" (exactly 0)."""
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"


# Both result columns must be able to hold Python objects (a text label, a dictionary)
for column in ("Overall sentiment", "sentiment"):
    final_df[column] = final_df[column].astype(object)

# final_df was de-duplicated on organization/year, so it can have fewer rows than there are
# files. Look rows up by (organization, year) instead of assuming row i belongs to file i.
row_for_report = {}
for row_label, organization, year in zip(final_df.index, final_df["Organization"], final_df["Year"]):
    row_for_report.setdefault((organization, year), row_label)

# Loop through each file path in the list of full file paths
for index, filePath in enumerate(fullFilePaths):

    # Print the current position every 10 files for progress monitoring
    if index % 10 == 0:
        print(index)

    # Rebuild the organization/year key from the file name: text before the first "_"
    # and, as a number, text after the last "_"
    fileName = os.path.splitext(os.path.split(filePath)[1])[0]
    nameParts = fileName.split("_")
    reportKey = (nameParts[0], pd.to_numeric(nameParts[-1], errors="coerce"))

    # Each row is scored once, by the first file that maps to it; later files with the
    # same organization/year find no row left and are skipped.
    row_label = row_for_report.pop(reportKey, None)
    if row_label is None:
        print(f"Skipping {filePath}: no unscored row for this organization/year")
        continue

    try:
        text = _extract_middle_text(filePath)

        # Analyze the sentiment of the extracted text using VADER
        scores = analyzer.polarity_scores(text)

        # Determine the overall sentiment based on the compound score
        OverallSentiment = _label_sentiment(scores['compound'])
    except Exception as error:
        # Report which file failed and why, and record "no result" for it
        # rather than reusing the values left over from the previous file
        print(f"An exception occurred while processing {filePath}: {error!r}")
        OverallSentiment = None
        scores = {}

    # Store the label and the score dictionary in the row that belongs to this file
    final_df.at[row_label, "Overall sentiment"] = OverallSentiment
    final_df.at[row_label, "sentiment"] = scores

# The purpose of this loop is to analyze the sentiment of the text in each PDF file and add the results to the matching row of the 'final_df' dataframe.
# The sentiment analysis is performed using VADER (Valence Aware Dictionary and sEntiment Reasoner), which is a rule-based sentiment analysis tool specifically designed for social media text.
# The extracted text is only from a selection of pages, between 35% and 65% of the total pages (at least one page when the document has any).
# The overall sentiment is determined based on the compound score of the sentiment analysis.
# If a file cannot be processed, a message naming the file and the error is printed, its "Overall sentiment" is left empty and its "sentiment" is an empty dictionary.

In [40]:
# Destination of the CSV export (raw string, so backslashes in the path stay literal).
# Later cells read this same variable to load the file again.
file_location = r'<EXPORT FILE LOCATION>'

# Write final_df to that destination; index=False leaves the row index out of the file
final_df.to_csv(path_or_buf=file_location, index=False)

In [66]:
from ast import literal_eval
import numpy as np

def _parse_sentiment(cell):
    """Turn one CSV cell of the 'sentiment' column back into a dictionary.

    A non-empty string is evaluated with ast.literal_eval, which only accepts Python
    literals. Anything else (an empty cell is read back as NaN) yields an empty
    dictionary, so rows without scores do not stop the conversion.
    """
    if isinstance(cell, str) and cell.strip():
        return literal_eval(cell)
    return {}


# Read the CSV file into a dataframe
df2 = pd.read_csv(file_location)

# Only flatten when the 'sentiment' column is present. The last cell of the notebook writes
# the flattened dataframe back to this same file, after which the column no longer exists;
# re-running this cell then simply reloads the already flattened data.
if "sentiment" in df2.columns:
    # Remove the column from the dataframe and convert each cell to a dictionary
    sentiment_dicts = df2.pop("sentiment").apply(_parse_sentiment)

    # Spread the dictionary keys into separate columns and attach them row by row;
    # rows with an empty dictionary get NaN in every new column
    df2 = df2.join(pd.json_normalize(sentiment_dicts.tolist()))

# The purpose of this code is to convert the 'sentiment' column from a string to a dictionary, and then normalize the dictionary into separate columns.
# The ast.literal_eval() function is used to safely evaluate the string in each 'sentiment' cell as a Python literal (in this case, a dictionary); empty cells become empty dictionaries.
# The pd.json_normalize() function is then used to convert the list of dictionaries into separate columns.
# The original 'sentiment' column is removed from the dataframe and replaced with the separate columns that were created from the dictionaries.

In [69]:
# Save the flattened dataframe as CSV, leaving out the row index.
# file_location is the same path the earlier export wrote to, so that file is replaced.
df2.to_csv(path_or_buf=file_location, index=False)