In [2]:
# Importing necessary libraries
import sys, os

# Root directory that contains the report files (raw string keeps backslashes literal).
root = r'<ROOT LOCATION OF REPORT FOLDER>'

# Path of the target directory below the root.
# NOTE(review): the directory walk below scans `root`, not this path - confirm which
# of the two is intended. The value is kept so that the name `path` stays defined.
path = os.path.join(root, "targetdirectory")


def collectFilePaths(rootDir):
    """Return the full path of every file found below ``rootDir``.

    The tree is walked recursively with ``os.walk``. The result is sorted so
    that its order does not depend on the order in which the operating system
    lists directory entries.

    Parameters
    ----------
    rootDir : str
        Directory to scan. With the default ``os.walk`` error handling a
        directory that cannot be listed simply contributes no entries.

    Returns
    -------
    list of str
        Sorted full file paths (directory path joined with file name).
    """
    collected = []
    # `dirPath` is deliberately not called `path`, so the module-level `path` is not overwritten.
    for dirPath, _subdirs, files in os.walk(rootDir):
        for name in files:
            collected.append(os.path.join(dirPath, name))
    return sorted(collected)


# Full paths of all files below the root directory.
fullFilePaths = collectFilePaths(root)

# Defined for backward compatibility; nothing in this cell fills it.
fileNames = []

In [3]:
# Import the pandas library to work with dataframes
import pandas as pd

# Location of the GRI Excel workbook (raw string keeps backslashes literal).
griFullFileName = r'<GRI FILE LOCATION>'

# Maps sheet name -> dataframe read from that sheet.
fullGRI = {}

# Open the workbook once and reuse the handle for every sheet; the `with` block
# closes the file again when all sheets have been read.
with pd.ExcelFile(griFullFileName) as xls:
    # Sheets at positions 1..20 are read as yearly data sheets.
    # NOTE(review): this presumes the "Overview" sheet sits at position 0 and that the
    # yearly sheets start with "1999" - confirm against the workbook.
    yearSheetNames = xls.sheet_names[1:21]

    # The yearly sheets use the second row (header=1) as column header.
    for sheet_name in yearSheetNames:
        fullGRI[sheet_name] = pd.read_excel(xls, sheet_name=sheet_name, header=1)

    # The "Overview" sheet is read separately with the default header row.
    fullGRI["Overview"] = pd.read_excel(xls, sheet_name="Overview")

# Stack all yearly sheets in sheet order with a single concat (instead of growing the
# frame sheet by sheet); ignore_index=True gives the result a fresh 0..n-1 index.
total_df = pd.concat([fullGRI[sheet_name] for sheet_name in yearSheetNames], ignore_index=True)

# Remove every space from the 'Name' column (plain text replacement, no regex).
total_df['Name'] = total_df['Name'].str.replace(' ', '', regex=False)

# total_df now holds the rows of all yearly GRI sheets; fullGRI additionally holds "Overview".

In [4]:
# Import the regular expressions library to work with patterns
import re

# Collect one [organization, year] record per file first and build the dataframe in a
# single step afterwards; appending rows to a dataframe one at a time is slow.
records = []
for filePath in fullFilePaths:
    # File name without directory and without extension.
    fileName = os.path.splitext(os.path.basename(filePath))[0]

    # The text before the first "_" is taken as organization name,
    # the text after the last "_" as year.
    nameParts = fileName.split("_")
    records.append([nameParts[0], nameParts[-1]])

# Dataframe with one row per file and the columns "Organization" and "Year".
df = pd.DataFrame(records, columns=["Organization", "Year"])

# Convert the "Year" column to numeric values; a file name whose last part is not a
# number makes pd.to_numeric() raise here.
df['Year'] = pd.to_numeric(df['Year'])

# df now lists organization name and year for every file in fullFilePaths.

In [5]:
# Import the numpy library for numerical operations
import numpy as np

# Left-merge the file dataframe (df) with the GRI dataframe (total_df): every file row is
# kept, GRI columns are filled in where organization name and year match.
almost_df = pd.merge(df, total_df, how='left', left_on=['Organization', 'Year'], right_on=['Name', 'Publication Year'])

# The right-hand key columns duplicate "Organization"/"Year" and are dropped.
final_df = almost_df.drop(['Name', 'Publication Year'], axis=1)

# Keep only the first row per organization/year combination and renumber the index.
final_df = final_df.drop_duplicates(subset=['Organization', 'Year']).reset_index(drop=True)

# Add an all-NaN "Topics" column. It is created with object dtype because the topic
# extraction later stores Python lists and strings in it, which a float column cannot hold.
final_df["Topics"] = pd.Series(np.nan, index=final_df.index, dtype=object)
# OUTPUT: ONE FULL MERGED DATAFRAME WITH ALL THE PDF FILES AND GRI INFORMATION

In [6]:
# Import the stopwords from the NLTK corpus
from nltk.corpus import stopwords

# Start the combined stop-word set with the English stop words from the NLTK corpus
all_stop_words = set(stopwords.words('english'))

# Further languages whose stop words are merged into the set
languages = ["spanish", "chinese", "portuguese", "greek", "russian",
             "italian", "finnish", "german", "indonesian", "norwegian",
             "swedish", "french", "turkish", "hungarian", "romanian"]

# Add the stop words of each additional language to the combined set
for language in languages:
    all_stop_words.update(stopwords.words(language))

# Summary of this cell:
# - all_stop_words begins as the set of English stop words returned by stopwords.words().
# - For every entry of `languages`, stopwords.words() is called and the words are added to the set.
# - The result is one set holding the stop words of all listed languages.

In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from gensim import corpora, models
from PyPDF2 import PdfReader

# Function to get unique words from a list
def getUniqueWords(allWords):
    """Return the distinct items of ``allWords`` in order of first appearance.

    Parameters
    ----------
    allWords : iterable
        Items to de-duplicate (typically a list of words).

    Returns
    -------
    list
        A new list in which every item occurs once, ordered by first occurrence.
    """
    # Materialize first so that the fallback below sees all items even for one-shot iterables.
    items = list(allWords)
    try:
        # dict keys keep insertion order, which de-duplicates in linear time.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; compare pairwise instead.
        uniqueWords = []
        for item in items:
            if item not in uniqueWords:
                uniqueWords.append(item)
        return uniqueWords

# For every file in fullFilePaths: extract the PDF text, clean and lemmatize the tokens,
# train an LDA model on that single document and store the unique top words of its
# topics in the "Topics" column of final_df ("error" if anything fails for a file).

# Created once and reused for every file instead of once per file.
lemmatizer = WordNetLemmatizer()

# "Topics" receives Python lists and strings; make sure the column has object dtype,
# because a float column cannot hold them.
final_df["Topics"] = final_df["Topics"].astype(object)

# `index` only counts the processed files for the progress output.
for index, filePath in enumerate(fullFilePaths):
    # Print progress update every 10 files
    if index % 10 == 0:
        print(index)

    try:
        # Read PDF file using PyPDF2 and extract the text of all pages.
        # A page without extractable text is treated as empty text instead of failing the file.
        # NOTE(review): earlier code computed 10% / 90% page bounds and the ten most frequent
        # bigrams but never used either; both were dropped here. Re-introduce them if front
        # and back matter should be skipped or bigrams are needed.
        reader = PdfReader(filePath)
        text = "".join((page.extract_text() or "") for page in reader.pages)

        # Tokenize words
        tokens = word_tokenize(text)

        # Keep lower-cased, purely alphabetic tokens that are not stop words
        clean_tokens = [token.lower() for token in tokens if token.lower() not in all_stop_words and token.isalpha()]

        # Lemmatize tokens
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]

        # Create bag of words model (the corpus consists of this one document)
        dictionary = corpora.Dictionary([lemmatized_tokens])
        corpus = [dictionary.doc2bow(lemmatized_tokens)]

        # Train LDA model
        lda_model = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

        # Collect the words of all topics; each topic entry is a (word, weight) pair
        lst = []
        for _topicId, topic in lda_model.show_topics(num_topics=10, formatted=False):
            lst += [word[0] for word in topic]

        # Get an unique list of all the top topics
        topics = getUniqueWords(lst)

    except Exception as error:
        # Best effort: a file that cannot be processed is marked instead of stopping the run.
        # Exception (not a bare except) keeps Ctrl+C / kernel interrupts working.
        print("An exception occurred", filePath, repr(error))
        topics = "error"

    # Locate the row of this file by organization and year, parsed from the file name the
    # same way the "Organization"/"Year" columns were built. The loop position cannot be
    # used as row label: final_df was de-duplicated and re-indexed, so positions and row
    # labels differ as soon as two files share organization and year.
    nameParts = os.path.splitext(os.path.basename(filePath))[0].split("_")
    organization = nameParts[0]
    year = pd.to_numeric(nameParts[-1], errors="coerce")
    rowMatches = final_df.index[(final_df["Organization"] == organization) & (final_df["Year"] == year)]

    if len(rowMatches) == 0:
        print("No row in final_df for", filePath)
    else:
        # Add the topics to the final data frame for this PDF file
        final_df.at[rowMatches[0], "Topics"] = topics

In [9]:
# Destination of the exported CSV file
file_location = r'<EXPORT FILE LOCATION>'

# Write final_df to that location as CSV; the index column is left out
final_df.to_csv(path_or_buf=file_location, index=False)