### PDF Data Pipeline

1. Download the updated [Document Compilation](https://docs.google.com/spreadsheets/d/1qxb6JL9f-UxLmj8dWVrWa0H8Lx4Y_HMbew35w4W25N4/edit#gid=2111530845) as a CSV file, save it next to this notebook as `Document Compilation - UNDP SEH - All.csv`, and rerun the scripts.

2. PDF files are downloaded to the `pdf_documents/` directory (relative to this notebook).

3. The document metadata and the extracted text are saved in JSON format to `document_compilation_json/documents.json`.



In [1]:
import pandas as pd
import numpy as np
# Load the spreadsheet export; each row describes one document and carries
# the 'Link' that the download step fetches.
documents_csv_path = 'Document Compilation - UNDP SEH - All.csv'
documents = pd.read_csv(documents_csv_path)
documents

Unnamed: 0,Code,Status,Country Name,Country Code,Category,KeyWord to Search,Document Title,Exists?,Type,Publication Date,Publication Year,Unnamed: 11,Start Year,End Year,Language,Link
0,AFG-CPD-2014-EN,Completed,Afghanistan,AFG,CPD,,Country programme document for Afghanistan (20...,Y,Text,2-5 September 2014,2014,,2015,2019,EN,https://digitallibrary.un.org/record/781748/fi...
1,AFG-CPD-2014-FR,,Afghanistan,AFG,CPD,,,Y,Text,2-5 September 2014,2014,,2015,2019,FR,https://digitallibrary.un.org/record/781748/fi...
2,AFG-CPD-2014-SP,,Afghanistan,AFG,CPD,,,Y,Text,2-5 September 2014,2014,,2015,2019,SP,https://digitallibrary.un.org/record/781748/fi...
3,AFG-NEP-2015-EN,Completed,Afghanistan,AFG,NEP,,RENEWABLE ENERGYPOLICY,Y,Text,2015,2015,,2015,2023,EN,https://cdn.climatepolicyradar.org/navigator/A...
4,AFG-NREP-2013-EN,Completed,Afghanistan,AFG,NREP,,Afghanistan Rural Renewable Energy Policy,Y,Text,"April, 2013",2013,,2017,2027,EN,https://cdn.climatepolicyradar.org/navigator/A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5818,,,,,,,,,,,,,,,,
5819,,,,,,,,,,,,,,,,
5820,,,,,,,,,,,,,,,,
5821,,,,,,,,,,,,,,,,


In [None]:
import json
import pypdfium2 as pdfium
import os
import requests
from urllib.parse import urlparse
download_directory = 'pdf_documents'
os.makedirs(download_directory, exist_ok=True)

# Seconds to wait for a server before giving up on one document, so a single
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 60

# Band (in PDF units) excluded at the top and bottom of every page when
# extracting text -- presumably to skip running headers/footers; TODO confirm.
PAGE_MARGIN = 75

# Spreadsheet columns copied as-is into every JSON record.
METADATA_COLUMNS = [
    'Code', 'Status', 'Country Name', 'Country Code', 'Category',
    'KeyWord to Search', 'Document Title', 'Exists?', 'Type',
    'Publication Date', 'Publication Year', 'Start Year', 'End Year',
    'Language', 'Link',
]


def _filename_for(response, pdf_url, index):
    """Pick a safe local file name for a downloaded document.

    Prefers the ``filename=`` parameter of the ``content-disposition``
    header, falls back to the last path segment of ``pdf_url``, and finally
    to ``document_<index>.pdf`` when neither yields a name.
    """
    filename = ''
    content_disposition = response.headers.get("content-disposition")
    if content_disposition and "filename=" in content_disposition:
        # Keep only this parameter's value; further parameters follow a ';'.
        filename = content_disposition.split("filename=")[1].split(';')[0]
        filename = filename.replace('"', '').strip()
    if not filename:
        filename = os.path.basename(urlparse(pdf_url).path)
    # Strip directory components so a server-supplied name cannot point
    # outside of download_directory.
    filename = os.path.basename(filename.replace('\\', '/'))
    return filename or f"document_{index}.pdf"


def _extract_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Only the area between ``PAGE_MARGIN`` from the bottom and ``PAGE_MARGIN``
    from the top of each page is read. All pdfium handles are closed
    explicitly (text page, then page, then document).
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_texts = []
        for page_number in range(len(pdf)):
            page = pdf[page_number]
            try:
                width, height = page.get_size()
                textpage = page.get_textpage()
                try:
                    # pdf_text += pdf[i].get_textpage().get_text_range()
                    page_texts.append(
                        textpage.get_text_bounded(
                            left=0,
                            bottom=PAGE_MARGIN,
                            right=width,
                            top=height - PAGE_MARGIN,
                        )
                    )
                finally:
                    textpage.close()
            finally:
                page.close()
        return ''.join(page_texts)
    finally:
        pdf.close()


output = []  # one dict per successfully processed document
fails = []   # index labels of `documents` rows that could not be processed

for index, row in documents.iterrows():
    pdf_url = row['Link']

    # Blank spreadsheet rows have no link (NaN): record them as failures
    # without attempting a request.
    if not isinstance(pdf_url, str) or not pdf_url.strip():
        print(f"Failed to download document with index: {index} (no link)")
        fails.append(index)
        continue

    # Step 1: download. `Exception` (not a bare except) keeps the run
    # best-effort while still letting Ctrl-C / kernel interrupts through.
    try:
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except Exception as error:
        print(f"Failed to download document with index: {index} ({error})")
        fails.append(index)
        continue

    if response.status_code != 200:
        print(f"Failed to download document with index: {index} (HTTP {response.status_code})")
        fails.append(index)
        continue

    # Step 2: store the original file and extract its text.
    try:
        download_path = os.path.join(download_directory, _filename_for(response, pdf_url, index))
        with open(download_path, "wb") as file:
            file.write(response.content)
        pdf_text = _extract_text(download_path)
    except Exception as error:
        print(f"Failed to process document with index: {index} ({error})")
        fails.append(index)
        continue

    # Step 3: build the record. 'Publication Year' now reads its own column.
    document_data = {column: row[column] for column in METADATA_COLUMNS}
    document_data['Content'] = pdf_text  # Extracted text content from PDF

    output.append(document_data)
    print(f"Successfully processed document with index: {index}")
        
# os.makedirs('document_compilation_json', exist_ok=True)
# json_file_path = os.path.join('document_compilation_json', "documents.json")
# with open(json_file_path, "w") as json_file:
#     json.dump(output, json_file)
    
# download_directory_path = os.path.abspath(download_directory)

# print(f"Processed {len(output)} documents, saved orginal pdf files to {download_directory_path} and saved json format with extracted text to {json_file_path}.")
# documents.loc[fails].to_csv('download_fails.csv', index=False)

In [5]:
# Write every collected record (metadata plus extracted text) to one JSON file.
json_directory = 'document_compilation_json'
os.makedirs(json_directory, exist_ok=True)
json_file_path = os.path.join(json_directory, "documents.json")
with open(json_file_path, "w") as json_file:
    json.dump(output, json_file)

download_directory_path = os.path.abspath(download_directory)

summary = f"Processed {len(output)} documents, saved orginal pdf files to {download_directory_path} and saved json format with extracted text to {json_file_path}."
print(summary)

# Export the spreadsheet rows whose index was recorded in `fails`.
failed_rows = documents.loc[fails]
failed_rows.to_csv('download_fails.csv', index=False)

Processed 751 documents, saved orginal pdf files to /Users/gaomingrui/Documents/GitHub/dsc-energy-data/pdf_documents and saved json format with extracted text to document_compilation_json/documents.json.


### Cleaning

In [None]:
# Read the saved records back from disk and put them into a DataFrame.
with open('document_compilation_json/documents.json', 'r') as json_file:
    data = json.loads(json_file.read())
df = pd.DataFrame(data)
df

In [None]:
import re

def remove_noice(text):
    """Strip extraction noise from ``text`` and normalise its whitespace.

    Steps, in order:
      1. replace the private-use character U+F0A7 with ``;``
         (presumably a bullet glyph left over from PDF extraction -- TODO confirm),
      2. drop URL-like tokens (anything starting with ``http``, ``www`` or ``ftp``),
      3. collapse every run of whitespace (spaces, ``\\r``, ``\\n``, tabs) into a
         single space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is reduced to at most
        one space, not stripped.
    """
    # Every step must build on the previous result; restarting from `text`
    # would silently discard the U+F0A7 replacement.
    cleaned_text = text.replace('\uf0a7', ';')
    # cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)  # remove non-alphanumeric characters
    # Remove URLs before collapsing whitespace so no double space is left behind.
    cleaned_text = re.sub(r"http\S+|www\S+|ftp\S+", "", cleaned_text)
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)
    return cleaned_text

def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    (``\\w``) nor whitespace (``\\s``) removed."""
    return re.sub(r'[^\w\s]', '', text)

import nltk
from nltk.corpus import stopwords
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace; a token is kept when its lower-cased
    form is not in NLTK's English stop-word list. Kept tokens are returned
    joined by single spaces, with their original casing.
    """
    # nltk.download('stopwords') # only need to run this once
    english_stop_words = set(stopwords.words("english"))
    kept_tokens = []
    for token in text.split():
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    """Lemmatize every token of ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and each token is passed
    through ``WordNetLemmatizer.lemmatize`` using its default arguments.

    Args:
        text: the string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces. Because the tokenizer
        emits punctuation as separate tokens, punctuation ends up
        space-separated in the result.
    """
    # nltk.download('wordnet') only need to run this once
    # NOTE(review): nltk.word_tokenize also needs NLTK's tokenizer data
    # ('punkt' / 'punkt_tab' depending on the NLTK version) -- confirm it is installed.
    lemmatizer = WordNetLemmatizer()

    # Tokenize the input text into words.
    words = nltk.word_tokenize(text)

    # Lemmatize each word and collect the results in a list.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(lemmatized_words)

def clean(text):
    """Run the active cleaning steps on ``text`` and return the result.

    Active steps, in order: ``remove_noice`` then ``lemmatize_text``.
    """
    # Steps that sit between the two active ones but are currently disabled:
    #   remove_punctuation, remove_stopwords, convert_to_lowercase
    denoised = remove_noice(text)
    return lemmatize_text(denoised)

def clean_all(row):
    """Return the cleaned form of the ``'Content'`` value of ``row``."""
    return clean(row['Content'])

# axis=1 hands each row to clean_all; the result is stored in a new column.
df['clean_content'] = df.apply(clean_all, axis=1)

In [None]:
# NOTE(review): this cell repeats the `clean_all` definition and the
# `clean_content` assignment from the previous cell; consider keeping only one.
def clean_all(row):
    """Apply ``clean`` to the ``'Content'`` value of ``row`` and return it."""
    raw_content = row['Content']
    cleaned_content = clean(raw_content)
    return cleaned_content

df['clean_content'] = df.apply(clean_all, axis=1)

import os
# Save the cleaned table as a list of records, the same layout as documents.json.
os.makedirs('document_compilation_json', exist_ok=True)
json_file_path = os.path.join('document_compilation_json', "documents_cleaned.json")
with open(json_file_path, "w") as json_file:
    # A DataFrame is not JSON serializable (json.dump raises TypeError on it),
    # so convert it to plain dicts first.
    json.dump(df.to_dict(orient='records'), json_file)