# PDF Reader
Running this notebook reads the PDF papers saved inside the <b>RawPaperStore</b> folder. Each PDF file is processed and its text content is saved as a TXT file inside the <b>ProcessedData</b> folder.

---

In [7]:
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import glob
import re
import string

In [8]:
# Folder layout: both stores are resolved against the current working directory.

raw_dir = 'RawPaperStore'        # folder holding the PDF papers to read
processed_dir = 'ProcessedData'  # folder receiving the extracted TXT files
file_path = os.getcwd()

# Absolute locations of the two folders.
raw_path, processed_path = (
    os.path.join(file_path, folder_name)
    for folder_name in (raw_dir, processed_dir)
)

In [9]:
#Get unprocessed paper names and paper objects

paper_titles = []  #Saves the title of each saved paper
files_pdf = []     #Saves the PdfFileReader object, for each paper

# sorted() makes the processing order reproducible; os.listdir returns
# entries in arbitrary order.
for file_name in sorted(os.listdir(raw_path)):
    # splitext only strips the final extension, so a title that itself
    # contains ".pdf" (e.g. "draft.pdf_v2.pdf") is not cut short.
    title, extension = os.path.splitext(file_name)

    # check only pdf files (case-insensitively, so ".PDF" is accepted too)
    if extension.lower() != '.pdf':
        continue

    files_pdf.append(PdfFileReader(os.path.join(raw_path, file_name)))
    paper_titles.append(title)
        

In [10]:
def text_preprocessing(input_text: str) -> str:
    """Clean one paragraph of text extracted from a PDF page.

    The cleaning steps run in this order:

    1. Bracketed spans such as reference numbers (``[1]``) are removed.
    2. E-mail addresses are removed (matched case-insensitively).
    3. A fixed set of punctuation and special characters is stripped.
    4. Leading/trailing whitespace is dropped and every run of whitespace
       (new lines included) collapses into a single space.
    5. Item markers made of one alphanumeric character or a roman numeral
       followed by a period (e.g. `` a. ``, `` 1. ``, `` iv. ``) are
       replaced by a single space.

    Parameters
    ----------
    input_text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing is left.
    """
    # The backslash is escaped explicitly so it stays part of the removed set
    # without relying on an invalid escape sequence.
    punctuation = "#$()*+-/:;<=>@[\\]^_`{|}"
    special_characters = 'å¼«¥ª°©ð±§µæ¹¢³¿®ä£⊙'

    # Raw strings keep the regex escapes out of Python's own escape handling.
    reference = r'\[.*?\]'
    email = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
    itemized = r'[(\s][0-9a-zA-Z][.)]\s+|[(\s][ivxIVX]+[.)]\s+'

    processed_text = re.sub(reference, '', input_text)

    # E-mails have to be removed BEFORE the punctuation pass: that pass deletes
    # '@', after which the e-mail pattern could never match.
    processed_text = re.sub(email, '', processed_text, flags=re.IGNORECASE)

    removed_characters = str.maketrans('', '', punctuation + special_characters)
    processed_text = processed_text.translate(removed_characters)

    # split() without arguments trims the ends and splits on any whitespace
    # run, so new lines and duplicated spaces disappear in one step.
    processed_text = ' '.join(processed_text.split())

    # NOTE(review): '(' and ')' were already stripped above, so only the
    # period-terminated forms of this pattern can match here - confirm whether
    # parenthesised markers such as "(a)" were meant to be removed as well.
    processed_text = re.sub(itemized, ' ', processed_text)

    return processed_text

In [11]:
# Extract the text of every loaded paper and save it as '<title>.txt' inside
# the processed folder, one cleaned paragraph per line.

min_spaces = 5  # a line / paragraph is kept only if it has more spaces than this

# open() does not create missing folders, so make sure the output folder exists.
os.makedirs(processed_path, exist_ok=True)

for paper_title, paper_pdf in zip(paper_titles, files_pdf):
    output_file = os.path.join(processed_path, paper_title) + '.txt'

    # The with-statement closes the file on exit; no explicit close() is needed.
    with open(output_file, 'w', encoding="utf-8") as f:
        for page_num in range(paper_pdf.numPages):
            try:
                page_content = paper_pdf.getPage(page_num).extractText()

                # A period directly followed by a new line is treated as the
                # end of a paragraph.
                for raw_paragraph in page_content.split('.\n'):
                    # Lines with few spaces are dropped before the paragraph is
                    # rebuilt (presumably headers, captions, page numbers).
                    kept_lines = [
                        line for line in raw_paragraph.split('\n')
                        if line.count(' ') > min_spaces
                    ]
                    paragraph = text_preprocessing(' '.join(kept_lines)) + '.\n'

                    if paragraph.count(' ') > min_spaces:
                        f.write(paragraph)
            except Exception as exc:
                # Best effort: one unreadable page must not abort the whole
                # paper, but the failure is reported instead of being hidden.
                # Catching Exception (not a bare except) lets KeyboardInterrupt
                # still stop the run.
                print(f'{paper_title} - page {page_num + 1} was skipped ({exc!r})')

    print(f'{paper_title} - was saved successfully')

Dark Energy by Robert Caldwell - was saved successfully
Dark Matter A Primer - was saved successfully
