In [8]:
import os
import fitz
import re
import json
from nltk.stem import PorterStemmer
import glob

# Shared Porter stemmer instance; clean() calls ps.stem() on every token.
ps = PorterStemmer()

In [9]:
def extract_text_from_folder(folder_path):
    """Extract the plain text of every PDF located directly in ``folder_path``.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    read; sub-folders are not traversed and other files are ignored.

    Parameters
    ----------
    folder_path : str
        Directory that contains the PDF files.

    Returns
    -------
    dict
        Maps each PDF file name (not the full path) to the text of all its
        pages, where every page's text is preceded by a single space.
    """
    full_texts = {}

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        if not (os.path.isfile(file_path) and filename.lower().endswith('.pdf')):
            continue

        # Open the document as a context manager so its file handle is
        # released as soon as the pages are read, instead of leaking one
        # open handle per PDF for the lifetime of the kernel.
        with fitz.open(file_path) as doc:
            # join() avoids re-copying the growing string once per page.
            full_texts[filename] = "".join(" " + page.get_text() for page in doc)

    return full_texts

# Provide the path to the folder containing the PDF files
pdf_folder_path = '../theses_raw'
# Folder that receives one .txt file per PDF
pdf_out_folder_path = './full_text'

# Extract the full text of every PDF in the input folder
full_texts = extract_text_from_folder(pdf_folder_path)

# The output folder does not exist on a fresh checkout; create it up front so
# the writes below do not fail with FileNotFoundError.
os.makedirs(pdf_out_folder_path, exist_ok=True)

# Save the text as "<pdf file name>.txt" and log each file as it is written
for filename, full_text in full_texts.items():
    print(f'Filename: {filename}')
    with open(f'{pdf_out_folder_path}/{filename}.txt', 'w') as f:
        f.write(full_text)

Filename: 20170221 LS Lining Yu DISS.pdf
Filename: 20180212 PB Burdejova DISS.pdf
Filename: Lehmann.pdf
Filename: 20171018 TB Thijs Benschop DISS.pdf
Filename: 20180107 AZ Alona Zharova DISS.pdf
Filename: nasekin.pdf
Filename: dissertation_zharova_alona.pdf
Filename: dissertation_klochkov_yegor.pdf
Filename: Kleinow.pdf
Filename: 20161114 FS Franziska Schulz DISS.pdf
Filename: 20180531 AP Alla Pethukina DISS.pdf
Filename: 20181224 LAX Larisa Adamyan DISS.pdf
Filename: Mercurio.pdf
Filename: 20230131 DISS KH K_Haeusler.pdf
Filename: song.pdf
Filename: 20220816 MA M G Althof DISS.pdf
Filename: 20190617 EK Klochkov DISS.pdf
Filename: weining.pdf
Filename: 20181009 LZ Lenka Zbonakova DISS.pdf
Filename: 20171005 SC Shi Chen DISS.pdf
Filename: 20230415 BW Bingling Wang DISS.pdf
Filename: 20230320 MBL Min-Bin Lin DISS.pdf
Filename: lopez-cabrera.pdf
Filename: 20171218 HPT Hien Pham Thu DISS.pdf
Filename: mihoci.pdf
Filename: 20220530 IP Ingolf Pernice TU DISS.pdf
Filename: 20170120 SN Sergey 

In [10]:
def word_count(str):
    """Count how often each whitespace-separated token occurs in ``str``.

    Returns a dict mapping every distinct token to its number of
    occurrences; keys appear in order of first occurrence.
    """
    tally = {}
    for token in str.split():
        tally[token] = tally.get(token, 0) + 1
    return tally

In [11]:
def clean(file_path):
    '''PDF cleaning procedure.

    Reads the text file at ``file_path`` and reduces it to lower-case ASCII
    words of at least three letters, separated by single spaces.

    Steps, in order:
      1. drop every token that starts at "http" (URLs),
      2. replace every character that is not an ASCII letter with a space,
      3. drop all words of one or two letters,
      4. collapse runs of whitespace into one space,
      5. lower-case the text and stem each word with the module-level ``ps``.

    Parameters
    ----------
    file_path : str
        Path of the text file to clean.

    Returns
    -------
    tuple
        ``(pdf_output, word_cloud, search_data)``: the cleaned text, the
        result of ``word_count`` on the cleaned text, and the stemmed words
        joined by single spaces.
    '''

    with open(file_path) as f:
        pdf_output = f.read()

    # # cleaning URLs
    # \S* stops at ANY whitespace. The text still contains newlines at this
    # point, so stopping only at a space would also swallow the first word of
    # the following line whenever a URL ends a line.
    pdf_output = re.sub(pattern=r"http\S*", repl=" ", string=pdf_output)

    # # cleaning symbols
    # Everything that is not an ASCII letter (newlines, digits, punctuation,
    # underscores, non-ASCII characters) becomes a space.
    pdf_output = re.sub(pattern=r"[^a-zA-Z]", repl=" ", string=pdf_output)

    # # cleaning out 1-2-worders
    # Word boundaries let a single pass remove every short word, including
    # long runs of them and ones at the very start or end of the text, which
    # a pattern anchored on surrounding spaces cannot match.
    pdf_output = re.sub(pattern=r"\b[a-zA-Z]{1,2}\b", repl=" ", string=pdf_output)

    # # cleaning multispaces
    pdf_output = re.sub(pattern=r"\s{2,}", repl=" ", string=pdf_output)

    # # lower-casing
    pdf_output = pdf_output.lower()

    # Splitting on a single space keeps any leading/trailing space of the
    # cleaned text in the stemmed text as well.
    pdf_output_stemmed = [ps.stem(word) for word in pdf_output.split(" ")]

    search_data = ' '.join(pdf_output_stemmed)
    word_cloud = word_count(pdf_output)

    return pdf_output, word_cloud, search_data

In [12]:
# The three output folders do not exist on a fresh checkout; create them up
# front so the writes below do not fail with FileNotFoundError.
for subfolder in ('cleaned_text', 'word_clouds', 'stemmed_text'):
    os.makedirs(f'{pdf_out_folder_path}/cleaned/{subfolder}', exist_ok=True)

# Clean every extracted "<name>.pdf.txt" file and store the cleaned text, the
# word counts (as JSON) and the stemmed text in their respective folders.
for filename in os.listdir(pdf_out_folder_path):
    file_path = os.path.join(pdf_out_folder_path, filename)

    if os.path.isfile(file_path) and filename.lower().endswith('.pdf.txt'):
        pdf_output_cleaned, word_cloud, search_data = clean(file_path)
        with open(f'{pdf_out_folder_path}/cleaned/cleaned_text/{filename}.txt', 'w') as f:
            f.write(pdf_output_cleaned)
        with open(f'{pdf_out_folder_path}/cleaned/word_clouds/{filename}.json', 'w') as f:
            f.write(json.dumps(word_cloud))
        with open(f'{pdf_out_folder_path}/cleaned/stemmed_text/{filename}.txt', 'w') as f:
            f.write(search_data)

In [13]:
# Concatenate all cleaned texts into a single all.txt.
# The glob also matches an all.txt left over from a previous run; it must be
# excluded, otherwise the file would be truncated by the "wb" open below and
# then read back while it is being written. Sorting makes the concatenation
# order independent of the filesystem's listing order.
read_files = sorted(
    path
    for path in glob.glob(f'{pdf_out_folder_path}/cleaned/cleaned_text/*.txt')
    if os.path.basename(path) != 'all.txt'
)

with open(f'{pdf_out_folder_path}/cleaned/cleaned_text/all.txt', "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())

In [14]:
# Concatenate all stemmed texts into a single all.txt.
# The glob also matches an all.txt left over from a previous run; it must be
# excluded, otherwise the file would be truncated by the "wb" open below and
# then read back while it is being written. Sorting makes the concatenation
# order independent of the filesystem's listing order.
read_files = sorted(
    path
    for path in glob.glob(f'{pdf_out_folder_path}/cleaned/stemmed_text/*.txt')
    if os.path.basename(path) != 'all.txt'
)

with open(f'{pdf_out_folder_path}/cleaned/stemmed_text/all.txt', "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())