In [28]:
#Import all Libraries - RUN THIS BEFORE ANY TESTING

import os
import requests
import tkinter as tk
import PyPDF2
import re
from tkinter import filedialog
from bs4 import BeautifulSoup
import requests
import time
import torch
import soundfile as sf
from pydub import AudioSegment

In [29]:
#This cell points the script to GROBID's API, allows the user to choose a PDF and specify a page range, then saves the edited PDF.

#RUN GROBID FIRST USING ./gradlew run IN THE GROBID DIRECTORY

# Define the URL for GROBID's PDF conversion endpoint
grobid_url = "http://localhost:8070/api/processFulltextDocument"

#Prompt user to select PDF file
root = tk.Tk()
root.withdraw()
pdf_file_path = filedialog.askopenfilename()
# Tear down the hidden root window once the dialog has closed so it does not linger
root.destroy()

# The dialog returns an empty value when it is cancelled; stop here with a clear message
if not pdf_file_path:
    raise ValueError("No PDF file selected.")

# Prompt user to enter start and end page numbers (1-based, inclusive)
start_page = int(input("Enter start page number: "))
end_page = int(input("Enter end page number: "))

# The output PDF is saved to the same directory as the original PDF
output_file_path = os.path.splitext(pdf_file_path)[0] + '_EDITED.pdf'

# Keep the source PDF open only while it is being read and copied, then close it
with open(pdf_file_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

    # Clamp the range to the document: a start below 1 would otherwise become a
    # negative index and silently pull pages from the end of the PDF
    start_page = max(start_page, 1)
    end_page = min(end_page, total_pages)
    if start_page > end_page:
        raise ValueError(
            f"Invalid page range {start_page}-{end_page} for a PDF with {total_pages} pages."
        )

    # Create a PDF writer object
    pdf_writer = PyPDF2.PdfWriter()

    # Loop over the selected pages and add them to the PDF writer
    for page_num in range(start_page - 1, end_page):
        pdf_writer.add_page(pdf_reader.pages[page_num])

    # Write while the source file is still open, since the writer copies from the reader's pages
    with open(output_file_path, 'wb') as f:
        pdf_writer.write(f)

edited_pdf = output_file_path
print("Saved to " + edited_pdf)

Enter start page number:  1
Enter end page number:  100


Saved to/home/sean/Downloads/1RobertsvQueen3_EDITED.pdf


In [30]:
#This cell defines the parameters for GROBID's XML conversion, parses text through GROBID and outputs it as a .xml file

# Redefine pdf_file_path as the new, edited PDF file
pdf_file_path = output_file_path

## Send the edited PDF file to GROBID for conversion to XML with sentence-level segmentation
with open(pdf_file_path, 'rb') as f:
    # No 'start'/'end' parameters are sent: the edited PDF already contains only the
    # selected pages and is numbered from 1, so the original page numbers would point
    # at the wrong pages (or past the end) of this file.
    # NOTE(review): GROBID's documentation describes its options as multipart form
    # fields, while these are sent as query-string parameters - confirm which of
    # them the service actually honours.
    params = {
        'segmentation': 'teiCoordinates,sentences',
        'consolidate_citations': False,
        'consolidate_header': False,
        'tei': '''
            <tei>
                <teiHeader>
                    <!-- Add any modifications to the header here -->
                </teiHeader>
                <text>
                    <body>
                        <div type="abstract">
                            {abstract}
                        </div>
                        <div type="main">
                            {main}
                        </div>
                    </body>
                </text>
            </tei>
        ''',
        'input': 'file',
        'consolidateHeader': 1,
        'consolidateCitations': 1,
        'includeRawCitations': 0,
        'teiCoordinates': 1,
        'disableLinks': 1, # Add this parameter to disable footnote extraction
        'disableFootnotes': 1, # Add this parameter to disable footnotes processing
    }
    files = {'input': f}
    response = requests.post(grobid_url, params=params, files=files)

# Stop on an HTTP error instead of saving an error page as if it were XML;
# the edited PDF is left in place in that case so this cell can simply be re-run
response.raise_for_status()
if not response.text.strip():
    raise ValueError("GROBID returned an empty response for " + pdf_file_path)

# Save the GROBID XML response next to the edited PDF with the page range and a '_GROBID' suffix and a .xml extension
grobid_filename = os.path.splitext(pdf_file_path)[0] + f'_{start_page}-{end_page}_GROBID.xml'
with open(grobid_filename, 'w') as f:
    f.write(response.text)

# The edited PDF is only an intermediary; remove it once the XML has been saved
os.remove(edited_pdf)

In [31]:
#This cell extracts the body of the .xml file, preserving all tags.

# Read in the contents of the GROBID XML file
with open(grobid_filename, 'r') as file:
    file_contents = file.read()

# Extract everything from the opening <body ...> tag to the last </body> tag.
# The opening tag may carry attributes, so it is not matched as a bare '<body>'.
body_match = re.search(r'<body\b[^>]*>.*</body>', file_contents, re.DOTALL)
if body_match is None:
    # The intermediary file is kept in this case so the GROBID output can be inspected
    raise ValueError("No <body> element found in " + grobid_filename)
body_contents = body_match.group(0)

# Construct the new file name by appending "2" to the original file name
new_file_name = os.path.splitext(grobid_filename)[0] + "2.xml"

# Write the extracted contents to a new file with the new file name
with open(new_file_name, 'w') as new_file:
    new_file.write(body_contents)

cleaned_xml = new_file_name
print("Saved cleaned .xml to: " + cleaned_xml)

#Removes the intermediary file
os.remove(grobid_filename)

Saved cleaned .xml to:/home/sean/Downloads/1RobertsvQueen3_EDITED_1-26_GROBID2.xml


In [32]:
#This cell formats the .xml file: it flattens the body markup into plain text and saves the result as a .txt file

# Open the existing .xml file
with open(cleaned_xml, 'r') as f:
    xml_content = f.read()

# Replace <s> and <p> tags with their text plus a trailing space, and put the text of <head> tags on its own line
soup = BeautifulSoup(xml_content, 'xml')
for tag in soup.find_all(['p', 's']):
    tag.replace_with(tag.text + ' ')
for tag in soup.find_all('head'):
    tag.replace_with('\n' + tag.text + '\n')
cleaned_text = soup.get_text()

# Second pass: re-parse the extracted text as HTML and extract its text again
# NOTE(review): presumably a safety net for markup that survived the first pass - confirm it is still needed
soup = BeautifulSoup(cleaned_text, 'html.parser')
cleaned_text = soup.get_text()

# Third pass: re-parse once more, unwrap any <style>/<script> tags, and serialise the result with prettify
# NOTE(review): the text has already been through get_text() twice at this point - confirm this pass changes anything
soup = BeautifulSoup(cleaned_text, 'html.parser')
for tag in soup.find_all(['style', 'script']):
    tag.unwrap()
cleaned_text = soup.prettify(formatter=None)

# Save the final cleaned text in a .txt file with an appended '_CLEANED' suffix
cleaned_filename = os.path.splitext(cleaned_xml)[0] + '_CLEANED.txt'
with open(cleaned_filename, 'w') as f:
    f.write(cleaned_text)

#Removes the intermediary file, saves new variables for next cell
os.remove(cleaned_xml)
# 'filename' is the path the next cell reads the text from
filename = cleaned_filename
print("Cleaned up" + filename)

Cleaned up/home/sean/Downloads/1RobertsvQueen3_EDITED_1-26_GROBID2_CLEANED.txt


In [33]:
#This cell uses regular expressions to clean up the text

# Define the regular expressions to use for cleaning the text
square_brackets_pattern = r'\[[^\[\]a-zA-Z]{1,30}\]'
parentheses_pattern = r'\([^()]{1,75}\)'
url_regex = re.compile(r'(?:https?://|www\.)\S+\b')
date_regex = re.compile(r'\d{1,2}\/\d{1,2}\/(?:\d{4}|\d{2})|\d{4}-\d{2}-\d{2}')
# NOTE(review): timestamp_regex is defined but not applied below - confirm whether times should be removed
timestamp_regex = re.compile(r'\b\d{1,2}(?::\d{2}){1,2}(?: ?[AP]M)?(?!\w|\.)\b')
# Case-insensitive so that lower-case DOI suffixes and both 'doi:' and 'DOI:' prefixes are matched
doi_regex = re.compile(r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+|\bdoi:\S+\b', re.IGNORECASE)
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
grobid_regex = re.compile(r'GROBID - A machine learning software for extracting information from scholarly documents')
uuid_regex = re.compile(r'\b[0-9a-fA-F]{32}\b')
aglc_regex = re.compile(r'\b[A-Z]{1,3}\s[1-9]\d{0,2}(?:\([1-9]\d{0,2}\))?')

# NOTE(review): a "space before 2 or 3 digit numbers" substitution is deliberately not
# applied to the text. It only ever ran on a leftover variable from the previous cell and
# never reached the saved output; applying it as written would also split dates such as
# 12/03/2020 before date_regex can remove them.

# Define a regular expression pattern to match the end of a sentence
sentence_end_regex = re.compile(r'([.!?])\s+')

# Read in the file and remove the matched text using the regular expressions
with open(filename, 'r') as f:
    text = f.read()

# Remove Matched Text
text = re.sub(square_brackets_pattern, '', text)
text = re.sub(parentheses_pattern, '', text)
text = aglc_regex.sub('', text)
text = url_regex.sub('', text)
text = date_regex.sub('', text)
text = doi_regex.sub('', text)
text = email_regex.sub('', text)
text = grobid_regex.sub('', text)
text = uuid_regex.sub('', text)

#Use a regular expression pattern to add a line break after each sentence
text = sentence_end_regex.sub(r'\1\n', text)

# Ask the user to choose a file name and location to save the processed text
root = tk.Tk()
root.withdraw()
new_file_path = filedialog.asksaveasfilename(defaultextension='.txt')
# Tear down the hidden root window once the dialog has closed so it does not linger
root.destroy()

# The dialog returns an empty value when it is cancelled; stop before touching any files
if not new_file_path:
    raise ValueError("No output file selected.")

# Create a new file with the cleaned text
with open(new_file_path, 'w') as f:
    f.write(text)

# Remove the original file, unless the user chose to save over it
# (deleting it then would delete the text that was just written)
if os.path.realpath(new_file_path) != os.path.realpath(filename):
    os.remove(filename)

filename = new_file_path
print(filename)

/home/sean/Downloads/Roberts v Queen.txt


In [34]:
#This cell chooses a language, model and vocoder from Espnet2

# Language label for the selected model
lang = "English"
# Tag of the pretrained text-to-speech model; the #@param annotation lists the available choices
tag = "kan-bayashi/ljspeech_fastspeech2" #@param ["kan-bayashi/ljspeech_tacotron2", "kan-bayashi/ljspeech_fastspeech", "kan-bayashi/ljspeech_fastspeech2", "kan-bayashi/ljspeech_conformer_fastspeech2", "kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_vits"] {type:"string"}
# Tag of the pretrained vocoder; the #@param annotation lists the available choices
vocoder_tag = 'parallel_wavegan/ljspeech_parallel_wavegan.v1' #@param ["none", "parallel_wavegan/ljspeech_parallel_wavegan.v1", "parallel_wavegan/ljspeech_full_band_melgan.v2", "parallel_wavegan/ljspeech_multi_band_melgan.v2", "parallel_wavegan/ljspeech_hifigan.v1", "parallel_wavegan/ljspeech_style_melgan.v1"] {type:"string"}

In [35]:
#This cell defines the parameters for Espnet2 text to speech

from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Use the GPU when one is available, otherwise fall back to the CPU so the
# model can still be loaded on machines without CUDA
tts_device = "cuda" if torch.cuda.is_available() else "cpu"

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device=tts_device,
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

In [36]:
#This cell converts the cleaned and processed text into a text to speech reading, breaking the text into chunks to preserve VRAM, and converting the output to an .mp3

def _split_into_chunks(text, max_chars):
    """Split text into chunks of at most max_chars characters, breaking only at whitespace.

    Words are packed greedily, so no word is cut in half at a chunk boundary.
    A single word longer than max_chars is kept whole as its own (over-long) chunk.
    Returns an empty list when text contains no words.
    """
    chunks = []
    current = ''
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if current and len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


torch.cuda.empty_cache()

# Maximum number of characters passed to the model in one synthesis call
chunk_size = 256

# Only the .txt file written by the cleaning cell can be synthesised
file_extension = os.path.splitext(filename)[1]
if file_extension != '.txt':
    raise ValueError("Invalid file type.")

with open(filename, 'r') as f:
    text = f.read()
chunks = _split_into_chunks(text, chunk_size)
if not chunks:
    raise ValueError("No text to synthesise in " + filename)
out_extension = '.wav'

# synthesis
chunk_filenames = []
try:
    combined_sounds = None
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}")
        chunk_filename = f"out{i+1}{out_extension}"
        with torch.no_grad():
            wav = text2speech(chunk)["wav"]
            # Record the name before writing so the file is cleaned up even if a later step fails
            chunk_filenames.append(chunk_filename)
            sf.write(chunk_filename, wav.view(-1).cpu().numpy(), text2speech.fs, "PCM_16")
        segment = AudioSegment.from_file(chunk_filename, out_extension.replace('.', ''))
        # Append each chunk's audio to the running result
        combined_sounds = segment if combined_sounds is None else combined_sounds + segment

    # Export the combined audio as an .mp3 next to the text file
    mp3_filename = os.path.splitext(filename)[0] + '.mp3'
    combined_sounds.export(mp3_filename, format='mp3')
finally:
    # Delete intermediate sound files, also when synthesis fails or is interrupted.
    # A separate loop variable is used so that 'filename' still points at the text
    # file and this cell can be re-run.
    for chunk_filename in chunk_filenames:
        if os.path.exists(chunk_filename):
            os.remove(chunk_filename)

print("Processing completed.")

Processing chunk 1/297
Processing chunk 2/297


KeyboardInterrupt: 