In [1]:
import pdfplumber
import re
import spacy
import random
import os
from tqdm import tqdm 

In [None]:
def extract_text_pdf(pdf_path):
    """Extract the text of a PDF and its alphabetic tokens.

    Every page is read with pdfplumber's ``extract_words`` (called with
    ``x_tolerance=1, keep_blank_chars=False``); the words of a page are joined
    with single spaces and each page is terminated by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(tokens, text)`` tuple where ``text`` is the extracted text and
        ``tokens`` is the list of runs of ASCII letters found in it, or
        ``None`` if the file could not be opened or parsed.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = []
            for page in pdf.pages:
                # extract words
                words = page.extract_words(x_tolerance=1, keep_blank_chars=False)
                page_texts.append(" ".join(word["text"] for word in words))
    except Exception as exc:
        # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit can
        # still stop a long batch run; the cause is reported for diagnosis.
        print(f"File {pdf_path} was corrupted. ({exc!r})")
        return None

    text = "".join(page_text + "\n" for page_text in page_texts)
    # The previous class `[aA-zZ]` covered the whole ASCII span 'A'..'z', which
    # also matches [ \ ] ^ _ and `; `[A-Za-z]` matches letters only.
    return re.findall(r'[A-Za-z]+', text), text

In [3]:
def extract_text(text_path):
    """Read a UTF-8 text file and return its alphabetic tokens.

    Args:
        text_path: Path of the text file to read.

    Returns:
        A list of the runs of ASCII letters found in the file, in order of
        appearance.
    """
    with open(text_path, encoding='utf-8') as intxt:
        data = intxt.read()
    # The previous class `[aA-zZ]` covered the whole ASCII span 'A'..'z', which
    # also matches [ \ ] ^ _ and `; `[A-Za-z]` matches letters only.
    return re.findall(r'[A-Za-z]+', data)
    

In [4]:
# Load the spaCy pipeline once at module level; shuffle_sentences() calls this
# shared `nlp` object to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def shuffle_sentences(text, rng=None):
    """Return ``text`` with its sentences in random order.

    The text is segmented with the module-level ``nlp`` pipeline, the sentence
    strings are shuffled, and the result is joined with single spaces.

    Args:
        text: The text to scramble.
        rng: Optional ``random.Random`` instance used for the shuffle. Pass a
            seeded instance to make the output reproducible; when omitted the
            global ``random`` module state is used, as before.

    Returns:
        The shuffled sentences joined by single spaces.
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    shuffler = rng if rng is not None else random
    shuffler.shuffle(sentences)
    return " ".join(sentences)

In [5]:
def replace_keywords(text):
    """Swap selected machine-learning terms for unrelated ones.

    Each phrase is matched case-insensitively on word boundaries and replaced
    by its fixed counterpart. The substitutions run one after another in the
    order listed, each on the output of the previous one.
    """
    swaps = (
        ("reinforcement learning", "supervised learning"),
        ("convolutional neural network", "random forest"),
        ("ImageNet", "MNIST"),
        ("natural language processing", "computer vision"),
        ("accuracy", "F1-score"),
        ("mean squared error", "precision"),
        ("hyperparameter tuning", "data augmentation"),
        ("backpropagation", "meta-optimization synthesis"),
    )
    result = text
    for phrase, substitute in swaps:
        matcher = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
        result = matcher.sub(substitute, result)
    return result

In [None]:
# Build the "incoherent" dataset: for every PDF in `input_dir`, extract its
# text, shuffle its sentences, swap the keywords and write the result to
# `output_dir` as `incoherent_<name>.txt`.
input_dir = "datasets/incoherent_dataset_unprocessed"
# NOTE(review): this folder name contains a space while `input_dir` uses
# underscores -- confirm that is intentional.
output_dir = "datasets/incoherent dataset"

# Create the destination up front so a missing folder does not make every
# write fail.
os.makedirs(output_dir, exist_ok=True)

for filename in tqdm(os.listdir(input_dir)):
    if not filename.endswith(".pdf"):
        continue
    try:
        result = extract_text_pdf(os.path.join(input_dir, filename))
        # extract_text_pdf signals an unreadable file by returning None.
        if result is None:
            raise ValueError("extract_text_pdf returned None")
        # Only the raw text is needed here; the token list is unused. (It was
        # previously bound to `list`, shadowing the builtin for the rest of
        # the kernel session.)
        _tokens, text = result
        # Process the text: the keyword swap must run on the *shuffled* text,
        # otherwise the shuffle is thrown away.
        incoherent_text = shuffle_sentences(text)
        incoherent_text = replace_keywords(incoherent_text)
        # Keep `filename` intact so the error messages below report the real
        # file name.
        stem = os.path.splitext(filename)[0]  # remove .pdf from file name
        out_path = os.path.join(output_dir, f"incoherent_{stem}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(incoherent_text)
    except UnicodeError:
        # UnicodeError subclasses ValueError, so it must be handled first.
        print(f"Encoding error in file: {filename}.")
    except ValueError as e:
        print(f"Error: {e}. Skipping file {filename}.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {filename}: {e}")

  0%|          | 0/46 [00:00<?, ?it/s]

File incoherent_dataset_unprocessed\2107.05341.pdf was corrupted.
Error: extract_text_pdf returned None. Skipping file 2107.05341.pdf.


100%|██████████| 46/46 [19:10<00:00, 25.02s/it]
