In [1]:
import pdfplumber
import re
import spacy
import random
import os
from tqdm import tqdm 

In [2]:
def extract_text_pdf(pdf_path):
    """Extract the text of a PDF and the alphabetic tokens it contains.

    Each page is read with ``page.extract_words`` and its words are joined
    with single spaces; every page contributes one line (terminated by a
    newline) to the returned text.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A ``(tokens, text)`` tuple where ``text`` is the concatenated page
        text and ``tokens`` is the list of maximal runs of ASCII letters
        found in ``text``.
    """
    page_lines = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words(x_tolerance=1, keep_blank_chars=False)
            page_lines.append(" ".join(word["text"] for word in words) + "\n")
    text = "".join(page_lines)
    # Use [A-Za-z], not [aA-zZ]: the range A-z also covers the ASCII
    # characters between 'Z' and 'a' ([, \, ], ^, _ and the backtick).
    return re.findall(r'[A-Za-z]+', text), text

In [3]:
def extract_text(text_path):
    """Return the alphabetic tokens found in a UTF-8 text file.

    Args:
        text_path: Path of the text file to read.

    Returns:
        A list of the maximal runs of ASCII letters in the file, in order
        of appearance. An empty file yields an empty list.
    """
    with open(text_path, encoding='utf-8') as intxt:
        data = intxt.read()
    # Use [A-Za-z], not [aA-zZ]: the range A-z also covers the ASCII
    # characters between 'Z' and 'a' ([, \, ], ^, _ and the backtick).
    return re.findall(r'[A-Za-z]+', data)
    

In [4]:
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# shuffle_sentences() calls it and reads `doc.sents` to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def shuffle_sentences(text, rng=None):
    """Return ``text`` with its sentences rearranged in random order.

    The text is segmented with the module-level ``nlp`` pipeline, the
    sentence strings are shuffled, and the result is joined with single
    spaces.

    Args:
        text: The text to scramble.
        rng: Optional object with a ``shuffle(list)`` method, e.g. a seeded
            ``random.Random`` instance, used to make the ordering
            reproducible. When omitted, the global ``random`` module state
            is used.

    Returns:
        A single string containing the shuffled sentences.
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    shuffler = random if rng is None else rng
    shuffler.shuffle(sentences)
    return " ".join(sentences)

In [5]:
def replace_keywords(text):
    """Swap selected machine-learning terms in ``text`` for unrelated ones.

    Each phrase in the table is matched case-insensitively and only at word
    boundaries, then replaced by its fixed substitute. The substitutions
    are applied one after another in table order.

    Args:
        text: The text to rewrite.

    Returns:
        The text with every matching phrase replaced.
    """
    swaps = (
        ("reinforcement learning", "supervised learning"),
        ("convolutional neural network", "random forest"),
        ("ImageNet", "MNIST"),
        ("natural language processing", "computer vision"),
        ("accuracy", "F1-score"),
        ("mean squared error", "precision"),
        ("hyperparameter tuning", "data augmentation"),
        ("backpropagation", "meta-optimization synthesis"),
    )
    result = text
    for phrase, substitute in swaps:
        matcher = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
        result = matcher.sub(substitute, result)
    return result

In [6]:
# Build the "incoherent" dataset: for every PDF in input_dir, extract its
# text, shuffle its sentences, swap the domain keywords, and write the result
# to output_dir as incoherent_<name>.txt.
input_dir = "incoherent_dataset_unprocessed_sample"
output_dir = "research-paper-analyser/incoherent dataset"

# Create the destination up front so open(..., "w") cannot fail on a missing folder.
os.makedirs(output_dir, exist_ok=True)

for filename in tqdm(sorted(os.listdir(input_dir))):
    # Accept ".pdf" in any letter case; skip everything else.
    if not filename.lower().endswith(".pdf"):
        continue
    stem = os.path.splitext(filename)[0]
    try:
        # Only the raw text is needed here; the token list is not used.
        _tokens, text = extract_text_pdf(os.path.join(input_dir, filename))
        # Feed the shuffled text into the keyword replacement. Passing the
        # original `text` here would silently discard the shuffle.
        incoherent_text = replace_keywords(shuffle_sentences(text))
        with open(os.path.join(output_dir, f"incoherent_{stem}.txt"), "w", encoding="utf-8") as f:
            f.write(incoherent_text)
    except UnicodeError:
        print(f"Encoding error in file: {filename}")

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [01:13<00:00, 24.40s/it]
