In [4]:
import os
import re
import fitz  # PyMuPDF for PDFs
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    print(f"Text extracted from {pdf_path}")
    return text

def clean_text(text):
    # Remove References section (if applicable)
    text = re.sub(r'References.*', '', text, flags=re.DOTALL)

    # Remove headers/footers (example pattern, adjust as needed)
    text = re.sub(r'Header text pattern.*', '', text, flags=re.MULTILINE)
    text = re.sub(r'Footer text pattern.*', '', text, flags=re.MULTILINE)

    # Remove non-alphanumeric characters (if necessary) and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s.,?!:;\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    text = text.strip()
    
    # Optional: Convert to lowercase to standardize
    text = text.lower()
    
    return text

def chunk_text(text, chunk_size=500):
    # Process the text with SpaCy
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    chunks = []
    chunk = ""

    for sentence in sentences:
        if len(chunk) + len(sentence) > chunk_size:
            chunks.append(chunk)
            chunk = sentence
        else:
            chunk += " " + sentence

    if chunk:
        chunks.append(chunk)
    print(f"Chunks extracted")
    return chunks

def process_files_in_folder(folder_path):
    combined_text = ""
    
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        print(f"{file_name} is under processing...")
        if file_name.endswith(".pdf"):
            raw_text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip non-supported file types
        
        cleaned_text = clean_text(raw_text)
        chunks = chunk_text(cleaned_text)
        
        # Combine all chunks into one text (can save to individual files or a combined file)
        combined_text += "\n".join(chunks) + "\n"
    
    return combined_text

# Specify the folder containing your research files
folder_path = "resistant_research_papers"

# Process the files in the folder and get the combined cleaned text
combined_cleaned_text = process_files_in_folder(folder_path)

# Save the combined cleaned and chunked text to a file
output_file = "resistant.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(combined_cleaned_text)

print(f"Combined text has been saved to {output_file}")

ijerph-16-04897.pdf is under processing...
Text extracted from resistant_research_papers/ijerph-16-04897.pdf
Chunks extracted
msse-53-1206.pdf is under processing...
Text extracted from resistant_research_papers/msse-53-1206.pdf
Chunks extracted
2102.00836v2.pdf is under processing...
Text extracted from resistant_research_papers/2102.00836v2.pdf
Chunks extracted
fphys-12-791999.pdf is under processing...
Text extracted from resistant_research_papers/fphys-12-791999.pdf
Chunks extracted
fspor-04-949021.pdf is under processing...
Text extracted from resistant_research_papers/fspor-04-949021.pdf
Chunks extracted
jfmk-09-00009.pdf is under processing...
Text extracted from resistant_research_papers/jfmk-09-00009.pdf
Chunks extracted
Combined text has been saved to resistant.txt
