In [2]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from pdf2image import convert_from_path
import pytesseract

# Fetch the NLTK resources this script asks for: the 'punkt' tokenizer data,
# the stopword lists, and the WordNet corpus. The output captured from an
# earlier run shows nltk.download reporting already-present packages as
# "already up-to-date".
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the input, delete every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, drop English stopwords, and lemmatize what remains
    with ``WordNetLemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Punctuation is stripped before tokenizing, so the tokenizer only ever
    # sees letters, digits, underscores and whitespace.
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(normalized)

    # A set gives constant-time membership tests in the filter below.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter and lemmatize in a single pass over the tokens.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    return ' '.join(lemmas)


def process_pdf(pdf_path):
    """Extract text from a PDF by rendering it to images and running OCR.

    Args:
        pdf_path: Path of the PDF, passed straight to ``convert_from_path``.

    Returns:
        The OCR output of every rendered image, joined by single spaces.
    """
    # convert_from_path yields the rendered images (presumably one per page;
    # confirm against the pdf2image documentation).
    page_images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(page) for page in page_images]
    return ' '.join(page_texts)

def save_text(file_path, text):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file.

    Args:
        file_path: Destination path; opened in text write mode.
        text: The string to store.
    """
    with open(file_path, 'w', encoding='utf-8') as out_handle:
        out_handle.write(text)

def main(folder_path):
    """Run OCR over every PDF file in a folder and save the text beside it.

    For each regular file in ``folder_path`` whose name ends in ``.pdf``
    (compared case-insensitively), the text returned by ``process_pdf`` is
    written to a ``.txt`` file with the same base name in the same folder,
    and a progress line is printed.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; subdirectories are not searched.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order and the printed log reproducible.
    for filename in sorted(os.listdir(folder_path)):
        pdf_path = os.path.join(folder_path, filename)

        # Match the extension case-insensitively so files such as 'SCAN.PDF'
        # are not silently skipped, and ignore directories that merely have
        # a '.pdf'-like name (they cannot be rendered and would abort the run).
        if not filename.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue

        # NOTE(review): the raw OCR text is saved as-is; preprocess_text is
        # not applied here. Confirm whether cleaning was intended.
        ocr_text = process_pdf(pdf_path)

        txt_filename = f"{os.path.splitext(filename)[0]}.txt"
        txt_path = os.path.join(folder_path, txt_filename)
        save_text(txt_path, ocr_text)
        print(f"Processed and saved: {txt_filename}")

# Set the folder path containing PDFs
folder_path = 'attempt1'

# Run the batch only when this file is executed directly (as a script or a
# notebook cell), so that importing it does not start an OCR run as a side
# effect.
if __name__ == "__main__":
    main(folder_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed and saved: 2024-05-25_1710233226315.txt
Processed and saved: 2024-05-25_1710233256921.txt
Processed and saved: 2024-05-25_1710844257374.txt
Processed and saved: 2024-05-25_1710844645454.txt
Processed and saved: 2024-05-25_1710844724589.txt
Processed and saved: 2024-05-25_1711626662149.txt
Processed and saved: 2024-05-25_1711629492582.txt
Processed and saved: 2024-05-25_1712213372218.txt
Processed and saved: 2024-05-25_1712577671025.txt
Processed and saved: 2024-05-25_1713162625498.txt
Processed and saved: 2024-05-25_1713162688513.txt
Processed and saved: 2024-05-25_1714021680590.txt
Processed and saved: 2024-05-25_1714643146091.txt
Processed and saved: 2024-05-25_1714719134107.txt
Processed and saved: 2024-05-25_1714729983428.txt
Processed and saved: 2024-05-25_1714730487754.txt
Processed and saved: 2024-05-25_1714995804765.txt
Processed and saved: 2024-05-25_1714996095310.txt
Processed and saved: 2024-05-25_1715073171256.txt
Processed and saved: 2024-05-25_1715160491738.txt
