In [None]:
pip install PyPDF2 nltk spellchecker


In [None]:
import re
from PyPDF2 import PdfReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker

# Fetch the NLTK resources used below: the Punkt tokenizer data for
# word_tokenize and the stopword lists for remove_stopwords.
import nltk
nltk.download('punkt')
# Newer NLTK releases load Punkt from the 'punkt_tab' package rather than
# 'punkt'; download both so word_tokenize works on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Step 1: Read PDF content
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Page texts are joined with a newline so the last word of one page and
    the first word of the next do not fuse into a single token. A page
    whose extraction yields a falsy result (``None`` or ``""``) contributes
    an empty string instead of raising.

    Args:
        pdf_path: Path (or file-like object) handed to ``PdfReader``.

    Returns:
        The concatenated page texts; an empty string for a PDF with no pages.
    """
    reader = PdfReader(pdf_path)
    # str.join avoids the quadratic cost of repeated ``+=`` on long documents.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Step 2: Clean text
def clean_text(text):
    """Normalize raw text for tokenization.

    Drops every character that is not an ASCII letter, a digit, or
    whitespace, collapses each whitespace run to a single space, trims the
    ends, and lower-cases the result.
    """
    # Characters are deleted outright, not replaced by a space.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# Step 3: Tokenize text
def tokenize_text(text):
    """Split *text* into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Step 4: Remove stopwords
def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Matching is exact (case-sensitive), and the order of the surviving
    tokens is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for word in tokens:
        if word in english_stopwords:
            continue
        kept.append(word)
    return kept

# Step 5: Correct spelling
def correct_spelling(tokens):
    """Return *tokens* with each entry replaced by its spelling correction.

    When the spell checker has no suggestion for a token (``correction``
    returns ``None``, or any other falsy value), the original token is kept
    so later stages always receive strings.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the same length and order as *tokens*.
    """
    spell = SpellChecker()
    tokens = list(tokens)  # allow one-shot iterables; we look tokens up twice
    # Correct each distinct token once; documents repeat words heavily and
    # every correction call is expensive.
    corrected = {}
    for token in tokens:
        if token not in corrected:
            corrected[token] = spell.correction(token) or token
    return [corrected[token] for token in tokens]

# Step 6: Perform stemming
def stem_tokens(tokens):
    """Return a list with the Porter stem of every token, in order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

# Combine all steps to clean the PDF
def clean_pdf(pdf_path):
    """Run the full cleaning pipeline on the PDF at *pdf_path*.

    The extracted text is normalized and tokenized, then the tokens pass
    through stopword removal, spelling correction and stemming, in that
    order. Returns the final list of tokens.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    tokens = tokenize_text(clean_text(raw_text))

    # Token-level stages, applied in order.
    for stage in (remove_stopwords, correct_spelling, stem_tokens):
        tokens = stage(tokens)

    return tokens

# Example usage: run the whole pipeline on one PDF and print the resulting tokens.
# NOTE: 'your_pdf_file.pdf' is a placeholder; set pdf_path to a real file before running.
pdf_path = 'your_pdf_file.pdf'  # Replace with your PDF file path
cleaned_tokens = clean_pdf(pdf_path)
print(cleaned_tokens)
