<a href="https://colab.research.google.com/github/VOX304/SchoolChatbot/blob/main/Preprocessing/Colab_preprocessing/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-docx pdfplumber chardet


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m560.4 kB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m981.7 kB/s[0m eta [36m0:00:00[0m
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/

In [None]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import os
import docx
import pdfplumber
import chardet
import pytesseract
import re
from pathlib import Path
from PIL import Image

def detect_encoding(file_path):
    """Guess the character encoding of a file from its raw bytes.

    The entire file is read in binary mode and passed to ``chardet.detect``;
    the ``'encoding'`` entry of the detection result is returned unchanged.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Whatever chardet reports under the ``'encoding'`` key.
    """
    with open(file_path, 'rb') as handle:
        guess = chardet.detect(handle.read())
    return guess['encoding']

def extract_text_from_docx(doc_path):
    """Return the paragraph text of a DOCX file as one newline-joined string.

    Only ``Document.paragraphs`` is read. Any exception raised while opening
    or reading the document is reported on stdout and turned into ``None``.

    Args:
        doc_path: Path of the document to open with ``docx.Document``.

    Returns:
        The text of every paragraph joined with ``'\\n'``, or ``None`` if
        extraction raised.
    """
    try:
        paragraphs = docx.Document(doc_path).paragraphs
        lines = []
        for paragraph in paragraphs:
            lines.append(paragraph.text)
        return '\n'.join(lines)
    except Exception as err:
        print(f"Error extracting DOCX text: {err}")
        return None

def extract_text_from_pdf(pdf_path):
    """Extract the text layer of every page of a PDF using pdfplumber.

    Each page contributes its extracted text followed by a newline. A page
    that yields no text (``None`` or an empty string) contributes just the
    newline instead of aborting extraction for the whole document.

    Args:
        pdf_path: Path of the PDF to open with ``pdfplumber.open``.

    Returns:
        The concatenated page text, or ``None`` when the result is blank or
        when opening/reading the PDF raised (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Guard against a page yielding None instead of a string:
            # `None + '\n'` would raise TypeError and, via the except below,
            # throw away the text of every other page as well.
            page_texts = [(page.extract_text() or "") for page in pdf.pages]
        text = "".join(page_text + '\n' for page_text in page_texts)
        return text if text.strip() else None
    except Exception as e:
        print(f"Error extracting PDF text: {e}")
        return None

def detect_corrupted_text(text):
    """Heuristically decide whether extracted text looks corrupted.

    The text is treated as corrupted when any of these holds:

    * it is ``None`` or contains only whitespace;
    * fewer than 10% of its characters are non-ASCII;
    * fewer than 5% of its characters are Vietnamese letters carrying
      diacritics (including ``đ``/``Đ``).

    The text is normalised to NFC first so that letters delivered in
    decomposed form (base letter + combining marks) are recognised, and both
    lower- and upper-case letters are counted so that all-caps headings are
    not misreported.

    Args:
        text: Extracted text, or ``None``.

    Returns:
        ``True`` if the text looks corrupted, ``False`` otherwise.
    """
    import unicodedata  # local import keeps this block self-contained

    if text is None or len(text.strip()) == 0:
        return True  # Empty or None text is considered corrupted

    # Compose combining sequences so each accented letter is one code point.
    normalized = unicodedata.normalize("NFC", text)
    total = len(normalized)

    # Too few non-ASCII characters: likely missing Vietnamese diacritics.
    non_ascii_count = sum(1 for char in normalized if ord(char) > 127)
    if non_ascii_count / total < 0.1:
        return True

    # Accented Vietnamese letters, in both cases, as a set for O(1) lookups.
    lowercase_letters = unicodedata.normalize(
        "NFC",
        "áàảãạâấầẩẫậăắằẳẵặéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđ",
    )
    vietnamese_chars = frozenset(lowercase_letters + lowercase_letters.upper())
    vietnamese_count = sum(1 for char in normalized if char in vietnamese_chars)

    # Less than 5% accented Vietnamese letters counts as corrupted.
    return vietnamese_count / total < 0.05

def apply_ocr(image_path, lang=None):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path (or file object) of an image that PIL can open.
        lang: Optional Tesseract language code forwarded to
            ``pytesseract.image_to_string`` (for example ``"vie"``, which
            requires that language's data to be installed). ``None`` keeps
            pytesseract's default.

    Returns:
        The OCR text, or ``None`` if opening the image or running OCR raised
        (the error is printed).
    """
    try:
        # The context manager closes the underlying file handle once OCR is
        # done instead of leaving it open until garbage collection.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image, lang=lang)
    except Exception as e:
        print(f"Error during OCR: {e}")
        return None

def _ocr_pdf_pages(pdf_path, resolution=300):
    """Rasterise each page of a PDF with pdfplumber and OCR it with Tesseract.

    Returns the page texts joined with newlines, or ``None`` when nothing was
    recognised or when rendering/OCR raised (the error is printed).
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # PageImage.original is the rendered PIL image of the page.
                page_image = page.to_image(resolution=resolution).original
                page_texts.append(pytesseract.image_to_string(page_image) or "")
        text = '\n'.join(page_texts)
        return text if text.strip() else None
    except Exception as e:
        print(f"Error during OCR: {e}")
        return None


def process_document(file_path):
    """Extract text from a document and print a preview of the result.

    ``.doc``/``.docx`` files go through ``extract_text_from_docx`` and
    ``.pdf`` files through ``extract_text_from_pdf``; any other extension is
    reported as unsupported. If ``detect_corrupted_text`` flags the result,
    PDFs fall back to OCR of their rendered pages. Word files have no OCR
    fallback, so their flagged text is discarded.

    The raw file is deliberately never re-read as text: PDF and DOCX are
    binary containers, so decoding their bytes with a guessed encoding only
    produces noise, and repeating the same read cannot change the outcome.

    Args:
        file_path: Path of the document to process.

    Returns:
        None. The first 500 characters of the extracted text, or a failure
        alert, are printed.
    """
    ext = Path(file_path).suffix.lower()
    extracted_text = None

    if ext in ['.doc', '.docx']:
        # NOTE(review): python-docx targets .docx; legacy .doc files are
        # expected to fail extraction and end in the failure alert below.
        extracted_text = extract_text_from_docx(file_path)
    elif ext == '.pdf':
        extracted_text = extract_text_from_pdf(file_path)
    else:
        print("Unsupported file type!")
        return

    if detect_corrupted_text(extracted_text):
        if ext == '.pdf':
            print("Alert: Extracted text still appears corrupted. Applying OCR as fallback.")
            # PIL cannot open a PDF directly, so render the pages first.
            extracted_text = _ocr_pdf_pages(file_path)
        else:
            print("Alert: Extracted text appears corrupted and no OCR fallback is available for this file type.")
            extracted_text = None

    if extracted_text:
        print("Extracted Text:")
        print(extracted_text[:500])  # Print first 500 characters for preview
    else:
        print("Final alert: Text extraction failed.")

# Example usage: run the extraction pipeline on a single sample PDF.
# process_document() returns nothing; it prints either a preview of the
# extracted text or a failure alert.
# NOTE(review): the path looks like a Colab runtime location (/content/...);
# it must exist in the current session for extraction to succeed.
file_path = "/content/sample_data/tcu_doc/HD1860 công tác xét tuyển đào tạo ĐH, CĐ năm 2024_20252211126.pdf"  # Change to your file path
process_document(file_path)


Detected corrupted text. Attempting re-encoding...
Detected corrupted text. Attempting re-encoding...
Detected corrupted text. Attempting re-encoding...
Alert: Extracted text still appears corrupted. Applying OCR as fallback.
Error during OCR: cannot identify image file '/content/sample_data/tcu_doc/HD1860 công tác xét tuyển đào tạo ĐH, CĐ năm 2024_20252211126.pdf'
Final alert: Text extraction failed.
