In [None]:
!pip install PyMuPDF transformers==4.50.3 torch==2.6.0 pytesseract pdf2image opencv-python

# Final V3

In [1]:
import pandas as pd
import fitz  # PyMuPDF
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
import os
import re
from vws import RDRSegmenter, Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
from pdf2image import convert_from_path
import cv2
import numpy as np
from collections import defaultdict

class ESGAnalyzer:
    def __init__(self, similarity_threshold=0.65):
        """Set up the matcher: threshold, stopword set, segmenter and PhoBERT.

        Args:
            similarity_threshold: Minimum cosine similarity at which a
                sentence counts as a match for a criterion; stored as
                ``self.similarity_threshold``.
        """
        self.similarity_threshold = similarity_threshold

        # Vietnamese stopwords (expanded for ESG context). Entries containing
        # "_" are written in the underscore-joined multi-syllable form.
        self.stopwords = {
            "bị", "bởi", "cả", "các", "có", "có_thể", "có_lẽ", "của",
            "cùng", "cùng_với", "cũng", "đã", "đang", "đây", "để", "đều",
            "do", "đó", "khi", "là", "lại", "mà", "nên", "nếu", "những",
            "phải", "rất", "rồi", "sau", "sẽ", "thì", "từ", "và",
        }

        # Vietnamese word segmenter and its companion tokenizer (vws package).
        self.rdrsegment = RDRSegmenter.RDRSegmenter()
        self.token = Tokenizer.Tokenizer()

        # PhoBERT tokenizer and encoder, both loaded from the same checkpoint.
        checkpoint = "vinai/phobert-base-v2"
        self.pho_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModel.from_pretrained(checkpoint)


    
    def segment_words(self, text):
        """Segment Vietnamese text using vws"""
        # Use segmentRawSentences to process the text
        segmented_text = self.rdrsegment.segmentRawSentences(self.token, text)
        return segmented_text
    
    def clean_text(self, text):
        """Clean and normalize Vietnamese text for ESG content"""
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters but keep Vietnamese characters and numbers
        text = re.sub(r'[^\w\s\dàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        
        return text
    
    def remove_stopwords(self, text):
        """Remove Vietnamese stopwords"""
        words = text.split()
        filtered_words = [word for word in words if word not in self.stopwords]
        return ' '.join(filtered_words)
    
    def preprocess_text(self, text):
        """Complete text preprocessing pipeline"""
        # Clean text (unchanged)
        text = self.clean_text(text)
        
        # Remove stopwords (unchanged)
        text = self.remove_stopwords(text)
        
        # Use vws for word segmentation instead of simple split
        segmented_text = self.segment_words(text)
        
        # Return segment words
        return segmented_text

    def extract_text_from_pdf(self, pdf_path):
        """Extract and preprocess text from PDF, handling both vector and image-only pages"""

        full_text = []
        try:
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    # === 1. Extract Vector Text ===
                    text = ""
                    blocks = page.get_text("dict")["blocks"]
                    for block in blocks:
                        if "lines" in block:
                            for line in block["lines"]:
                                for span in line["spans"]:
                                    text += span["text"] + " "
                    full_text.append(text.strip())
        
                    # === 2. OCR Page as Image (Visual Handling) ===
                    try:
                        zoom = 2.0  # Higher DPI (144 DPI)
                        mat = fitz.Matrix(zoom, zoom)
                        pix = page.get_pixmap(matrix=mat)
        
                        img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, pix.n))
                        if pix.n == 4:
                            img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
                        elif pix.n == 1:
                            img_np = cv2.cvtColor(img_np, cv2.COLOR_GRAY2BGR)
        
                        # Quick precheck (low-cost confidence screening)
                        quick_ocr = pytesseract.image_to_data(
                            img_np, lang='vie', config='--oem 3 --psm 6', output_type=pytesseract.Output.DICT
                        )
                        quick_valid_chars = sum(len(word.strip()) for word, conf in zip(quick_ocr['text'], quick_ocr['conf']) if word.strip() and int(conf) > 60)
        
                        if quick_valid_chars < 6:
                            continue
        
                        # === Preprocessing for final OCR ===
                        gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
                        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                        gray = clahe.apply(gray)
        
                        # Upscale if too small
                        if gray.shape[0] < 1000:
                            scale = 300 / 72
                            gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        
                        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        
                        coords = np.column_stack(np.where(thresh > 0))
                        angle = cv2.minAreaRect(coords)[-1]
                        if angle < -45:
                            angle += 90
                        elif angle > 45:
                            angle -= 90
                        (h, w) = thresh.shape
                        M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
                        deskewed = cv2.warpAffine(thresh, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        
                        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
                        processed = cv2.morphologyEx(deskewed, cv2.MORPH_CLOSE, kernel)
        
                        d = pytesseract.image_to_data(
                            processed, lang='vie', config='--oem 3 --psm 6', output_type=pytesseract.Output.DICT
                        )
        
                        # === Filter out meaningless pages ===
                        valid_word_count = 0
                        valid_char_count = 0
                        for i in range(len(d['text'])):
                            word = d['text'][i].strip()
                            conf = int(d['conf'][i])
                            if conf >= 70 and len(word) >= 3:
                                valid_word_count += 1
                                valid_char_count += len(word)
        
                        if valid_word_count < 3 or valid_char_count < 17:
                            continue
        
                        # === Structure OCR Text ===
                        blocks = defaultdict(lambda: defaultdict(list))
                        for i in range(len(d['text'])):
                            if int(d['conf'][i]) > 0 and d['text'][i].strip():
                                block_id = d['block_num'][i]
                                line_id = d['line_num'][i]
                                word_info = {
                                    'text': d['text'][i],
                                    'left': d['left'][i],
                                    'top': d['top'][i]
                                }
                                blocks[block_id][line_id].append(word_info)
        
                        sorted_blocks = sorted(blocks.items(), key=lambda b: min([w['top'] for line in b[1].values() for w in line]))
                        structured_text = ""
                        for block_id, lines in sorted_blocks:
                            sorted_lines = sorted(lines.items(), key=lambda l: min(w['top'] for w in l[1]))
                            for line_id, words in sorted_lines:
                                sorted_words = sorted(words, key=lambda w: w['left'])
                                line_text = " ".join([w['text'] for w in sorted_words])
                                structured_text += line_text.strip() + "\n"
        
                        full_text.append(structured_text)
        
                    except Exception as e:
                        print(f"❌ Error rendering or OCRing page {page.number}: {e}")
                        continue
        
            return "\n".join(full_text)
        except Exception as e:
            print(f"Skipped empty file: {pdf_path}")
            
    
    def get_embeddings(self, texts, batch_size=1):
        """Get embeddings in batches"""
        if isinstance(texts, str):
            texts = [texts]
        
        all_embeddings = []
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            
            
            encoded = self.pho_tokenizer(batch, 
                                   padding=True, 
                                   truncation=True,
                                   return_tensors="pt",
                                   max_length=256)
            
            with torch.no_grad():
                outputs = self.model(**encoded)
            
                # Use mean pooling instead of just [CLS] token
            attention_mask = encoded['attention_mask']
            token_embeddings = outputs.last_hidden_state
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            batch_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            

            # L2 normalization for better cosine similarity
            batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
            batch_embeddings = batch_embeddings.numpy()
            
            all_embeddings.append(batch_embeddings)
        
        return np.vstack(all_embeddings)
    
    def read_criteria(self, excel_path):
        """Read and preprocess ESG criteria from Excel file"""
        try:
            df = pd.read_excel(excel_path)
            # Ensure we're reading just one column
            if 'criteria' in df.columns:
                criteria = df['criteria'].tolist()           # A list of criteria is collected
            else:
                # If 'criteria' column doesn't exist, take the first column
                criteria = df.iloc[:, 0].tolist()
            
            processed_criteria = [self.preprocess_text(str(c)) for c in criteria]      # Process 1 item per time in a list of criteria
            return processed_criteria
        except Exception as e:
            print(f"Error reading criteria file: {e}")
            return []

    def chunk_into_sentences(self, text):
        """Chunk the text into sentences based on delimiters (. ? ! ; ...)."""
        sentence_endings = re.compile(r'([.!?;]{1,3})')  # Regex to match sentence-ending delimiters
        sentences = re.split(sentence_endings, text)
        sentences = [sentences[i].strip() + (sentences[i + 1] if i + 1 < len(sentences) else '')
                     for i in range(0, len(sentences), 2)]  # Only take sentences (even index)
        return [s.strip() for s in sentences if s.strip()]
        
    def analyze_single_pdf(self, pdf_path, criteria_phrases):
        """Analyze a single PDF file against criteria phrases"""
        # Extract raw pdf text
        pdf_text = self.extract_text_from_pdf(pdf_path)      # full pdf file, raw and original
        if not pdf_text:
            return {phrase: "x" for phrase in criteria_phrases}

                 
        # Initialize results dictionary
        results = {phrase: 0 for phrase in criteria_phrases}
        
        # First, perform exact matching
        pdf_text_lower = pdf_text.lower()
        for phrase in criteria_phrases:
            clean_phrase = self.clean_text(phrase).lower()
            if clean_phrase in pdf_text_lower:
                results[phrase] = 1
        
        # Get list of unmatched phrases for semantic analysis
        unmatched_phrases = [phrase for phrase in criteria_phrases if results[phrase] == 0]
        
        # If all phrases were matched exactly, return results
        if not unmatched_phrases:
            return results
            
        # Otherwise, proceed with semantic similarity analysis for unmatched phrases
        sentences = self.chunk_into_sentences(pdf_text)
        if not sentences:
            return results
    
        # Preprocess_text so it is cleaned
        processed_text = [self.preprocess_text(senten) for senten in sentences]
        
        # Get embeddings only for unmatched phrases
        try:
            unmatched_embeddings = [self.get_embeddings(phrase) for phrase in unmatched_phrases]
            
            # Check semantic similarity only for unmatched phrases
            for sentence in processed_text:
                sentence_embedding = self.get_embeddings(sentence)
                if len(sentence_embedding.shape) == 1:
                    sentence_embedding = sentence_embedding.reshape(1, -1)
                for i, criterion_embedding in enumerate(unmatched_embeddings):
                    similarity = cosine_similarity(sentence_embedding, criterion_embedding)[0][0]
                    if similarity >= self.similarity_threshold:
                        results[unmatched_phrases[i]] = 1
                
            return results
        
        except Exception as e:
            print(f"Error in semantic analysis: {e}")
            return results

    
    def analyze_multiple_pdfs(self, pdf_folder, excel_path, output_path):
        """Analyze multiple PDF files and save results"""
        # Read criteria
        criteria_phrases = self.read_criteria(excel_path)
        if not criteria_phrases:
            print("No criteria found. Exiting...")
            return

        # Get list of PDF files
        pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
        if not pdf_files:
            print("No PDF files found in the specified folder.")
            return
            
        all_results = []   
        for idx, pdf_file in enumerate(pdf_files, 1):  # Start index at 1
            print(f"Processing {pdf_file}...")
            pdf_path = os.path.join(pdf_folder, pdf_file)
        
            # Analyze PDF
            results = self.analyze_single_pdf(pdf_path, criteria_phrases)
            results['PDF File'] = pdf_file
            all_results.append(results)
        
            # Save after every 2 files OR at the end of the list
            if idx % 2 == 0 or idx == len(pdf_files): 
                try:
                    results_df = pd.DataFrame(all_results)
                    results_df.to_excel(output_path, index=False)
                    print(f"Intermediate results saved to {output_path} after processing {idx} PDFs.")
                except Exception as e:
                    print(f"Error saving results after {idx} PDFs: {e}")
            
# Example usage: runs the full analysis only when this file is executed as a
# script. Every statement, including the final call, sits inside the guard so
# that importing the module has no side effects.
if __name__ == "__main__":
    analyzer = ESGAnalyzer(similarity_threshold=0.65)

    # Set paths
    pdf_folder = "Desktop/ESG pdf"  # Replace with your PDF folder path
    excel_path = "Desktop/esg_words.xlsx"  # Replace with your Excel file path
    output_path = "esg_pdf_reading.xlsx"

    # Run analysis
    analyzer.analyze_multiple_pdfs(pdf_folder, excel_path, output_path)



KeyboardInterrupt

