In [6]:
# ==========================================
# BLOCK 2: ANALYSIS & MATCH SCORE EVALUATION (FIXED)
# ==========================================

import os
import json
import re
import numpy as np
import PyPDF2
import docx
from pdf2image import convert_from_path
import pytesseract
from collections import defaultdict

# --- HELPER FUNCTIONS ---
# Shared lead-in for citations written as "Section <number> [of] [the] <Act name>".
# Capture group 1 is always the section number (digits plus optional letters).
_SECTION_LEAD = r'Section\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?'

# Maps an act identifier to the regular expressions that recognise a citation
# of one of its sections. Every pattern exposes the section number as group 1.
ACT_PATTERNS = {
    'IPC': [
        _SECTION_LEAD + r'(?:Indian\s+Penal\s+Code|IPC)',
        # Short form with the number first, e.g. "302 IPC".
        r'(\d+[A-Z]*)\s+(?:IPC|Indian\s+Penal\s+Code)',
    ],
    'CrPC': [
        _SECTION_LEAD + r'(?:Cr\.?P\.?C\.?|CrPC)',
        # Short form with the act first, e.g. "Cr.P.C. Section 313".
        r'(?:Cr\.?P\.?C\.?|CrPC)\s+(?:Section\s+)?(\d+[A-Z]*)',
    ],
    'Evidence_Act': [
        _SECTION_LEAD + r'Evidence\s+Act',
    ],
    'Arbitration_Act': [
        _SECTION_LEAD + r'Arbitration\s+Act',
    ],
    'Contract_Act': [
        _SECTION_LEAD + r'Contract\s+Act',
    ],
    'Partnership_Act': [
        _SECTION_LEAD + r'Partnership\s+Act',
    ],
}

def check_pdf_type(path):
    """Classify a PDF as 'text' (has an extractable text layer) or 'scanned'.

    Only the first two pages are sampled. The file counts as 'text' when the
    stripped text extracted from them is at least 100 characters long.

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        'text' or 'scanned'. Any failure to open or parse the file is treated
        as 'scanned' (best-effort classification, so the caller falls back to
        OCR).
    """
    try:
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` keeps the join safe if a page yields no text object
            # (NOTE(review): observed with some PyPDF2 versions; confirm).
            text = "".join((page.extract_text() or "") for page in reader.pages[:2])
    except Exception:
        # Deliberately broad: unreadable or corrupt files are handled as
        # scanned documents. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return 'scanned'
    return 'text' if len(text.strip()) >= 100 else 'scanned'

def get_document_text(path):
    """Return the plain text of a document, chosen by its file extension.

    Supported extensions (case-insensitive):
      * pdf  - text layer via PyPDF2 when check_pdf_type() reports 'text',
               otherwise every page is rendered to an image and OCR'd with
               pytesseract.
      * docx / doc - paragraph text via python-docx.
        NOTE(review): legacy binary .doc files are passed to docx.Document
        unchanged; confirm that library can actually open them.
      * txt  - read as UTF-8.

    Args:
        path: Filesystem path of the document.

    Returns:
        The extracted text, or "" for an unrecognised extension.
    """
    ext = path.lower().rsplit('.', 1)[-1]

    if ext == 'pdf':
        if check_pdf_type(path) == 'text':
            with open(path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # Pages are separated by a newline so the last token of one
                # page is never fused with the first token of the next, which
                # would defeat the whitespace-based citation patterns.
                return "\n".join((page.extract_text() or "") for page in reader.pages)
        images = convert_from_path(path)
        return "\n".join(pytesseract.image_to_string(img) for img in images)

    if ext in ('docx', 'doc'):
        doc = docx.Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == 'txt':
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    return ""

def extract_sections_with_regex(text, act_patterns=None):
    """Find statutory section citations in `text`.

    Every pattern is applied case-insensitively. Group 1 of a match is taken
    as the section number; it is stripped of non-word characters and must
    start with a digit, otherwise the match is ignored.

    Args:
        text: The document text to scan.
        act_patterns: Optional mapping of act name -> list of regex strings,
            each exposing the section number as capture group 1. Defaults to
            the module-level ACT_PATTERNS table.

    Returns:
        A defaultdict(list) mapping "<act name>_<section number>" to the list
        of full matched citation strings, in the order they were found.
    """
    if act_patterns is None:
        act_patterns = ACT_PATTERNS

    sections_found = defaultdict(list)
    for act_name, patterns in act_patterns.items():
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                # Skip, rather than blanket-catch, the two cases that could
                # fail: a pattern without a capture group, or a group that
                # did not participate in the match.
                raw_number = match.group(1) if match.re.groups >= 1 else None
                if raw_number is None:
                    continue
                # \W is equivalent to the previous [^\d\w] class, since \d is
                # a subset of \w.
                section_num = re.sub(r'\W', '', raw_number)
                if not section_num or not section_num[0].isdigit():
                    continue
                sections_found[f"{act_name}_{section_num}"].append(match.group(0))
    return sections_found

def load_hybrid_cooccurrence():
    """Load the co-occurrence database from 'output/hybrid_cooccurrence.json'.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.
    """
    db_path = 'output/hybrid_cooccurrence.json'
    if not os.path.exists(db_path):
        return {}
    with open(db_path, 'r') as handle:
        return json.load(handle)

# --- ANALYSIS LOGIC (FIXED) ---
def _collect_suggestions(extracted_keys, hybrid_db):
    """Aggregate co-occurrence evidence for sections absent from the document.

    Returns a dict mapping a suggested section key to
    {'score', 'type', 'count', 'val'}, where 'type' is 'explicit' as soon as
    any explicit co-occurrence has been recorded for that section.
    """
    present = set(extracted_keys)  # O(1) membership tests inside the loops
    suggestions = {}

    for section in extracted_keys:
        data = hybrid_db.get(section)
        if not data:
            continue

        # 1. Explicit co-occurrence: each link adds a fixed 1.0 to the score
        #    and its raw count to the running total.
        for rel, count in data.get('explicit_cooccurrence', {}).items():
            if rel in present:
                continue
            entry = suggestions.setdefault(
                rel, {'score': 0, 'type': 'explicit', 'count': 0, 'val': 0.0}
            )
            # Promote entries first discovered through a semantic link, so the
            # classification does not depend on the order sections are visited.
            entry['type'] = 'explicit'
            entry['score'] += 1.0
            entry['count'] += count

        # 2. Semantic co-occurrence: the score accumulates, and 'val' keeps
        #    the strongest single value seen.
        for rel, score in data.get('semantic_cooccurrence', {}).items():
            if rel in present:
                continue
            entry = suggestions.setdefault(
                rel, {'score': 0, 'type': 'semantic', 'count': 0, 'val': 0.0}
            )
            entry['score'] += score
            entry['val'] = max(entry['val'], score)

    return suggestions


def _display_confidence(data):
    """Return the confidence shown for one suggestion, capped at 1.0."""
    if data['type'] == 'explicit':
        # Explicit links start at 0.95 and gain 0.01 per co-occurrence,
        # counting at most 5 of them.
        raw = 0.95 + (0.01 * min(data.get('count', 0), 5))
    else:
        # Semantic links display their strongest stored value.
        raw = data.get('val', 0.0)
    return min(raw, 1.0)


# --- ANALYSIS LOGIC (FIXED) ---
def analyze_and_evaluate_single_doc(doc_path):
    """Print the top 5 suggested missing sections for one document.

    Loads the co-occurrence database, extracts the sections cited in the
    document, ranks related sections that the document does not cite by their
    accumulated score, and prints each with a confidence value, a 20-cell bar
    and, when available, the first stored context snippet. Finally prints the
    average confidence and a verdict line.

    Args:
        doc_path: Path of the document to analyse.

    Returns:
        None. All results are printed. The function returns early, after
        printing a message, when the database is missing/empty or no section
        citation is found.
    """
    hybrid_db = load_hybrid_cooccurrence()
    if not hybrid_db:
        print("Error: DB not found. Run Training first.")
        return

    text = get_document_text(doc_path)
    extracted = extract_sections_with_regex(text)

    if not extracted:
        print("No sections found.")
        return

    extracted_keys = list(extracted.keys())
    print(f"Analyzing: {os.path.basename(doc_path)}")
    print(f"Extracted: {extracted_keys}")

    suggestions = _collect_suggestions(extracted_keys, hybrid_db)
    sorted_sug = sorted(suggestions.items(), key=lambda x: x[1]['score'], reverse=True)[:5]

    print("\n" + "="*60)
    print("TOP 5 SUGGESTIONS & CONFIDENCE METRICS")
    print("="*60)

    confidence_scores = []

    for sec, data in sorted_sug:
        display_score = _display_confidence(data)
        confidence_scores.append(display_score)

        # Visual bar; clamped so a negative score cannot produce a bar that
        # is longer than 20 cells.
        bar_len = max(0, min(20, int(display_score * 20)))
        bar = "█" * bar_len + "░" * (20 - bar_len)

        print(f"Section: {sec:<20} | Confidence: {display_score:.2f} | {bar}")
        if sec in hybrid_db and hybrid_db[sec].get('contexts'):
            print(f"   Context: \"{hybrid_db[sec]['contexts'][0][:80]}...\"")

    if confidence_scores:
        avg_conf = np.mean(confidence_scores)
        print("-" * 60)
        print(f"Average Model Confidence: {avg_conf:.4f}")

        if avg_conf > 0.8: print("Metric: Model is VERY CERTAIN about these missing sections.")
        elif avg_conf > 0.5: print("Metric: Model has MODERATE certainty.")
        else: print("Metric: Model is GUESSING (Low Confidence).")
    else:
        print("No suggestions found.")

# Run
# Ask for a file name, resolve it inside the 'docs' directory, and analyse it
# only when that file actually exists.
input_file = input("Enter filename from 'docs/' to analyze: ")
full_path = os.path.join('docs', input_file)
if not os.path.exists(full_path):
    print("File not found.")
else:
    analyze_and_evaluate_single_doc(full_path)

Enter filename from 'docs/' to analyze:  Naveen_Singh_Naveen_Prasad_Singh_vs_State_Of_Bihar_Anr_on_21_June_2017.PDF


Analyzing: Naveen_Singh_Naveen_Prasad_Singh_vs_State_Of_Bihar_Anr_on_21_June_2017.PDF
Extracted: ['IPC_304B', 'IPC_302', 'IPC_34', 'CrPC_313', 'CrPC_354', 'CrPC_386', 'CrPC_304B', 'Evidence_Act_106', 'Evidence_Act_114']

TOP 5 SUGGESTIONS & CONFIDENCE METRICS
Section: IPC_307              | Confidence: 1.00 | ████████████████████
   Context: "offence under Section PPS 111 of 121 Conf 3-13 .doc 324 of the Indian Penal Code..."
Section: IPC_299              | Confidence: 1.00 | ████████████████████
   Context: "fences of culpable homicide, culpable homicide amounting to murder and
culpable ..."
Section: IPC_300              | Confidence: 1.00 | ████████████████████
   Context: "Referring to these Sections it was submitted by the learned Additional Public Pr..."
Section: IPC_304              | Confidence: 1.00 | ████████████████████
   Context: "r the offence under section 304 of the Indian  Penal   Code I  am   also of  
th..."
Section: CrPC_161             | Confidence: 1.00 | █████████