In [4]:
# ==========================================
# BLOCK 3: CLASSIFICATION, STORAGE & DEBUGGING (FIXED)
# ==========================================

import os
import json
import shutil
import google.generativeai as genai
from sklearn.metrics import accuracy_score
import PyPDF2
import docx
from pdf2image import convert_from_path
import pytesseract

# --- 1. SETUP API KEY ---
# The key is read from the environment rather than embedded in the source:
# anything committed in a notebook or script is readable by everyone who can
# see the file. The key that used to be hard-coded here must be treated as
# leaked and revoked in the Google AI console.
_api_key = os.environ.get('GOOGLE_API_KEY')
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=_api_key)
llm = genai.GenerativeModel('gemini-2.5-flash')

# --- 2. DEFINE GROUND TRUTH ---
# Hand-assigned labels keyed by file name. Only files in the documents folder
# whose name appears here are evaluated. 'nature' is what the predicted
# 'case_nature' is scored against; 'category' is only printed next to the
# prediction for manual comparison.
GROUND_TRUTH = {
    'Naveen_Singh_Naveen_Prasad_Singh_vs_State_Of_Bihar_Anr_on_21_June_2017.PDF': dict(
        nature='Criminal',
        category='Murder',
    ),
}

# --- HELPER FUNCTIONS ---
def check_pdf_type(path):
    """Guess whether a PDF has an extractable text layer or is a scan.

    Only the first two pages are sampled, so the check stays cheap on long
    documents.

    Args:
        path: Location of the PDF file.

    Returns:
        'text' when the sampled pages yield at least 100 characters after
        surrounding whitespace is stripped, otherwise 'scanned'. Any failure
        to open or parse the file also results in 'scanned', which sends the
        caller down the OCR route.
    """
    try:
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` is defensive: a page that yields no text must not abort
            # the whole check with a TypeError.
            sample = "".join(
                page.extract_text() or "" for page in reader.pages[:2]
            )
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 'scanned'
    return 'text' if len(sample.strip()) >= 100 else 'scanned'

def get_document_text(path):
    """Extract plain text from a PDF, Word or plain-text document.

    The handler is chosen from the file extension (case-insensitive):

    * pdf  - embedded text via PyPDF2 when check_pdf_type() reports 'text',
             otherwise OCR of every page via pdf2image + pytesseract.
    * docx / doc - paragraph text via python-docx, joined with newlines.
    * txt  - file contents decoded as UTF-8.

    Args:
        path: Location of the document.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions and when OCR fails. Errors raised while reading a
        text-layer PDF or a Word file are not caught here.
    """
    # splitext only looks at the final path component, so a dot in a directory
    # name or a dot-less file that happens to be called "pdf" is not mistaken
    # for an extension.
    ext = os.path.splitext(path)[1].lower().lstrip('.')

    if ext == 'pdf':
        if check_pdf_type(path) == 'text':
            with open(path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # `or ""` keeps a page without text from breaking the join.
                return "".join(page.extract_text() or "" for page in reader.pages)
        try:
            images = convert_from_path(path)
            return "".join(pytesseract.image_to_string(img) for img in images)
        except Exception as e:
            # Still best-effort, but say why: otherwise a failed OCR run is
            # indistinguishable from a genuinely empty file downstream.
            print(f">> OCR ERROR ({path}): {e}")
            return ""

    if ext in ('docx', 'doc'):
        # NOTE(review): '.doc' is routed to python-docx exactly like '.docx';
        # legacy binary .doc files are presumably not readable by it - confirm.
        doc = docx.Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == 'txt':
        # errors='replace' so one badly encoded byte does not abort the read.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()

    return ""

def _normalise_classification(raw):
    """Coerce a parsed model reply into the dict shape callers index into."""
    if not isinstance(raw, dict):
        raise ValueError(f"expected a JSON object, got {type(raw).__name__}")

    categories = raw.get('categories')
    if isinstance(categories, str):
        categories = [categories]
    elif isinstance(categories, (list, tuple)):
        categories = [str(item) for item in categories if item]
    else:
        categories = []

    try:
        confidence = float(raw.get('confidence', 0.0))
    except (TypeError, ValueError):
        confidence = 0.0
    confidence = min(1.0, max(0.0, confidence))

    # Extra keys the model may have added are kept untouched.
    return {
        **raw,
        "case_nature": str(raw.get('case_nature') or "Other"),
        "categories": categories,
        "confidence": confidence,
    }


def classify_with_confidence(text):
    """Ask the LLM to classify a legal document and report its confidence.

    Only the first 3000 characters of *text* are sent to the model.

    Args:
        text: Document text; None, empty or whitespace-only text is not sent
            to the model.

    Returns:
        A dict that always contains 'case_nature' (str), 'categories' (list
        of str) and 'confidence' (float clamped to 0.0-1.0). Empty input
        yields category 'Empty_File'; any failure while calling the model or
        parsing its reply is printed and yields category 'API_Error'. Both
        fallbacks use case_nature 'Other' and confidence 0.0.
    """
    if not text or not text.strip():
        return {"case_nature": "Other", "categories": ["Empty_File"], "confidence": 0.0}

    prompt = f"""Analyze this legal text (max 3000 chars):
    {text[:3000]}
    
    Classify as Criminal or Civil, and provide categories.
    Also provide a 'confidence' score (0.0 to 1.0).
    
    Return ONLY JSON:
    {{
      "case_nature": "Criminal/Civil/Both",
      "categories": ["Cat1"],
      "confidence": 0.95
    }}
    """
    try:
        response = llm.generate_content(prompt)
        reply = response.text
        # Take the outermost {...} span instead of stripping ``` fences: this
        # also copes with prose the model may add before or after the JSON.
        start, end = reply.find('{'), reply.rfind('}')
        if start == -1 or end < start:
            raise ValueError("no JSON object in model reply")
        return _normalise_classification(json.loads(reply[start:end + 1]))
    except Exception as e:
        print(f">> API ERROR: {e}")
        return {"case_nature": "Other", "categories": ["API_Error"], "confidence": 0.0}

def evaluate_classification_system(docs_folder='docs'):
    """Score the classifier against the hand-labelled GROUND_TRUTH files.

    Every file in *docs_folder* whose name is a key of GROUND_TRUTH is run
    through get_document_text() and classify_with_confidence(). The predicted
    'case_nature' is compared with the labelled 'nature'; the first predicted
    category is printed next to the labelled one but is not scored.

    Args:
        docs_folder: Directory holding the documents to evaluate. Defaults to
            'docs', the folder that was previously hard-coded.

    Returns:
        None. Per-file results, overall accuracy and mean confidence are
        printed. A missing folder or a folder without labelled files is
        reported and ends the evaluation early.
    """
    print("\n" + "="*60)
    print("EVALUATING CLASSIFICATION MODULE")
    print("="*60)

    if not os.path.isdir(docs_folder):
        # os.listdir would raise FileNotFoundError here.
        print(f"Documents folder not found: {docs_folder}")
        return

    # Sorted so that the report order does not depend on the file system.
    test_files = sorted(f for f in os.listdir(docs_folder) if f in GROUND_TRUTH)

    if not test_files:
        print(f"No files match GROUND_TRUTH. Checked: {docs_folder}")
        return

    print(f"Testing {len(test_files)} documents...")

    y_true_nature = []
    y_pred_nature = []
    confidences = []

    for filename in test_files:
        text = get_document_text(os.path.join(docs_folder, filename))
        prediction = classify_with_confidence(text)
        if not isinstance(prediction, dict):
            prediction = {}
        truth = GROUND_TRUTH[filename]

        # The reply comes from an LLM, so no key is guaranteed to be present.
        pred_nature = prediction.get('case_nature', 'Other')
        try:
            confidence = float(prediction.get('confidence', 0))
        except (TypeError, ValueError):
            confidence = 0.0

        categories = prediction.get('categories')
        if not categories:
            pred_cat = "None"
        elif isinstance(categories, list):
            pred_cat = categories[0]
        else:
            # A bare string must not be reduced to its first character.
            pred_cat = str(categories)

        y_true_nature.append(truth['nature'])
        y_pred_nature.append(pred_nature)
        confidences.append(confidence)

        print(f"File: {filename}")
        print(f"  True: {truth['nature']} ({truth['category']})")
        print(f"  Pred: {pred_nature} ({pred_cat}) [Conf: {confidence:.2f}]")

    acc = accuracy_score(y_true_nature, y_pred_nature)
    avg_conf = sum(confidences) / len(confidences)

    print("\n" + "="*60)
    print(f"Accuracy:        {acc:.2%}")
    print(f"Avg Confidence:  {avg_conf:.4f}")

evaluate_classification_system()


def _safe_folder_name(label, fallback='Other'):
    """Turn a model-supplied label into a single, filesystem-safe folder name."""
    text = str(label).strip()
    # Path separators and characters Windows rejects are replaced, so a label
    # such as "Criminal/Civil" stays one folder instead of becoming two levels.
    cleaned = "".join(
        '_' if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch for ch in text
    ).strip(' .')
    # strip(' .') also turns "." and ".." into "", which falls back here.
    return cleaned or fallback


# --- STORAGE ---
# Classify one user-chosen file from 'docs' and copy it to
# output/case_docs/<case_nature>/<first category>/.
print("\n" + "="*60)
target_file = input("Enter file to store (or press Enter to skip): ").strip()
if target_file:
    full_path = os.path.join('docs', target_file)
    # isfile rather than exists: a directory name must not reach copy2.
    if os.path.isfile(full_path):
        text = get_document_text(full_path)
        data = classify_with_confidence(text)
        fields = data if isinstance(data, dict) else {}

        base = 'output/case_docs'
        nat = _safe_folder_name(fields.get('case_nature', 'Other'))
        cats = fields.get('categories', ['Other'])
        if isinstance(cats, list):
            # An empty list used to become a folder literally named "[]".
            raw_cat = cats[0] if cats else 'Other'
        else:
            raw_cat = cats if cats else 'Other'
        cat = _safe_folder_name(raw_cat)

        dest = os.path.join(base, nat, cat)
        os.makedirs(dest, exist_ok=True)
        # basename: if the user typed a sub-path, store the file directly in
        # dest instead of pointing at a sub-folder that was never created.
        shutil.copy2(full_path, os.path.join(dest, os.path.basename(target_file)))
        print(f"Stored in: {dest}")
        print(f"Data: {data}")
    else:
        print("File not found.")


EVALUATING CLASSIFICATION MODULE
Testing 1 documents...
File: Naveen_Singh_Naveen_Prasad_Singh_vs_State_Of_Bihar_Anr_on_21_June_2017.PDF
  True: Criminal (Murder)
  Pred: Criminal (Criminal Law) [Conf: 1.00]

Accuracy:        100.00%
Avg Confidence:  1.0000



Enter file to store (or press Enter to skip):  Naveen_Singh_Naveen_Prasad_Singh_vs_State_Of_Bihar_Anr_on_21_June_2017.PDF


Stored in: output/case_docs\Criminal\Criminal Appeal
Data: {'case_nature': 'Criminal', 'categories': ['Criminal Appeal', 'Criminal Revision', 'Criminal Law'], 'confidence': 1.0}
