In [4]:
# ==========================================
# BLOCK 1: TRAINING & LOO EVALUATION
# ==========================================

import os
import json
import re
from collections import defaultdict
import PyPDF2
import docx
from pdf2image import convert_from_path
import pytesseract
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Module-level DistilBERT tokenizer and encoder, both loaded from the
# 'distilbert-base-uncased' checkpoint. They are created once here and read
# as globals by get_sentence_embedding().
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Citation patterns, keyed by a short act identifier. Each value is a list of
# regular expressions; in every pattern capture group 1 is the section number
# (digits followed by optional letters).
#
# extract_sections_with_regex() applies these with re.IGNORECASE, so the
# `[A-Z]*` suffix also accepts lower-case letters. Several patterns of the same
# act can match the same citation text (e.g. "Section 302 IPC" satisfies both
# the first and the last IPC pattern).
ACT_PATTERNS = {
    # Indian Penal Code: "Section 302 of the IPC", "IPC Section 302",
    # "u/s 302 IPC", "302 IPC".
    'IPC': [
        r'Section\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?(?:Indian\s+Penal\s+Code|IPC)',
        r'(?:IPC|Indian\s+Penal\s+Code)\s+(?:Section\s+)?(\d+[A-Z]*)',
        r'(?:u/s|under\s+section|read\s+with)\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?IPC',
        r'(\d+[A-Z]*)\s+(?:IPC|Indian\s+Penal\s+Code)'
    ],
    # Criminal Procedure Code, with or without dots in the abbreviation.
    'CrPC': [
        r'Section\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?(?:Cr\.?P\.?C\.?|CrPC|Criminal\s+Procedure\s+Code)',
        r'(?:Cr\.?P\.?C\.?|CrPC)\s+(?:Section\s+)?(\d+[A-Z]*)'
    ],
    'Evidence_Act': [
        r'Section\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?Evidence\s+Act',
        r'(\d+[A-Z]*)\s+of\s+(?:the\s+)?Evidence\s+Act'
    ],
    # The captured number may carry a numeric sub-section such as "34(2)".
    'Arbitration_Act': [
        r'Section\s+(\d+[A-Z]*(?:\(\d+\))?)\s+(?:of\s+)?(?:the\s+)?(?:ACA|Arbitration\s+(?:and\s+Conciliation\s+)?Act(?:,\s*1996)?)',
        r'(\d+[A-Z]*(?:\(\d+\))?)\s+of\s+(?:the\s+)?(?:ACA|Arbitration\s+Act)'
    ],
    'Contract_Act': [
        r'Section\s+(\d+[A-Z]*)\s+(?:of\s+)?(?:the\s+)?(?:ICA|Indian\s+Contract\s+Act(?:,\s*1872)?)',
        r'(\d+[A-Z]*)\s+of\s+(?:the\s+)?(?:ICA|Indian\s+Contract\s+Act)'
    ],
    # The captured number may carry an alphanumeric sub-section such as "4(a)".
    'Partnership_Act': [
        r'Section\s+(\d+[A-Z]*(?:\([a-z0-9]+\))?)\s+(?:of\s+)?(?:the\s+)?(?:Indian\s+)?Partnership\s+Act(?:,\s*1932)?',
        r'(\d+[A-Z]*(?:\([a-z0-9]+\))?)\s+of\s+(?:the\s+)?Partnership\s+Act'
    ]
}

def check_pdf_type(path):
    """Classify a PDF as text-based or scanned by probing its first two pages.

    Args:
        path: Location of the PDF file.

    Returns:
        'text' when the first two pages yield at least 100 non-blank
        characters of extractable text, otherwise 'scanned'. Any ordinary
        error while opening or parsing the file also yields 'scanned', so the
        caller falls back to OCR.
    """
    try:
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` guards against a page for which no text is returned.
            text = "".join((page.extract_text() or "") for page in reader.pages[:2])
    except Exception:
        # Best-effort probe: unreadable PDFs are treated as scanned. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate instead of being silently turned into 'scanned'.
        return 'scanned'
    return 'text' if len(text.strip()) >= 100 else 'scanned'

def get_document_text(path):
    """Return the plain text of a document, chosen by its file extension.

    Supported extensions (case-insensitive):
      * pdf  - extracted with PyPDF2 when check_pdf_type() reports 'text',
               otherwise every page is rendered to an image and OCR'd.
      * docx / doc - paragraphs read through python-docx, joined by newlines.
      * txt  - read as UTF-8; undecodable bytes are replaced rather than
               aborting the whole read.

    Args:
        path: Location of the document.

    Returns:
        The extracted text, or "" for an unsupported or missing extension.
    """
    # splitext only looks at a real suffix, so a dot-less file that happens to
    # be named "pdf" or "txt" is not mistaken for that type.
    ext = os.path.splitext(path)[1].lower().lstrip('.')

    if ext == 'pdf':
        if check_pdf_type(path) == 'text':
            with open(path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # `or ""` guards against a page for which no text is returned.
                return "".join((page.extract_text() or "") for page in reader.pages)
        images = convert_from_path(path)
        return "".join(pytesseract.image_to_string(img) for img in images)

    if ext in ('docx', 'doc'):
        # NOTE(review): legacy binary .doc files are routed through the same
        # loader as .docx - confirm that it can actually open them.
        doc = docx.Document(path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == 'txt':
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()

    return ""

def extract_context_snippet(text, match_pos, window=250):
    start = max(0, match_pos - window)
    end = min(len(text), match_pos + window)
    snippet = text[start:end].strip()
    sentences = re.split(r'[.!?]\s+', snippet)
    relevant = [s.strip() for s in sentences if len(s.strip()) > 30]
    if len(relevant) >= 2: return " ".join(relevant[:2])
    return relevant[0] if relevant else snippet[:200]

def extract_sections_with_regex(text):
    """Find statutory section citations in `text` using ACT_PATTERNS.

    Every pattern of every act is applied case-insensitively. Capture group 1
    is reduced to its word characters to form the section number, and matches
    whose number does not start with a digit are discarded.

    Several patterns of one act can hit the same citation (for instance
    "Section 302 IPC" satisfies two IPC patterns). Such duplicates are
    recorded only once, identified by the section key together with the
    offset of the captured number; the first pattern to match wins.

    Args:
        text: Document text to scan.

    Returns:
        A defaultdict(list) mapping "<act>_<section>" to a list of occurrence
        dicts with the keys 'position' (start offset of the whole match),
        'context' (snippet from extract_context_snippet) and 'matched_text'.
    """
    sections_found = defaultdict(list)
    # (section key, offset of the captured number) pairs already recorded.
    seen_citations = set()

    for act_name, patterns in ACT_PATTERNS.items():
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                try:
                    raw_section = match.group(1)
                except IndexError:
                    # Pattern without a capture group: nothing to record.
                    continue
                if not raw_section:
                    continue

                # NOTE(review): stripping non-word characters also removes the
                # parentheses of sub-sections, so "34(2)" becomes "342" and
                # shares a key with a genuine section 342. The key format is
                # kept as-is here in case other cells rely on it - confirm.
                section_num = re.sub(r'[^\d\w]', '', raw_section)
                if not section_num or not section_num[0].isdigit():
                    continue

                section_key = f"{act_name}_{section_num}"
                citation_id = (section_key, match.start(1))
                if citation_id in seen_citations:
                    continue
                seen_citations.add(citation_id)

                match_pos = match.start()
                sections_found[section_key].append({
                    'position': match_pos,
                    'context': extract_context_snippet(text, match_pos),
                    'matched_text': match.group(0),
                })
    return sections_found

def get_sentence_embedding(text):
    """Embed `text` with the module-level DistilBERT tokenizer and model.

    The input is tokenised (truncated to at most 512 tokens), passed through
    the model with gradient tracking disabled, and the last hidden state is
    averaged over dimension 1 (presumably the token axis - confirm).

    Args:
        text: Text to embed.

    Returns:
        The pooled embedding converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()

# --- TRAINING EXECUTION ---
# Pipeline: read every supported document in `docs_folder`, extract section
# citations, embed each section's contexts, then combine document-level
# co-occurrence counts with embedding similarity into one JSON table.
docs_folder = 'docs'
# Extensions are compared case-insensitively (so '.Pdf' or '.DOCX' are not
# skipped) and the listing is sorted, because os.listdir() returns entries in
# arbitrary order and the "first five contexts" used below depend on it.
doc_files = sorted(
    f for f in os.listdir(docs_folder)
    if f.lower().endswith(('.pdf', '.docx', '.doc', '.txt'))
)
print(f"Found {len(doc_files)} documents for training.\n")

doc_sections_map = {}              # filename -> {section key: [occurrence, ...]}
all_contexts = defaultdict(list)   # section key -> context snippets, all documents
all_sections_data = {}             # section key -> occurrences, all documents

for idx, filename in enumerate(doc_files, 1):
    text = get_document_text(os.path.join(docs_folder, filename))
    sections_found = extract_sections_with_regex(text)
    if sections_found:
        doc_sections_map[filename] = sections_found
        for section_key, occurrences in sections_found.items():
            # extend() into a fresh list: no repeated list copying, and the
            # per-document lists in doc_sections_map are not aliased.
            all_sections_data.setdefault(section_key, []).extend(occurrences)
            all_contexts[section_key].extend(occ['context'] for occ in occurrences)
    print(f"Processed {idx}/{len(doc_files)}: {filename}")

print("\nComputing Embeddings...")
# One embedding per section, built from its first five context snippets.
embeddings = {
    k: get_sentence_embedding(" ".join(o['context'] for o in v[:5]))
    for k, v in all_sections_data.items()
}

print("Building Hybrid Matrix...")
# Explicit signal: number of documents in which two sections both appear.
explicit_cooccur = defaultdict(lambda: defaultdict(int))
for doc_name, sections_data in doc_sections_map.items():
    keys = list(sections_data.keys())
    for i, sec1 in enumerate(keys):
        for sec2 in keys[i+1:]:
            explicit_cooccur[sec1][sec2] += 1
            explicit_cooccur[sec2][sec1] += 1

# Semantic signal: section pairs whose embeddings have cosine similarity >= 0.7.
semantic_sim = defaultdict(dict)
keys = list(embeddings.keys())
for i, sec1 in enumerate(keys):
    for sec2 in keys[i+1:]:
        sim = cosine_similarity(embeddings[sec1], embeddings[sec2])[0][0]
        if sim >= 0.7:
            rounded_sim = round(float(sim), 3)
            semantic_sim[sec1][sec2] = rounded_sim
            semantic_sim[sec2][sec1] = rounded_sim

hybrid_cooccurrence = {}
all_keys = set(explicit_cooccur) | set(semantic_sim) | set(all_contexts)

# Sorted iteration keeps the key order of the JSON file stable between runs.
for section in sorted(all_keys):
    # Keep only pairs that co-occur in at least two documents.
    explicit = {k: v for k, v in explicit_cooccur.get(section, {}).items() if v >= 2}
    semantic = semantic_sim.get(section, {})
    # dict.fromkeys de-duplicates while preserving first-seen order; slicing a
    # set of strings would pick a different five contexts on each run.
    contexts = list(dict.fromkeys(all_contexts.get(section, [])))[:5]
    if explicit or semantic or contexts:
        hybrid_cooccurrence[section] = {
            'explicit_cooccurrence': explicit,
            'semantic_cooccurrence': semantic,
            'contexts': contexts
        }

os.makedirs('output', exist_ok=True)
with open('output/hybrid_cooccurrence.json', 'w') as f:
    json.dump(hybrid_cooccurrence, f, indent=2)

# --- LOO EVALUATION METRICS ---
def evaluate_recommender(doc_map, hybrid_db):
    """Print leave-one-out ranking metrics for the co-occurrence recommender.

    For each document with at least two sections, every section is hidden in
    turn and the remaining ones act as the query. Candidates are scored by
    summing, over the query sections, their explicit co-occurrence counts
    (weight 1.0) and semantic similarities (weight 0.5) from `hybrid_db`;
    query sections themselves are removed from the candidates.

    Printed metrics, each averaged over all trials:
      * Precision@1 - the hidden section is ranked first.
      * Recall@5    - the hidden section is among the top five.
      * MRR         - reciprocal rank of the hidden section, counted as 0
                      when it is not within the top five.

    NOTE(review): if `hybrid_db` was built from the same documents as
    `doc_map`, its counts already include the hidden section's document -
    confirm whether that is intended.

    Args:
        doc_map: Mapping of document name to a mapping keyed by section.
        hybrid_db: Mapping of section to a dict that may contain
            'explicit_cooccurrence' and 'semantic_cooccurrence' mappings.

    Returns:
        None; results are written to standard output.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("RUNNING RECOMMENDER METRICS (LOO)")
    print(banner)

    top1_hits = 0
    top5_hits = 0
    n_trials = 0
    rr_values = []

    for sections_data in doc_map.values():
        present = list(sections_data.keys())
        if len(present) < 2:
            continue

        for held_out_idx, hidden in enumerate(present):
            visible = present[:held_out_idx] + present[held_out_idx + 1:]

            # Accumulate candidate scores from every query section.
            scores = {}
            for known in visible:
                if known not in hybrid_db:
                    continue
                entry = hybrid_db[known]
                for candidate, count in entry.get('explicit_cooccurrence', {}).items():
                    scores[candidate] = scores.get(candidate, 0) + (count * 1.0)
                for candidate, similarity in entry.get('semantic_cooccurrence', {}).items():
                    scores[candidate] = scores.get(candidate, 0) + (similarity * 0.5)

            # A section already in the query is never a valid suggestion.
            for known in visible:
                scores.pop(known, None)

            # Stable sort: ties keep the order in which candidates were added.
            ranked = sorted(scores, key=scores.get, reverse=True)
            shortlist = ranked[:5]
            n_trials += 1

            if ranked and ranked[0] == hidden:
                top1_hits += 1
            if hidden in shortlist:
                top5_hits += 1
                rr_values.append(1 / (shortlist.index(hidden) + 1))
            else:
                rr_values.append(0)

    if n_trials <= 0:
        print("Not enough data to evaluate.")
        return

    print(f"Precision@1: {top1_hits / n_trials:.2%}")
    print(f"Recall@5:    {top5_hits / n_trials:.2%}")
    print(f"MRR:         {np.mean(rr_values):.4f}")

# Score the recommender on the per-document sections collected during training,
# using the hybrid co-occurrence table built above.
evaluate_recommender(doc_sections_map, hybrid_cooccurrence)

Found 20 documents for training.

Processed 1/20: Abhijeet_Raj_vs_State_Govt_Of_Nct_Of_Delhi_on_27_April_2016.PDF
Processed 2/20: Anbazhagan_vs_The_State_Rep_By_The_Inspector_Of_Police_on_20_July_2023.PDF
Processed 3/20: Anda_And_Ors_vs_The_State_Of_Rajasthan_on_9_March_1965.PDF
Processed 4/20: Ashapura_Minechem_Ltd_vs_Indian_Bureau_Of_Mines_Thr_Minstry_And_on_6_June_2025.PDF
Processed 5/20: Board_Of_Control_For_Cricket_In_India_vs_M_S_Rendezvous_Sports_World_And_6_on_17_June_2025.PDF
Processed 6/20: Ibra_Akanda_And_Ors_vs_Emperor_on_8_February_1944.PDF
Processed 7/20: Jayrambhai_Panchiyabhai_Gamit_vs_State_Of_Gujarat_on_7_March_2017.PDF
Processed 8/20: Naredco_West_Foundation_vs_Citispace_And_6_Ors_on_19_June_2025.PDF
Processed 9/20: Ngo_Alliance_For_Governance_And_vs_State_Of_Maharashtra_And_Ors_on_19_June_2025.PDF
Processed 10/20: Nusli_N_Wadia_And_2_Ors_vs_Bastion_Constructions_on_27_June_2025.PDF
Processed 11/20: Pandurang_D_Chalke_And_Anr_vs_Citispace_And_Ors_on_19_June_2025.PDF
