In [None]:
!pip install python-docx spacy pandas tqdm

In [None]:
!pip install ollama

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
import subprocess
# Start the Ollama server as a detached child process; its stdout and stderr
# are discarded, so startup errors will not show up in the notebook.
# NOTE(review): nothing here waits for the server to begin accepting
# connections — confirm it is up before the `ollama pull` / `ollama.chat` cells.
subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

<Popen: returncode: None args: ['ollama', 'serve']>

In [None]:
!ollama pull gpt-oss:20b

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026

In [None]:
import zipfile
import os

zip_file_path = "/content/Tn_2nd_500_text_miss.zip"
extract_dir = "/content"

# Make sure the destination exists before extracting into it.
os.makedirs(extract_dir, exist_ok=True)

# Extract member by member so one bad entry does not abort the whole archive,
# and remember which members failed so the summary below is truthful.
failed_members = []
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    members = zip_ref.infolist()
    for member in members:
        try:
            zip_ref.extract(member, extract_dir)
        except Exception as e:
            failed_members.append(member.filename)
            print(f"⚠️ Failed to extract {member.filename}: {e}")

# Only claim success when every member was actually extracted.
if failed_members:
    print(
        f"⚠️ '{zip_file_path}' extracted to '{extract_dir}' with "
        f"{len(failed_members)} of {len(members)} member(s) failing"
    )
else:
    print(f"✅ '{zip_file_path}' extracted successfully to '{extract_dir}'")

✅ '/content/Tn_2nd_500_text_miss.zip' extracted successfully to '/content'


In [None]:
import os
import re
from docx import Document
from tqdm import tqdm

# === CONFIGURATION ===
INPUT_FOLDER = r"/content/Tn_2nd_500_text"  # folder containing .docx
OUTPUT_FOLDER = r"/content/Tn_2nd_500_text_texts"  # folder to save extracted text

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Regex to detect PART lines (Roman numerals).
# Matches a whole line made of: optional leading whitespace, the word PART,
# optional hyphens/whitespace, then a run of I/V/X characters that ends at a
# word boundary. Matching is case-insensitive; group 1 is the numeral as written.
PART_LINE_RE = re.compile(
    r"^\s*PART\s*[-\s]*([IVX]+)\b.*$",
    re.IGNORECASE,
)

def extract_until_part2(doc_path):
    """
    Collect the non-empty paragraphs of a .docx file that precede its PART II heading.

    Each paragraph is whitespace-stripped; blank ones are dropped. Collection
    stops at the first paragraph that PART_LINE_RE recognises with numeral "II"
    (compared in upper case); that heading itself is not included. If no such
    heading is seen, every non-empty paragraph is returned.

    Returns the collected paragraphs joined with newlines, or "" when the
    document cannot be opened (a warning is printed in that case).
    """
    try:
        document = Document(doc_path)
    except Exception as e:
        print(f"⚠️ Could not read {doc_path}: {e}")
        return ""

    collected = []
    for paragraph in document.paragraphs:
        line = paragraph.text.strip()
        if line == "":
            continue

        heading = PART_LINE_RE.match(line)
        # Only a PART II heading ends the extraction; other PART lines are kept.
        if heading is not None and heading.group(1).upper() == "II":
            break

        collected.append(line)

    return "\n".join(collected).strip()

def process_folder(input_folder, output_folder):
    """
    Run extract_until_part2 on every .docx in input_folder and save the results.

    Files whose name starts with "~$" are ignored; the ".docx" extension is
    matched case-insensitively. For each document that yields text, a UTF-8
    "<stem>.txt" file is written into output_folder; documents that yield
    nothing only produce a warning line.
    """
    docx_names = []
    for entry in os.listdir(input_folder):
        if entry.startswith("~$"):
            continue
        if entry.lower().endswith(".docx"):
            docx_names.append(entry)

    for fname in tqdm(docx_names, desc="Extracting text until PART II"):
        extracted = extract_until_part2(os.path.join(input_folder, fname))

        if not extracted:
            print(f"⚠️ No text extracted from {fname}")
            continue

        stem, _ext = os.path.splitext(fname)
        target = os.path.join(output_folder, stem + ".txt")
        with open(target, "w", encoding="utf-8") as f:
            f.write(extracted)

# === RUN ===
# Process every .docx under INPUT_FOLDER, writing one .txt per document into OUTPUT_FOLDER.
process_folder(INPUT_FOLDER, OUTPUT_FOLDER)
print(f"\n✅ Extraction complete. Files saved in: {OUTPUT_FOLDER}")


In [None]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm
import ollama

# ====== CONFIGURATION ======
MY_TEXT_FOLDER = r"/content/Tn_2nd_500_text_texts"  # Folder with .txt files
RESULTS_FILE = r"/content/results.xlsx"  # Output Excel file
MODEL = "gpt-oss:20b"  # Ollama model
# Maximum number of files main_process() handles; None means no limit.
STOP_LIMIT = None

# ---------- UTILITIES ----------
def load_text_file(file_path):
    """Return the UTF-8 contents of file_path, or "" (after a warning) if it cannot be read."""
    contents = ""
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print(f"⚠️ Skipping invalid file: {file_path} ({e})")
    return contents

def clean_location(location):
    """
    Reduce a location string to a short place name.

    - Falsy input (None, "") returns "".
    - If the value contains commas, the last non-empty comma-separated part is
      returned (e.g. "12 Main Road, Chennai" -> "Chennai").
    - Otherwise everything from the first street-type word (highway, road,
      street, main, lane; case-insensitive, whole word) onward is removed.

    Returns the cleaned, whitespace-stripped string.
    """
    if not location:
        return ""
    if "," in location:
        # Walk the parts from the end and keep the last one that has content,
        # so a trailing comma ("Chennai,") does not erase the value.
        for part in reversed(location.split(",")):
            part = part.strip()
            if part:
                return part
        return ""
    location = re.sub(
        r"\b(highway|road|street|main|lane)\b.*", "", location, flags=re.IGNORECASE
    ).strip()
    return location


def get_prompt():
    """Return the system prompt asking the model for state, location and department as JSON."""
    # NOTE(review): a later cell defines another get_prompt(); in a shared
    # kernel whichever definition ran last is the one that gets called.
    instructions = """Extract 3 fields from this IR content:

- state: full Indian state name.
- location: town/city/taluka (short, no full address or PIN).
- department: ultimate parent administrative department, always ending with "Department".
  - Railways: include sub-department like "Indian Railways - Mechanical Department".
  - Local offices (e.g., Tahsildar): pick the parent department (e.g., "Revenue Department").
  - Do NOT return offices, posts, banks, audit offices, ministries, schemes, or intermediate bodies.
  - Normalize ministries to departments (e.g., Ministry of Finance → Finance Department).
  - Infer the parent department if not explicitly mentioned.

Output strictly as JSON: {"state":"...","location":"...","department":"..."} with no extra text.
"""
    return instructions



def _parse_model_json(raw):
    """Decode the JSON object in a model reply, tolerating code fences or prose around it."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Models often wrap the object in ```json fences or add a sentence;
        # retry on the outermost {...} span before giving up.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(raw[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def _field_text(parsed, key):
    """Return parsed[key] as stripped text; missing or null values become ""."""
    value = parsed.get(key)
    return "" if value is None else str(value).strip()


def analyze_ir_content(doc_text):
    """
    Send text to the model and return parsed results (State, Location, Department).

    Args:
        doc_text: text of one inspection report.

    Returns:
        (state, location, department) as strings. Blank input returns
        ("", "", "") without calling the model. If the reply contains no
        usable JSON object, a warning is printed and the raw reply is returned
        in the department slot so it can be reviewed by hand.
    """
    if not doc_text.strip():
        return "", "", ""

    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": get_prompt()},
            {"role": "user", "content": doc_text},
        ]
    )

    # Guard against a null content field so .strip() cannot fail.
    content = (response["message"]["content"] or "").strip()

    try:
        parsed = _parse_model_json(content)
        # JSON null / non-string values are coerced instead of raising.
        state = _field_text(parsed, "state")
        location = clean_location(_field_text(parsed, "location"))
        department = _field_text(parsed, "department")
    except Exception as e:
        print("⚠️ JSON parse error:", e)
        print("Raw output:", content[:200])
        state, location, department = "", "", content

    return state, location, department

# ---------- MAIN PROCESS ----------
def main_process():
    """
    Run the model over every .txt file in MY_TEXT_FOLDER and save one row per file.

    Files are taken in sorted name order (names starting with "~$" are skipped)
    and truncated to STOP_LIMIT entries when it is set. Files that load as
    blank are skipped. The collected rows are written to RESULTS_FILE even if
    a model call raises part-way through; the exception still propagates.
    """
    rows = []
    # sorted(): os.listdir order is arbitrary, which would make the STOP_LIMIT
    # subset and the row order differ between runs.
    files = sorted(
        f for f in os.listdir(MY_TEXT_FOLDER)
        if f.lower().endswith(".txt") and not f.startswith("~$")
    )

    if STOP_LIMIT is not None:
        files = files[:STOP_LIMIT]  # apply limit if set

    try:
        for fname in tqdm(files, desc="Processing text files"):
            fpath = os.path.join(MY_TEXT_FOLDER, fname)
            text = load_text_file(fpath)

            if not text.strip():
                continue

            state, location, department = analyze_ir_content(text)

            rows.append({
                "Folder Name": os.path.basename(MY_TEXT_FOLDER),
                "Filename": fname,
                "State": state,
                "Location": location,
                "Department": department,
            })
    finally:
        # Each file costs a slow model call; persist what was collected so a
        # late failure does not throw away the earlier results.
        df = pd.DataFrame(rows)
        df.to_excel(RESULTS_FILE, index=False, engine="openpyxl")

    print(f"✅ Processing complete. Results saved in {RESULTS_FILE}")

# ---------- RUN ----------
# Start the batch only when this code is executed as the main module.
if __name__ == "__main__":
    main_process()


Processing text files: 100%|██████████| 50/50 [10:46<00:00, 12.93s/it]


✅ Processing complete. Results saved in /content/results.xlsx


In [None]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm
import ollama

# === CONFIG ===
# NOTE(review): this is the same path the extraction cell uses as its .docx
# input folder, while that cell writes the .txt files to
# "/content/Tn_2nd_500_text_texts"; main() below reads ".txt" files from
# INPUT_FOLDER — confirm which folder is intended.
INPUT_FOLDER = r"/content/Tn_2nd_500_text"
OUTPUT_FILE = r"/content/results1.xlsx"  # Excel file main() rewrites after each processed file
MODEL = "gpt-oss:20b"  # Ollama model name passed to ollama.chat

# === REGEX HELPERS ===
# Two DD-MM-YYYY (or DD/MM/YYYY) dates joined by "to" or "-".
DATE_RANGE_RE = re.compile(
    r"(\d{2}[-/]\d{2}[-/]\d{4})\s*(?:to|-)\s*(\d{2}[-/]\d{2}[-/]\d{4})",
    re.IGNORECASE
)
# Four-digit year range, e.g. "2020-2021" or "2020 / 2021".
YEAR_RANGE_RE = re.compile(r"(20\d{2}\s*[-/]\s*20\d{2})")
# Short year range, e.g. "2019-20" or "2019/20". The look-arounds stop it from
# firing inside dates such as "2020-12-31" or "01.04.2019 - 20.04.2019".
_SHORT_YEAR_RANGE_RE = re.compile(
    r"(?<![\d/.\-])(20\d{2})\s*[-/]\s*(\d{2})(?![\d/\-]|\.\d)"
)

def extract_regex(text):
    """
    Try regex first to extract audit dates and financial year text.

    Args:
        text: document text to scan.

    Returns:
        (date_of_audit, financial_year). date_of_audit is
        "DD-MM-YYYY to DD-MM-YYYY" built from the first date range found.
        financial_year is the first four-digit year range ("2020-2021") or,
        when there is none, the first short range whose two-digit end is the
        year after the start ("2019-20"). "/" is normalised to "-" in both.
        Either element is None when nothing matched.
    """
    date_of_audit, financial_year = None, None

    # Strict DD-MM-YYYY to DD-MM-YYYY
    m = DATE_RANGE_RE.search(text)
    if m:
        date_of_audit = f"{m.group(1).replace('/', '-')} to {m.group(2).replace('/', '-')}"

    # Financial year: the four-digit form wins whenever it is present.
    y = YEAR_RANGE_RE.search(text)
    if y:
        financial_year = y.group(1).replace("/", "-").strip()
    else:
        # Short form: require end == start + 1 (mod 100) so arbitrary
        # "20xx-nn" number pairs are not mistaken for a financial year.
        for short in _SHORT_YEAR_RANGE_RE.finditer(text):
            start_year, end_suffix = int(short.group(1)), int(short.group(2))
            if end_suffix == (start_year + 1) % 100:
                financial_year = short.group(0).replace("/", "-").strip()
                break

    return date_of_audit, financial_year

# === LLM PROMPT (fallback only) ===
def get_prompt():
    """Return the system prompt asking the model for date_of_audit and financial_year as JSON."""
    # NOTE(review): an earlier cell also defines get_prompt(); running this
    # cell replaces that definition in a shared kernel.
    instructions = """Extract 2 fields from this IR text:

- date_of_audit → actual audit date(s). Format strictly "DD-MM-YYYY to DD-MM-YYYY" or "YYYY-YYYY". If missing, return "Not Found".
- financial_year → return the exact reporting/financial year phrase as it appears in text (e.g., "2019-2020", "April 2018 to March 2019"). If missing, return "Not Found".

Output only JSON:
{"date_of_audit":"...","financial_year":"..."}
"""
    return instructions

def analyze_with_llm(text):
    """
    Use model only if regex fails or is missing.

    Args:
        text: document text sent to the model as the user message.

    Returns:
        (date_of_audit, financial_year) as strings. A field that is absent,
        null or blank in the reply — or any reply without a usable JSON
        object — yields "Not Found".
    """
    resp = ollama.chat(
        model=MODEL,
        messages=[{"role":"system","content":get_prompt()},
                  {"role":"user","content":text}]
    )
    # Guard against a null content field so .strip() cannot fail.
    out = (resp["message"]["content"] or "").strip()

    try:
        try:
            data = json.loads(out)
        except json.JSONDecodeError:
            # Replies are often wrapped in ```json fences or prose; retry on
            # the outermost {...} span before giving up.
            start, end = out.find("{"), out.rfind("}")
            if start == -1 or end < start:
                raise
            data = json.loads(out[start:end + 1])
        if not isinstance(data, dict):
            raise ValueError("model reply is not a JSON object")
    except ValueError:
        # json.JSONDecodeError is a ValueError; nothing broader is swallowed.
        return "Not Found", "Not Found"

    def _field(key):
        """Return data[key] as stripped text, or "Not Found" when empty."""
        value = data.get(key)
        cleaned = "" if value is None else str(value).strip()
        return cleaned or "Not Found"

    return _field("date_of_audit"), _field("financial_year")

def analyze(text):
    """
    Return (date_of_audit, financial_year) for one document.

    Blank text gives ("Not Found", "Not Found") immediately. Otherwise the
    regex extractor runs first, and the model is consulted only when at least
    one field is still missing; model values fill only the missing fields.
    Any field that ends up empty is reported as "Not Found".
    """
    if text.strip() == "":
        return "Not Found", "Not Found"

    audit_dates, fin_year = extract_regex(text)

    # Ask the model only when the regex pass left a gap.
    needs_llm = not (audit_dates and fin_year)
    if needs_llm:
        llm_dates, llm_year = analyze_with_llm(text)
        audit_dates = audit_dates if audit_dates else llm_dates
        fin_year = fin_year if fin_year else llm_year

    return (
        audit_dates if audit_dates else "Not Found",
        fin_year if fin_year else "Not Found",
    )

# === MAIN LOOP ===
def main():
    """
    Extract audit date and financial year for every .txt file in INPUT_FOLDER.

    Files are processed in sorted name order; the ".txt" extension is matched
    case-insensitively and names starting with "~$" are skipped, matching the
    other cells of this notebook. OUTPUT_FILE is rewritten after each file so
    partial results survive an interruption.
    """
    rows = []
    # sorted() + lower(): os.listdir order is arbitrary and a ".TXT" file would
    # otherwise be silently ignored.
    files = sorted(
        f for f in os.listdir(INPUT_FOLDER)
        if f.lower().endswith(".txt") and not f.startswith("~$")
    )

    for f in tqdm(files, desc="Processing"):
        path = os.path.join(INPUT_FOLDER, f)
        with open(path, "r", encoding="utf-8") as fh:
            text = fh.read()

        date_audit, fin_year = analyze(text)

        rows.append({
            "Filename": f,
            "Date of Audit": date_audit,
            "Financial Year": fin_year
        })

        # Save after each file
        pd.DataFrame(rows).to_excel(OUTPUT_FILE, index=False)

    print("✅ Done. Results saved in:", OUTPUT_FILE)

# Start the date/financial-year extraction only when executed as the main module.
if __name__ == "__main__":
    main()


Processing: 100%|██████████| 24/24 [05:44<00:00, 14.35s/it]

✅ Done. Results saved in: /content/results.xlsx



