In [None]:
!pip install python-docx spacy pandas tqdm

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [None]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.6.0-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.0


In [None]:
# Install the Ollama binary by downloading the vendor's install script and piping it
# straight into `sh` (the script is fetched unpinned and executed as-is).
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
import socket
import subprocess
import time

# Launch the Ollama server as a background process. The handle is kept so the
# server can later be inspected or stopped (e.g. `ollama_server.terminate()`).
ollama_server = subprocess.Popen(
    ["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
)

# Popen returns immediately, before the server is listening. Wait until the API
# port accepts connections so the following cells do not race the startup.
for _ in range(30):
    try:
        with socket.create_connection(("127.0.0.1", 11434), timeout=1):
            break
    except OSError:
        # Port not reachable yet: give up early if the process has already died.
        if ollama_server.poll() is not None:
            raise RuntimeError(
                f"'ollama serve' exited early with code {ollama_server.returncode}"
            ) from None
        time.sleep(1)
else:
    raise RuntimeError(
        "Ollama API did not become reachable on 127.0.0.1:11434 within 30 seconds"
    )

ollama_server

<Popen: returncode: None args: ['ollama', 'serve']>

In [None]:
# Fetch the model through the Ollama CLI. The tag must stay in sync with the MODEL
# constant used by the extraction code.
!ollama pull gpt-oss:20b

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A

In [None]:
import os
import zipfile

zip_file_path = "/content/10k_Sample_text.zip"
extract_dir = "/content"

os.makedirs(extract_dir, exist_ok=True)

# Extract member by member so a single bad entry does not abort the whole archive,
# and remember which entries failed so the final status line is truthful.
failed_members = []
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    members = zip_ref.infolist()
    for member in members:
        try:
            zip_ref.extract(member, extract_dir)
        except Exception as e:
            failed_members.append(member.filename)
            print(f"⚠️ Failed to extract {member.filename}: {e}")

if failed_members:
    print(
        f"⚠️ '{zip_file_path}' extracted to '{extract_dir}' with "
        f"{len(failed_members)} of {len(members)} entries failed"
    )
else:
    print(f"✅ '{zip_file_path}' extracted successfully to '{extract_dir}'")

✅ '/content/10k_Sample_text.zip' extracted successfully to '/content'


In [None]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm
import ollama

# ===== CONFIGURATION =====
# Directory that is scanned for the .txt inspection reports to process.
MY_TEXT_FOLDER = r"/content/CollectedFiles_text1"
# CSV file that receives one result row per processed text file.
RESULTS_FILE = r"/content/results_single_gpu.csv"
# Ollama model tag passed to ollama.chat for every request.
MODEL = "gpt-oss:20b"


# ===== UTILITIES =====
def load_text_file(file_path):
    """Read a UTF-8 text file and return its contents without surrounding whitespace.

    Any failure while opening or decoding the file is reported on stdout and
    results in an empty string, so callers can treat "" as "nothing to process".
    """
    text = ""
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            text = handle.read().strip()
    except Exception as err:
        print(f"⚠️ Skipping invalid file: {file_path} ({err})")
    return text


def clean_location(location):
    """Reduce a raw location string to a bare place name.

    Steps:
      1. A falsy input (None, "") yields "Unknown".
      2. If the text contains commas, only the last non-empty comma-separated
         part is kept (e.g. "12 Main Road, Hosur" -> "Hosur").
      3. Everything from the first address-like keyword (road, street, ward,
         building, ...) to the end of the string is removed.

    Returns the cleaned name, or "Unknown" when nothing is left.
    """
    if not location:
        return "Unknown"
    if "," in location:
        # Use the last *non-empty* part: a trailing comma ("Hosur,") would
        # otherwise select an empty string and discard the place name.
        parts = [part.strip() for part in location.split(",")]
        non_empty_parts = [part for part in parts if part]
        location = non_empty_parts[-1] if non_empty_parts else ""
    # Drop the keyword and everything after it (the pattern ends in ".*").
    location = re.sub(
        r"\b(highway|road|street|main|lane|avenue|cross|extn|extension|building|ward|bridge)\b.*",
        "",
        location,
        flags=re.IGNORECASE,
    ).strip()
    return location if location else "Unknown"


def get_prompt():
    """Build the system prompt that instructs the model to return state/location as JSON."""
    prompt_lines = (
        "You are an expert IR analyst.",
        "",
        "Given an Inspection Report (IR), extract:",
        "",
        "1. **location** → Most specific identifiable official place of the audited office (town, city, taluka, or locality).",
        "   - Include small/local areas if mentioned (e.g., Palladam, Hosur, Ambattur).",
        "   - Do NOT include wards, buildings, bridges, or landmarks.",
        "   - If not explicitly stated, infer the most likely location based on context or known departmental presence.",
        "",
        "2. **state** → Full Indian state or union territory name (e.g., Tamil Nadu, Delhi).",
        "   - Do NOT abbreviate or add extra text.",
        "",
        "Output strictly in JSON (no explanations, no markdown):",
        "{",
        '  "state": "Full state name",',
        '  "location": "Place name only"',
        "}",
        "",
        "Rules:",
        "- Always include both fields.",
        "- Avoid addresses, landmarks, or abbreviations.",
        "- Return only the JSON.",
    )
    # Lines are joined without a trailing newline, matching a single text block.
    return "\n".join(prompt_lines)


def extract_json(content):
    """Return the first JSON object embedded in a model response, or None.

    The response may wrap the object in extra text (explanations, markdown
    fences, stray braces). Each "{" is tried as a possible start of an object
    and the first one that decodes to a dict is returned.

    Args:
        content: Raw response text; may be empty or None.

    Returns:
        The decoded dict, or None when no JSON object can be decoded.
    """
    if not content:
        return None

    decoder = json.JSONDecoder()
    start = content.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value beginning at `start`
            # and ignores whatever follows it, so trailing text (including
            # unrelated "}" characters) cannot break the parse.
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = content.find("{", start + 1)
    return None


def analyze_ir_content(doc_text):
    """Ask the Ollama model for the state and location of one report.

    Args:
        doc_text: Full text of the report. Empty text is not sent to the model.

    Returns:
        A ``(state, location)`` tuple of strings:
        - ``("Unknown", "Unknown")`` for empty input or when the reply contains
          no usable JSON object;
        - ``("Error", "Error")`` when the model call itself fails;
        - otherwise the extracted values, with a missing, empty or non-string
          field reported as ``"Unknown"``.
    """
    if not doc_text:
        return "Unknown", "Unknown"

    try:
        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": get_prompt()},
                {"role": "user", "content": doc_text},
            ],
        )
        # `or ""` covers a reply whose content is present but None.
        content = (response.get("message", {}).get("content", "") or "").strip()
    except Exception as e:
        print(f"⚠️ Ollama error: {e}")
        return "Error", "Error"

    parsed = extract_json(content)
    if not parsed or not isinstance(parsed, dict):
        print("⚠️ No valid JSON found in response.")
        return "Unknown", "Unknown"

    # The model may emit null or a non-string for a field; calling .strip()
    # on those would raise and abort the whole batch, so treat them as missing.
    raw_state = parsed.get("state")
    raw_location = parsed.get("location")
    state = raw_state.strip() if isinstance(raw_state, str) else ""
    location = raw_location.strip() if isinstance(raw_location, str) else ""

    return state or "Unknown", clean_location(location)


def save_results(rows, results_file):
    """Write result rows to a CSV file, appending when the file already exists.

    A new file is created with a header row; an existing file receives the
    rows only, without repeating the header.
    """
    file_exists = os.path.exists(results_file)
    pd.DataFrame(rows).to_csv(
        results_file,
        mode="a" if file_exists else "w",
        header=not file_exists,
        index=False,
    )


# ===== MAIN PROCESS =====
def main_single_gpu():
    """Run the extraction over every .txt file in MY_TEXT_FOLDER.

    Files are processed in sorted order and each result is written to
    RESULTS_FILE immediately, so an interrupted run keeps the rows produced so
    far. Files that already have a row for this folder in RESULTS_FILE are
    skipped, which lets a re-run resume instead of appending duplicate rows.
    Delete RESULTS_FILE to force every file to be processed again.
    """
    files = sorted(
        f for f in os.listdir(MY_TEXT_FOLDER)
        # "~$" prefixed names are skipped along with non-.txt files.
        if f.lower().endswith(".txt") and not f.startswith("~$")
    )
    folder_name = os.path.basename(MY_TEXT_FOLDER)

    already_done = set()
    if os.path.exists(RESULTS_FILE) and os.path.getsize(RESULTS_FILE) > 0:
        try:
            recorded = pd.read_csv(
                RESULTS_FILE,
                usecols=["Folder Name", "Filename"],
                dtype=str,
                keep_default_na=False,  # keep names such as "NA.txt" or "" as text
            )
            same_folder = recorded[recorded["Folder Name"] == folder_name]
            already_done = set(same_folder["Filename"])
        except ValueError:
            # Unreadable file or unexpected columns: do not skip anything and
            # simply append, as a plain run would.
            already_done = set()
    else:
        # Missing or zero-length file: start it with the header row.
        pd.DataFrame(columns=["Folder Name", "Filename", "State", "Location"]).to_csv(RESULTS_FILE, index=False)

    pending = [f for f in files if f not in already_done]
    if len(pending) < len(files):
        print(f"ℹ️ Skipping {len(files) - len(pending)} file(s) already recorded in {RESULTS_FILE}")

    for fname in tqdm(pending, desc="Processing text files"):
        fpath = os.path.join(MY_TEXT_FOLDER, fname)
        doc_text = load_text_file(fpath)

        state, location = analyze_ir_content(doc_text)

        row = {
            "Folder Name": folder_name,
            "Filename": fname,
            "State": state,
            "Location": location,
        }

        # Persist row by row so progress survives an interrupted session.
        save_results([row], RESULTS_FILE)

    print(f"\n✅ All files processed successfully. Results saved in {RESULTS_FILE}")


# ===== RUN =====
# Start the batch run only when this code executes as the top-level script/cell,
# not when it is imported as a module.
if __name__ == "__main__":
    main_single_gpu()


Processing text files:  29%|██▊       | 10/35 [01:52<03:03,  7.32s/it]

⚠️ No valid JSON found.


Processing text files:  49%|████▊     | 17/35 [02:43<02:56,  9.82s/it]

⚠️ No valid JSON found.


Processing text files: 100%|██████████| 35/35 [04:17<00:00,  7.35s/it]

✅ Processing complete. Results saved in /content/results_state_location1.csv



