In [None]:
import pytesseract
from pdf2image import convert_from_path
from pytesseract import Output
from google.colab import files
import pandas as pd
import numpy as np
import regex as re
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt sentence model. NLTK 3.8.2+ loads it from the
# "punkt_tab" resource instead of the pickled "punkt" one, so request both to
# work on old and new NLTK releases alike.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource)

def extract_financial_numbers(text):
    """Return every numeric amount found in *text* as a list of floats.

    A token is an optional ``€``/``£`` sign, optional whitespace, then a digit
    followed by any run of digits, commas and dots.  Currency signs and
    thousands separators are stripped before conversion.  Tokens that still
    do not parse as a float (e.g. ``1.2.3``) are skipped.

    Args:
        text: The string to scan.

    Returns:
        The parsed values, in the order they appear in *text*.
    """
    amounts = []
    # The currency prefix must be a NON-capturing group: with a capturing
    # group, findall returns only the group's content ("€", "£" or "") and
    # the digits themselves are thrown away.
    for token in re.findall(r"(?:€|£)?\s*\d[\d,\.]*", text):
        # Keep digits and dots only; this also removes thousands commas.
        digits = re.sub(r"[^\d\.]", "", token)
        try:
            amounts.append(float(digits))
        except ValueError:
            # More than one dot (e.g. a version or section number): not an amount.
            continue
    return amounts

# Matches a standalone four-digit token starting with "20" (2000-2099).
year_re = re.compile(r"\b(20\d{2})\b")


def extract_years(text):
    """Return all standalone 20xx years in *text*, in order, as strings."""
    found_years = year_re.findall(text)
    return found_years

def extract_dates(text):
    """Return every ``<day> <Month> <year>`` date found in *text*.

    The day is one or two digits, the month is a full English month name
    (matched case-insensitively) and the year is four digits.

    Args:
        text: The string to scan.

    Returns:
        The full matched date strings (e.g. ``"31 December 2022"``), in order
        of appearance.
    """
    # The month alternation is non-capturing so that findall yields the whole
    # date; a capturing group would make it return only the month name.
    return re.findall(
        r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b",
        text,
        flags=re.IGNORECASE,
    )

def extract_names(text):
    """Return runs of one to three capitalised words found in *text*.

    A candidate whose lower-cased form is exactly "company", "limited" or
    "report" is discarded; all other candidates are returned in order.
    """
    excluded = ["company", "limited", "report"]
    kept = []
    for candidate in re.findall(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+){0,2}\b", text):
        if candidate.lower() in excluded:
            continue
        kept.append(candidate)
    return kept

# Section name -> substrings that mark a piece of text as belonging to it.
# Order matters: classify_section returns the first section with a hit.
SECTION_KEYWORDS = {
    "directors_report": ["director", "director's report", "board"],
    "results": ["profit", "loss", "turnover", "tax"],
    "assets": ["assets", "tangible", "stocks", "debtors"],
    "liabilities": ["liabilities", "creditors"],
    "equity": ["equity", "share capital", "reserves"],
    "notes": ["notes"],
    "accounting_records": ["accounting records"],
    "general": [],
}


def classify_section(text):
    """Return the first section whose keyword occurs in *text*.

    Matching is a case-insensitive substring test, checked in the order the
    sections are declared in SECTION_KEYWORDS.  "general" is returned when no
    keyword is found.
    """
    lowered = text.lower()
    hits = (
        section
        for section, keywords in SECTION_KEYWORDS.items()
        if any(keyword in lowered for keyword in keywords)
    )
    return next(hits, "general")

# ---------- UPLOAD & OCR ----------
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; a PDF is required.")
pdf_path = next(iter(uploaded))  # first uploaded file name

pages = convert_from_path(pdf_path, dpi=300)
# One OCR pass per page; each page's text is followed by a newline.
all_text = "".join(pytesseract.image_to_string(page) + "\n" for page in pages)

sentences = sent_tokenize(all_text)

# ---------- NARRATIVE SHEET: one row per sentence ----------
narrative_rows = []
for sentence in sentences:
    s_clean = sentence.strip()
    vals = extract_financial_numbers(s_clean)
    narrative_rows.append({
        "section": classify_section(s_clean),
        "text": s_clean,
        # First two numbers in the sentence; "" marks "not present".
        "value_current": vals[0] if len(vals) > 0 else "",
        "value_prior": vals[1] if len(vals) > 1 else "",
        "names": ", ".join(extract_names(s_clean)),
        "dates": ", ".join(extract_dates(s_clean)),
    })

df_narrative = pd.DataFrame(narrative_rows)

# ---------- TABLES SHEET: one row per OCR text line ----------
structured_rows = []
for page in pages:
    words = pytesseract.image_to_data(page, output_type=Output.DATAFRAME).dropna(subset=["text"])
    # Force tokens to str so " ".join cannot fail if the text column was
    # parsed as numeric, and drop whitespace-only tokens, which would
    # otherwise produce empty rows.
    words = words.assign(text=words["text"].astype(str))
    words = words[words["text"].str.strip() != ""]
    # Rebuild each printed line from its words.
    lines = words.groupby(["block_num", "par_num", "line_num"])["text"].apply(
        lambda tokens: " ".join(tokens)
    )
    for line in lines:
        nums = extract_financial_numbers(line)
        yrs = extract_years(line)
        # Every line gets a row, including lines without any number.
        entry = {
            "section": classify_section(line),
            # Label = the line up to the first whitespace-then-digit.
            "line_item": re.sub(r"\s+\d.*", "", line).strip(),
            "value_current": nums[0] if len(nums) > 0 else "",
            "value_prior": nums[1] if len(nums) > 1 else "",
        }
        # NOTE(review): years and numbers are paired purely by position (i-th
        # year -> i-th number), and a year printed on the line is itself one
        # of the extracted numbers -- confirm this mapping is what is wanted.
        for i, y in enumerate(yrs):
            entry[f"year_{y}"] = nums[i] if i < len(nums) else ""
        structured_rows.append(entry)

# Year columns exist only on some rows; blank out the gaps.
df_structured = pd.DataFrame(structured_rows).fillna("")

# ---------- AUTO SUMMARY ----------
summary = {}

def grab(label):
    """Return the first numeric ``value_current`` whose line item contains *label*.

    Args:
        label: Text to look for in the ``line_item`` column of
            ``df_structured``; matched literally and case-insensitively.

    Returns:
        The value as a float, or None when ``df_structured`` is empty, has no
        ``line_item`` column, has no matching row, or no matching row carries
        a numeric value.
    """
    # Ensure df_structured is not empty before attempting to access columns
    if df_structured.empty or "line_item" not in df_structured.columns:
        return None
    hits = df_structured[
        df_structured["line_item"].str.contains(label, case=False, na=False, regex=False)
    ]
    if hits.empty:
        return None
    # Rows without a number hold "" in value_current (e.g. a heading such as
    # "Profit and loss account"); skip those so a later row with the actual
    # figure is used instead of the blank placeholder.
    numeric = pd.to_numeric(hits["value_current"], errors="coerce").dropna()
    if numeric.empty:
        return None
    return float(numeric.iloc[0])

# Summary metric -> text searched for in the extracted line items.
SUMMARY_LABELS = {
    "profit": "profit",
    "turnover": "turnover",
    "tangible_assets": "tangible",
    "stocks": "stocks",
    "debtors": "debtors",
    "creditors_lt1": "falling due within",
    "creditors_gt1": "after more than",
    "net_assets": "net assets",
    "total_assets": "total assets",
}
for metric, label in SUMMARY_LABELS.items():
    summary[metric] = grab(label)

print("\n========== AUTO-GENERATED SUMMARY ==========\n")

for k, v in summary.items():
    # A metric may be None (nothing found) or a blank placeholder string;
    # neither can be formatted with ",.2f", so only print real numbers.
    try:
        amount = float(v)
    except (TypeError, ValueError):
        continue
    print(f"{k.replace('_',' ').title()}: €{amount:,.2f}")

print("\n(All values extracted automatically.)\n")

# ---------- CREATE EXCEL ----------
with pd.ExcelWriter("financial_output.xlsx") as writer:
    df_narrative.to_excel(writer, sheet_name="Narrative", index=False)
    df_structured.to_excel(writer, sheet_name="Tables", index=False)
    # list(...) gives DataFrame a plain sequence of (metric, value) pairs.
    pd.DataFrame(list(summary.items()), columns=["metric", "value"]).to_excel(
        writer, sheet_name="Summary", index=False
    )

files.download("financial_output.xlsx")

print("\nExcel file generated: financial_output.xlsx\n")

'apt' is not recognized as an internal or external command,
operable program or batch file.

[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: C:\Users\oisin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'google'