In [26]:
import os
import fitz
import json
import re
import tempfile
from io import BytesIO, StringIO
from PIL import Image
import docx2txt
from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import spacy
from google.cloud import vision
import ipywidgets as widgets
from IPython.display import display

# Set your Google Cloud Vision credentials path
# NOTE(review): this is a relative path, presumably resolved against the
# kernel's working directory. The assignment overwrites any value that was
# already present in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "vision-key.json"

# Load NLP and Vision models
# The client is built after the environment variable is set (presumably so it
# can locate the key file). Both objects are module-level globals used by the
# OCR and named-entity helpers defined in later cells.
vision_client = vision.ImageAnnotatorClient()
nlp = spacy.load("en_core_web_sm")


In [27]:
def preprocess_pixmap(pixmap, threshold=140):
    """Binarise a rendered page image and return it as PNG bytes.

    Args:
        pixmap: Object exposing ``width``, ``height``, ``samples`` and
            ``alpha`` (the PyMuPDF pixmap produced by ``page.get_pixmap``).
        threshold: Grey level (0-255) below which a pixel becomes black;
            every other pixel becomes white. Defaults to 140.

    Returns:
        bytes: The 1-bit black-and-white image encoded as PNG.
    """
    # A pixmap rendered with transparency carries four bytes per pixel;
    # telling PIL "RGB" for such a buffer makes frombytes() fail.
    mode = "RGBA" if pixmap.alpha else "RGB"
    img = Image.frombytes(mode, (pixmap.width, pixmap.height), pixmap.samples)

    # The grayscale conversion discards any alpha channel.
    gray = img.convert("L")
    black_white = gray.point(lambda level: 0 if level < threshold else 255, '1')

    buffer = BytesIO()
    black_white.save(buffer, format="PNG")
    return buffer.getvalue()

def google_ocr_from_pixmap(pixmap):
    """Run Google Cloud Vision OCR on a page pixmap and return the text.

    The pixmap is binarised with ``preprocess_pixmap`` and sent to
    ``document_text_detection`` with English and Hindi language hints. When
    that yields no text, ``text_detection`` is tried with the same image and
    hints. Only the response that is finally kept is checked for an API error.

    Args:
        pixmap: Page image accepted by ``preprocess_pixmap``.

    Returns:
        str: The stripped full-text annotation, or ``""`` when none was found.

    Raises:
        Exception: If the kept response carries an error message.
    """
    png_bytes = preprocess_pixmap(pixmap)
    request = {
        "image": vision.Image(content=png_bytes),
        "image_context": vision.ImageContext(language_hints=["en", "hi"]),
    }

    result = vision_client.document_text_detection(**request)
    if not result.full_text_annotation.text:
        # Nothing recognised by the document model: retry with the plain one.
        result = vision_client.text_detection(**request)

    api_error = result.error.message
    if api_error:
        raise Exception(f"Vision API error: {api_error}")

    recognised = result.full_text_annotation.text
    if not recognised:
        return ""
    return recognised.strip()


In [28]:
def extract_text_from_pdf(file_bytes):
    """Extract text from a PDF, falling back to OCR for pages without text.

    Args:
        file_bytes: Binary file-like object positioned at the start of the PDF.

    Returns:
        tuple[str, list[str]]: The concatenated page text and one log entry
        per page stating whether embedded text or OCR was used.
    """
    chunks = []
    log = []
    with fitz.open(stream=file_bytes.read(), filetype="pdf") as doc:
        for page_no, page in enumerate(doc, start=1):
            extracted = page.get_text()
            if extracted.strip():
                chunks.append(extracted)
                log.append(f"Page {page_no}: Text extracted.")
                continue

            # No embedded text layer on this page: rasterise it and OCR it.
            ocr_text = google_ocr_from_pixmap(page.get_pixmap(dpi=300))
            if ocr_text:
                # The OCR helper strips its result, so add the page break
                # back; otherwise the last word of this page fuses with the
                # first word of the next one.
                chunks.append(ocr_text + "\n")
            log.append(f"Page {page_no}: OCR used.")
    return "".join(chunks), log

def extract_text_from_docx(file_bytes):
    """Extract text from a .docx document supplied as a binary stream.

    The bytes are spooled to a temporary ``.docx`` file because
    ``docx2txt.process`` is called with a file path here.

    Args:
        file_bytes: Binary file-like object containing the .docx data.

    Returns:
        tuple[str, list]: The extracted text and an empty log list.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
    try:
        # Close the handle before parsing so the data is fully flushed to
        # disk and the file can be reopened by name on every platform.
        with tmp:
            tmp.write(file_bytes.read())
        return docx2txt.process(tmp.name), []
    finally:
        # delete=False leaves clean-up to us; do not leak the temp file.
        os.unlink(tmp.name)

def extract_text_from_txt(file_bytes):
    """Decode a plain-text upload as UTF-8.

    A leading byte-order mark is dropped, and bytes that are not valid UTF-8
    are replaced with U+FFFD instead of aborting the whole extraction.

    Args:
        file_bytes: Binary file-like object containing the text file.

    Returns:
        tuple[str, list]: The decoded text and an empty log list.
    """
    raw = file_bytes.read()
    return raw.decode("utf-8-sig", errors="replace"), []

def extract_text(file_bytes, file_name):
    """Dispatch to the extractor that matches the file's extension.

    The extension is compared case-insensitively, so ``REPORT.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        file_bytes: Binary file-like object holding the uploaded file.
        file_name: Original file name; only its suffix is inspected.

    Returns:
        tuple[str, list[str]]: Extracted text and an extraction log. For an
        unrecognised extension the text is the literal
        ``"Unsupported file format"`` and the log is empty.
    """
    lowered = file_name.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_bytes)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_bytes)
    if lowered.endswith('.txt'):
        return extract_text_from_txt(file_bytes)
    return "Unsupported file format", []


In [29]:
def generate_metadata(text):
    """Compute size statistics, edge snippets and the language of *text*.

    Args:
        text: The extracted document text.

    Returns:
        dict: ``character_count``, ``word_count`` (whitespace-separated
        tokens), ``line_count``, ``starts_with`` / ``ends_with`` (the first
        and last 50 characters, with an ellipsis on the truncated side when
        the text is longer than 50 characters) and ``language`` (the
        langdetect result, or ``"Unknown"`` when detection fails).
    """
    def safe_lang_detect(txt):
        """Return ``detect(txt)``, or "Unknown" on ``LangDetectException``."""
        try:
            return detect(txt)
        except LangDetectException:
            return "Unknown"

    snippet_len = 50
    if len(text) > snippet_len:
        starts_with = text[:snippet_len] + "..."
        # The cut is at the front of the tail snippet, so the ellipsis
        # belongs before it, not after.
        ends_with = "..." + text[-snippet_len:]
    else:
        starts_with = ends_with = text

    return {
        "character_count": len(text),
        "word_count": len(text.split()),
        "line_count": len(text.splitlines()),
        "starts_with": starts_with,
        "ends_with": ends_with,
        "language": safe_lang_detect(text)
    }

def guess_title(text):
    """Guess a document title from its text.

    The first line of at most ten words that is all upper-case or title-case
    wins. When no line qualifies, the first non-blank line is used, so
    leading empty lines no longer produce an empty title.

    Args:
        text: The extracted document text.

    Returns:
        str: The stripped title line, or ``"Unknown"`` when the text has no
        non-blank line.
    """
    first_nonblank = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        if first_nonblank is None:
            first_nonblank = candidate
        if len(candidate.split()) <= 10 and (candidate.isupper() or candidate.istitle()):
            return candidate
    return first_nonblank if first_nonblank is not None else "Unknown"

def extract_key_sentences(text, count=5):
    """Pick the sentences that contain the most high-frequency words.

    Sentences are the pieces of *text* between full stops that are longer
    than 20 characters after stripping. Each one is scored by how many of its
    word tokens are among the 50 most common words of the whole text.

    Args:
        text: The extracted document text.
        count: Maximum number of sentences to return.

    Returns:
        list[str]: Up to *count* sentences, highest score first; ties keep
        their original order.
    """
    word_pattern = re.compile(r'\w+')
    sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
    frequent = {
        word for word, _ in Counter(word_pattern.findall(text.lower())).most_common(50)
    }

    def score(sentence):
        # Tokenise exactly like the frequency table; with a plain
        # str.split(), a token such as "tax," would never match "tax".
        return sum(token in frequent for token in word_pattern.findall(sentence.lower()))

    return sorted(sentences, key=score, reverse=True)[:count]

def extract_named_entities(text):
    """Return the people, organisations, places, dates and events in *text*.

    The text is run through the module-level spaCy pipeline ``nlp``; only
    entities labelled PERSON, ORG, GPE, DATE or EVENT are kept.

    Args:
        text: The extracted document text.

    Returns:
        list[tuple[str, str]]: ``(entity_text, label)`` pairs in document
        order.
    """
    wanted_labels = ["PERSON", "ORG", "GPE", "DATE", "EVENT"]
    found = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            found.append((entity.text, entity.label_))
    return found


In [30]:
# Single-file upload widget restricted to the three formats that extract_text
# dispatches on; the processing cell reads the selection from `upload.value`.
upload = widgets.FileUpload(accept='.pdf,.docx,.txt', multiple=False)
display(upload)


FileUpload(value=(), accept='.pdf,.docx,.txt', description='Upload')

In [31]:
# Run this cell after uploading a file above
if upload.value:
    # NOTE(review): positional indexing assumes `upload.value` is a tuple of
    # file records (consistent with the `value=()` repr the widget shows).
    file_info = upload.value[0]  # Access the first file in the tuple
    file_bytes = BytesIO(file_info['content'])
    file_name = file_info['name']

    text, log = extract_text(file_bytes, file_name)
    metadata = generate_metadata(text)
    title = guess_title(text)
    key_sentences = extract_key_sentences(text)
    entities = extract_named_entities(text)

    # The emoji below were stored as mojibake (UTF-8 bytes decoded with the
    # wrong codec); they are restored to the intended characters.
    print(f"\n📄 Title: {title}")
    print(f"🗣️ Language: {metadata['language']}")
    print(f"🔠 Characters: {metadata['character_count']}, Words: {metadata['word_count']}, Lines: {metadata['line_count']}")
    print("\n📝 Summary:")
    print(f"This document, titled \"{title}\" and written in {metadata['language']}, contains about {metadata['word_count']} words.\n")

    print("Key Sentences:")
    for s in key_sentences:
        print(f"- {s}")

    if entities:
        print("\nNamed Entities:")
        for ent, label in entities:
            print(f"- {ent} ({label})")
    else:
        print("\nNo named entities found.")

    # Show the per-page extraction log (previously computed but discarded) so
    # the reader can tell which pages went through OCR.
    if log:
        print("\nExtraction log:")
        for entry in log:
            print(f"- {entry}")
else:
    print("📁 Please upload a file in the cell above.")



📄 Title: Acknowledgement Number:572115360251223
🗣️ Language: en
🔠 Characters: 3377, Words: 504, Lines: 182

📝 Summary:
This document, titled "Acknowledgement Number:572115360251223" and written in en, contains about 504 words.

Key Sentences:
- 77
and veriﬁed by 
SURESH KUMAR
 having PAN 
GSBPK1192Q
 on 
25-Dec-2023
 using
paper ITR-Veriﬁcation Form /Electronic Veriﬁcation Code 
74E89W79PI
 generated through 
Aadhaar OTP
mode
System Generated 
Barcode/QR Code
GSBPK1192Q02572115360251223e244f1f5e21054952f9b8c8a1db531aab268541a
 DO NOT SEND THIS ACKNOWLEDGEMENT TO CPC, BENGALURU  
Taxable Income and Tax Details
Accreted Income and Tax Detail
Acknowledgement Number:569105300231223
Date of filing : 23-Dec-2023*
INDIAN INCOME TAX RETURN ACKNOWLEDGEMENT
[Where the data of the Return of Income in Form ITR-1(SAHAJ), ITR-2, ITR-3, ITR-4(SUGAM), ITR-5, ITR-6, ITR-7
filed and verified]
(Please see Rule 12 of the Income-tax Rules, 1962)
Assessment
Year
2023-24
PAN
JIGPD098