In [1]:
!pip install spacy pytesseract pdf2image python-docx PyMuPDF tika --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Required Libraries

In [1]:
import fitz
import pytesseract
from pdf2image import convert_from_path
from docx import Document
from collections import Counter
import spacy
import json
import requests
from google.colab import files
import os
import time


# Extracts text from uploaded documents (PDF, DOCX, or TXT)

In [2]:
def extract_text(file_path):
    ext = file_path.split('.')[-1].lower()
    if ext == 'pdf':
        doc = fitz.open(file_path)
        text = ''.join([page.get_text() for page in doc])
        if not text.strip():
            images = convert_from_path(file_path)
            text = "\n".join([pytesseract.image_to_string(img) for img in images])
        return text
    elif ext == 'docx':
        return "\n".join([p.text for p in Document(file_path).paragraphs])
    elif ext == 'txt':
        return open(file_path, 'r', encoding='utf-8').read()
    else:
        return ""

# Extract Metadata using spaCy (Keywords, Entities, Dates)

In [3]:
def extract_spacy_metadata(text):
    doc = nlp(text)

    # Keywords
    words = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
    keywords = [word for word, _ in Counter(words).most_common(7)]

    # Named Entities
    people = list(set([ent.text for ent in doc.ents if ent.label_ == "PERSON"]))
    orgs = list(set([ent.text for ent in doc.ents if ent.label_ == "ORG"]))
    dates = list(set([ent.text for ent in doc.ents if ent.label_ == "DATE"]))

    return {
        "keywords": keywords,
        "people": people,
        "organizations": orgs,
        "dates": dates
    }


#LLM-Based Summarization (Groq LLaMA 3)

In [6]:
GROQ_API_KEY = "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"

In [4]:
def generate_summary_with_groq(text):
    prompt = f"""
You are a summarization assistant.
From the document below, extract only the summary.
The summary should be concise and no longer than 4 lines.

Document:
{text[:3000]}
"""

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful summarization assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,
        "max_tokens": 256
    }

    response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data)

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"].strip()
    else:
        return "Summary generation failed."

# Output

In [8]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Upload File
uploaded = files.upload()
file_path = list(uploaded.keys())[0]

# Extract Text
text = extract_text(file_path)

# File Info
file_name = os.path.basename(file_path)
creation_time = time.ctime(os.path.getctime(file_path))

# Generate Metadata
summary = generate_summary_with_groq(text)
spacy_data = extract_spacy_metadata(text)

metadata = {
    "file_name": file_name,
    "creation_date": creation_time,
    "summary": summary,
    **spacy_data
}

output_json_path = file_name.rsplit(".", 1)[0] + "_metadata.json"

with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

# Display Metadata
print("\n" + "=" * 80)
print("📄 Document Metadata")
print("=" * 80)
print(json.dumps(metadata, indent=2))


Saving Nipah Virus, pdf.pdf to Nipah Virus, pdf (2).pdf

📄 Document Metadata
{
  "file_name": "Nipah Virus, pdf (2).pdf",
  "creation_date": "Tue Jun 24 18:37:53 2025",
  "summary": "Here is a concise summary of the document in 4 lines:\n\nNipah virus is a bat-borne virus that causes high mortality rate (40-75%) in humans and animals. It is a biosafety level-4 pathogen and belongs to the genus Henipavirus. Fruit bats are the natural hosts of the virus, and transmission occurs through intermediate hosts such as horses. A vaccine, Oxford/AstraZeneca, has been developed to combat the virus.",
  "keywords": [
    "virus",
    "nipah",
    "dna",
    "outbreak",
    "shell",
    "bat",
    "infection"
  ],
  "people": [
    "NiV",
    "Transmission Reservoirs\nGlobal"
  ],
  "organizations": [
    "RNA",
    "Oxford/AstraZeneca",
    "\u2022 Fruit",
    "\u2022 Virus",
    "\u2022 Spike",
    "Nipah",
    "\u2022 Synthetic",
    "\u2022 Nipah"
  ],
  "dates": [
    "2007",
    "2023",
    "