In [None]:
!pip install python-docx pdfminer.six pytesseract pillow pdfplumber
!pip install keybert sentence-transformers transformers spacy
!python -m spacy download en_core_web_sm


Collecting python-docx
  Obtaining dependency information for python-docx from https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl.metadata
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six
  Obtaining dependency information for pdfminer.six from https://files.pythonhosted.org/packages/73/16/7a432c0101fa87457e75cb12c879e1749c5870a786525e2e0f42871d6462/pdfminer_six-20250506-py3-none-any.whl.metadata
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pytesseract
  Obtaining dependency information for pytesseract from https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl.metadata
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfplumber
  Obtaining dependency information for pdfplumber from https://files.pythonh

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2023.4.0 requires fsspec==2023.4.0, but you have fsspec 2025.5.1 which is incompatible.


In [2]:
import os
import pdfplumber
import pytesseract
from PIL import Image
import docx
import spacy
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM


In [5]:
def extract_text_pdf_with_ocr(path):
    """Extract text from a PDF, using OCR for pages without usable text.

    Each page is first read with pdfplumber's ``extract_text``. If that yields
    nothing (``None``, an empty string, or only whitespace), the page is
    rendered to an image at 300 DPI and passed to pytesseract instead.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, each page followed
        by a newline.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Whitespace-only output carries no content, so treat it like a
            # missing text layer and fall back to OCR as well.
            if not page_text or not page_text.strip():
                page_image = page.to_image(resolution=300)
                page_text = pytesseract.image_to_string(page_image.original)
            page_texts.append(page_text + "\n")
    # Join once at the end instead of growing a string page by page.
    return "".join(page_texts)

def extract_text_docx(path):
    """Return the text of every paragraph in a .docx file, one paragraph per line.

    Only ``doc.paragraphs`` is read; nothing else in the document is consulted.
    """
    document = docx.Document(path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)

def extract_text_txt(path):
    """Read a plain-text file and return its full contents as a string.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    dropped so it does not end up as an invisible first character of the text.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" and strips the BOM when there is one.
    with open(path, 'r', encoding='utf-8-sig') as file:
        return file.read()

def extract_text_image(path):
    """Run OCR on an image file and return the recognised text.

    Args:
        path: Location of the image, passed to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Open inside a context manager so the underlying file handle is released
    # as soon as OCR finishes, even if pytesseract raises.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)

def extract_text(path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. Supported extensions are
    .pdf, .docx, .txt, .jpg, .jpeg and .png.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = os.path.splitext(path)[1].lower()
    # Extension -> extractor lookup; the three image types share one handler.
    extractors = {
        '.pdf': extract_text_pdf_with_ocr,
        '.docx': extract_text_docx,
        '.txt': extract_text_txt,
        '.jpg': extract_text_image,
        '.jpeg': extract_text_image,
        '.png': extract_text_image,
    }
    extractor = extractors.get(ext)
    if extractor is None:
        raise ValueError(f'Unsupported file format: {ext}')
    return extractor(path)


In [9]:
# spaCy pipeline used for named-entity extraction
nlp = spacy.load("en_core_web_sm")

# SentenceTransformer for semantic similarity
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# KeyBERT for keyphrases; reuse the embedding model loaded above rather than
# loading a second copy of the same 'all-MiniLM-L6-v2' weights
kw_model = KeyBERT(model=semantic_model)

# HuggingFace summarizer (BART large CNN)
summarizer_model = pipeline("summarization", model="facebook/bart-large-cnn")

# Free seq2seq LLM from HuggingFace (FLAN-T5 large)
llm_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
llm_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# LLM pipeline
llm_pipeline = pipeline("text2text-generation", model=llm_model, tokenizer=llm_tokenizer, max_length=1024)


Device set to use mps:0


model.safetensors:  58%|#####8    | 1.82G/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use mps:0


In [11]:
def extract_entities(text):
    """Return the named entities found in ``text`` as ``(entity_text, label)`` tuples.

    The text is processed with the module-level ``nlp`` pipeline and the
    entities are listed in the order ``doc.ents`` yields them.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities


In [13]:
def extract_keyphrases(text, top_n=10):
    """Return up to ``top_n`` keyphrases for ``text``, without their scores.

    Delegates to ``kw_model.extract_keywords`` with 1- to 3-word phrases and
    English stop words, then keeps only the phrase from each
    ``(phrase, score)`` pair, preserving the order the model returned.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=top_n,
    )
    phrases = []
    for phrase, _score in scored_phrases:
        phrases.append(phrase)
    return phrases


In [15]:
def extract_meaningful_sections_llm(text):
    """Ask the LLM pipeline for the most meaningful sections of ``text``.

    The whole document is embedded in a single instruction prompt and sent to
    ``llm_pipeline`` in one call; the first result's generated text is returned.
    """
    # NOTE(review): the prompt length is not bounded here — confirm the model
    # copes with long documents.
    instruction = "Extract the most meaningful sections (main insights, important points) from the following document:"
    prompt = f"{instruction}\n\n{text}"
    outputs = llm_pipeline(prompt)
    first_output = outputs[0]
    return first_output['generated_text']


In [17]:
def generate_summary(text, max_length=130):
    """Summarise ``text`` with the module-level summarisation pipeline.

    Args:
        text: Document text to summarise.
        max_length: Upper bound on the summary length, forwarded to the pipeline.

    Returns:
        ``text`` unchanged when it is shorter than 100 characters, otherwise
        the ``summary_text`` of the pipeline's first result.
    """
    # Very short inputs are returned as-is rather than summarised.
    if len(text) < 100:
        return text
    summary = summarizer_model(
        text,
        max_length=max_length,
        min_length=30,
        do_sample=False,
        # Clip over-long documents to the model's input limit instead of
        # sending it more tokens than it can handle.
        truncation=True,
    )
    return summary[0]['summary_text']


In [19]:
def generate_metadata(text):
    """Build the metadata dictionary for a document's text.

    Returns:
        A dict with these keys:
            "title": the first non-blank line of ``text``, stripped of
                surrounding whitespace and cut to 100 characters ("" when the
                text has no non-blank line).
            "keyphrases": result of ``extract_keyphrases(text)``.
            "meaningful_sections": result of ``extract_meaningful_sections_llm(text)``.
            "summary": result of ``generate_summary(text)``.
            "entities": result of ``extract_entities(text)``.
    """
    # Skip leading blank lines so they do not produce an empty title;
    # stripping also drops a trailing "\r" from Windows line endings.
    stripped_lines = (line.strip() for line in text.splitlines())
    title = next((line for line in stripped_lines if line), "")[:100]
    return {
        "title": title,
        "keyphrases": extract_keyphrases(text),
        "meaningful_sections": extract_meaningful_sections_llm(text),
        "summary": generate_summary(text),
        "entities": extract_entities(text)
    }


In [21]:
!pip install streamlit

