In [13]:
!pip install -q transformers torch sentencepiece PyPDF2


In [14]:
import io
from PyPDF2 import PdfReader
from transformers import pipeline


In [15]:
# PDF to text
def pdf_to_text(file_bytes):
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw bytes of a PDF document.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` returns ``None`` or an empty string are skipped.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    # extract_text() can yield None/"" for a page; normalise to "" and drop it below.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page (which would corrupt word-based chunking).
    return "\n".join(page_text for page_text in page_texts if page_text)

# Chunk text
def chunk_text(text, max_words=400, overlap=60):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of trailing words of one chunk that are repeated at
            the start of the next. Must satisfy ``0 <= overlap < max_words``.

    Returns:
        A list of chunks, each a single-space-joined string of words. The
        list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` is not positive or ``overlap`` is outside
            ``[0, max_words)``. Without this check the window would never
            advance and the function would loop forever.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    # Each new chunk starts `step` words after the previous one, so that
    # consecutive chunks share exactly `overlap` words.
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # Stop once a chunk reaches the final word; any further window would
        # only repeat the tail that has already been emitted.
        if start + max_words >= len(words):
            break
    return chunks

# Summarization
def summarize_text(text, summarizer, chunk_size=400, overlap=60):
    """Summarize text by summarizing each chunk, then summarizing the result.

    The text is split with ``chunk_text``; every chunk is summarized on its
    own, the partial summaries are joined with spaces, and the joined text is
    summarized once more.

    Args:
        text: The text to summarize.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False)``
            and expected to return a sequence whose first item has a
            ``"summary_text"`` key.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        The final summary string. If a chunk cannot be summarized, its first
        500 characters are used in its place; if the final pass fails, the
        joined partial summaries are returned. Returns ``""`` when ``text``
        contains no words.
    """
    import logging

    log = logging.getLogger(__name__)

    chunks = chunk_text(text, chunk_size, overlap)
    if not chunks:
        # Nothing to summarize; avoid invoking the model on an empty string.
        return ""

    summaries = []
    for ch in chunks:
        try:
            out = summarizer(ch, max_length=120, min_length=30, do_sample=False)
            summaries.append(out[0]["summary_text"])
        except Exception as exc:
            # Best effort: keep going with a truncated copy of the raw chunk.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit stop a long-running summarization.
            log.warning(
                "Chunk summarization failed (%r); using the first 500 characters of the chunk",
                exc,
            )
            summaries.append(ch[:500])

    combined = " ".join(summaries)
    try:
        final = summarizer(combined, max_length=150, min_length=40, do_sample=False)
        return final[0]["summary_text"]
    except Exception as exc:
        log.warning(
            "Final summarization pass failed (%r); returning the joined chunk summaries",
            exc,
        )
        return combined

# Question answering
def qa_over_chunks(question, text, qa_pipeline, chunk_size=300, overlap=50):
    """Answer a question by querying each chunk and keeping the best answer.

    Args:
        question: The question to ask.
        text: The text to search; it is split with ``chunk_text``.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` and expected to return
            a mapping with ``"score"`` and ``"answer"`` keys.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A dict with keys ``"score"``, ``"answer"`` and ``"context"`` (the
        chunk that produced the answer). When no chunk yields a score above
        0, the result is ``{"score": 0, "answer": "", "context": ""}``.
    """
    import logging

    log = logging.getLogger(__name__)

    chunks = chunk_text(text, chunk_size, overlap)
    # Include "context" up front so the result has the same keys whether or
    # not an answer was found.
    best = {"score": 0, "answer": "", "context": ""}
    for ch in chunks:
        try:
            res = qa_pipeline(question=question, context=ch)
            if res["score"] > best["score"]:
                best = {"score": res["score"], "answer": res["answer"], "context": ch}
        except Exception as exc:
            # Best effort: skip chunks the pipeline cannot handle. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt working.
            log.warning("Question answering failed on a chunk (%r); skipping it", exc)
            continue
    return best


In [16]:
# Build the two Hugging Face pipelines used by the rest of the notebook:
# one for summarization and one for extractive question answering.
print("Loading models... (may take 1-2 minutes first time)")
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)
print("Models loaded!")


Loading models... (may take 1-2 minutes first time)


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


Models loaded!


In [19]:
from google.colab import files

print("Upload a PDF/TXT file or paste text below.")

uploaded_files = files.upload()

# Collect the text of each uploaded file separately, then join with a newline
# so the last word of one file is not fused with the first word of the next.
file_texts = []
for filename, file_bytes in uploaded_files.items():
    if filename.lower().endswith(".pdf"):
        file_texts.append(pdf_to_text(file_bytes))
    else:
        # errors="replace" substitutes undecodable bytes instead of raising
        # UnicodeDecodeError when a text file is not valid UTF-8.
        file_texts.append(file_bytes.decode("utf-8", errors="replace"))
input_text = "\n".join(file_texts)

# Fall back to pasted text when nothing usable was uploaded.
if not input_text.strip():
    input_text = input("Or paste text here:\n")

# Summarize
print("\n=== Generating Summary ===")
summary = summarize_text(input_text, summarizer)
print("------------------------")
print(summary)





Upload a PDF/TXT file or paste text below.


Saving Cloud_computing.pdf to Cloud_computing.pdf

=== Generating Summary ===
------------------------
 Cloud computing is delivery of computing services over the internet ("the cloud") It allows users to access resources and applications on demand, typically on a pay- per-use basis . The main types of cloud services are Infrastructure as a Service (IaaS), Platform as a . Service (PaaS) provides virtualized computing resources, such as virtual machines, storage, and networks . The cloud architecture is divided into 2 parts, i.e. Frontend and Backend .
