In [1]:
!pip install pdfminer.six transformers langchain openai faiss-cpu sentence-transformers tiktoken


Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0

In [2]:
import os
from pdfminer.high_level import extract_text
from transformers import pipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.docstore.document import Document
import random


In [3]:
!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain-community)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain-community)
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses

In [1]:
!pip install -U langchain




In [3]:
from google.colab import files

def upload_document():
    uploaded = files.upload()
    file_path = next(iter(uploaded))
    return file_path

def extract_text_from_file(file_path):
    if file_path.endswith('.pdf'):
        return extract_text(file_path)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        return ""


In [4]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text):
    chunks = [text[i:i+1024] for i in range(0, len(text), 1024)]
    summaries = summarizer(chunks, max_length=150, min_length=30, do_sample=False)
    return " ".join([summary['summary_text'] for summary in summaries])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
def create_vector_store(text):
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.create_documents([text])
    embeddings = HuggingFaceEmbeddings()
    vector_db = FAISS.from_documents(docs, embeddings)
    return vector_db

def answer_question(vector_db, query):
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)
    llm = OpenAI(temperature=0)
    chain = load_qa_chain(llm, chain_type="stuff")
    answer = chain.run(input_documents=docs, question=query)
    sources = [doc.page_content[:300] for doc in docs[:1]]
    return answer, sources


In [6]:
def generate_logic_questions(text):
    sentences = [s.strip() for s in text.split('.') if len(s.split()) > 6]
    selected = random.sample(sentences, min(3, len(sentences)))
    qna = []
    for sentence in selected:
        qna.append({
            "question": f"What is the implication of this sentence: \"{sentence}\"?",
            "answer": sentence
        })
    return qna


In [8]:
# Upload file
file_path = upload_document()

# Extract and display
raw_text = extract_text_from_file(file_path)
print("\n✅ File content extracted.\n")

# Auto Summary
print("📌 Document Summary:\n")
print(summarize_text(raw_text))

# Create vector store
vector_db = create_vector_store(raw_text)

# Mode selection
mode = input("\nChoose mode (1 - Ask Anything, 2 - Challenge Me): ")

if mode.strip() == "1":
    while True:
        query = input("\nAsk your question (or type 'exit'): ")
        if query.lower() == "exit":
            break
        answer, sources = answer_question(vector_db, query)
        print("\n🧠 Answer:", answer)
        print("📖 Justification:", sources[0])
elif mode.strip() == "2":
    qna_pairs = generate_logic_questions(raw_text)
    for i, pair in enumerate(qna_pairs):
        print(f"\nQ{i+1}: {pair['question']}")
        user_ans = input("Your Answer: ")
        print("✅ Reference Answer:", pair['answer'])
else:
    print("Invalid mode selected.")


Saving sample_ai_education.txt to sample_ai_education (1).txt

✅ File content extracted.

📌 Document Summary:



Your max_length is set to 150, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


This paper explores the applications, benefits, challenges, and future prospects of AI in modern education. AI enables personalized learning by analyzing student performance and adapting content accordingly. It also automates administrative tasks, such as attendance and evaluation. Data privacy, high implementation costs, and lack of trained staff are major hurdles in AI adoption in schools. The integration of AI in education holds great promise but requires thoughtful implementation.


  embeddings = HuggingFaceEmbeddings()



Choose mode (1 - Ask Anything, 2 - Challenge Me): 2

Q1: What is the implication of this sentence: "Applications
AI enables personalized learning by analyzing student performance and adapting content accordingly"?
Your Answer: AI can create a custom learning experience for each student based on their performance, helping them learn more effectively.
✅ Reference Answer: Applications
AI enables personalized learning by analyzing student performance and adapting content accordingly

Q2: What is the implication of this sentence: "Title: The Impact of Artificial Intelligence on Modern Education

Abstract:
Artificial Intelligence (AI) has emerged as a powerful tool in transforming education"?
Your Answer: AI is significantly changing traditional education methods by introducing smarter, tech-driven solutions that improve learning outcomes.
✅ Reference Answer: Title: The Impact of Artificial Intelligence on Modern Education

Abstract:
Artificial Intelligence (AI) has emerged as a powerful to