In [None]:
# STEP 1: Install required packages
!pip install -q faiss-cpu sentence-transformers transformers langchain pypdf accelerate

# STEP 2: Import Libraries
import os
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import files



In [None]:
# STEP 3: Upload multiple files (.pdf or .txt)
# `uploaded` is iterated below as (file name, raw bytes) pairs.
uploaded = files.upload()
upload_path = "/content/uploads"
os.makedirs(upload_path, exist_ok=True)

# Persist every uploaded payload under the upload directory.
for filename, payload in uploaded.items():
    Path(upload_path, filename).write_bytes(payload)

# STEP 4: Load and parse the documents
# Only the files uploaded in this run are loaded. Globbing the whole upload
# directory would also pick up files left there by earlier runs, which ends up
# indexing stale or duplicate content.
docs = []
for name in uploaded:
    file_path = Path(upload_path) / name
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        loader = PyPDFLoader(str(file_path))
    elif suffix == ".txt":
        loader = TextLoader(str(file_path))
    else:
        # Make the skip visible instead of silently dropping the file.
        print(f"Skipping unsupported file type: {file_path.name}")
        continue
    docs.extend(loader.load())

# Stop here with a clear message rather than failing later when the vector
# store is built from an empty document list.
if not docs:
    raise ValueError(
        "No .pdf or .txt documents were loaded; upload at least one supported file."
    )

# STEP 5: Split the documents into chunks
# Chunking parameters are named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = splitter.split_documents(docs)

# STEP 6: Create embeddings using a free model
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# Build the FAISS index from the chunks using the embedding model above.
vectorstore = FAISS.from_documents(texts, embedding_model)



Saving Motor-vehicle-value-articles-Late-2021.pdf to Motor-vehicle-value-articles-Late-2021.pdf


In [None]:
# STEP 7: Load small, fast Hugging Face LLM (no token required)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Generation settings, gathered in one place so they are easy to adjust.
generation_settings = {
    "max_new_tokens": 1024,
    "temperature": 0.7,
    "do_sample": True,
}

# Create pipeline and expose it to LangChain as an LLM
text_gen_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

# STEP 8: Create Retrieval QA Chain
# The chain also returns the retrieved documents alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
)



Device set to use cpu


In [None]:
import textwrap
from langchain.docstore.document import Document

# STEP 9: Ask a question
query = input("Enter your question: ")
# Pass the text that was just read; the previous name `qaery` was a typo and
# raised NameError.
result = qa(query)

# Format and wrap the answer nicely (e.g., 100 characters per line)
wrapped_answer = textwrap.fill(result["result"].strip(), width=100)

# Print the formatted result
print("\n📘 Answer:\n")
print(wrapped_answer)

# Print document sources (optional). Several retrieved chunks can come from
# the same file, so each file name is listed once, in sorted order.
print("\n📎 Sources:")
source_files = {
    os.path.basename(doc.metadata["source"])
    for doc in result["source_documents"]
    if isinstance(doc, Document) and doc.metadata.get("source")
}
for filename in sorted(source_files):
    print(f"- {filename}")


Enter your question: What are the main topics covered in these documents?

📘 Answer:

used truck pricing in 2022 - and volume positive into 2023 - despite all the current and impending
changes to Fed activity, interest rates, COVID stimulus payments, and other variables. Our
forecasting models incorporate these rosy freight rate outlooks, with a cautious eye on what will
happen to consumer purchasing once these economic changes become reality. Basically, we’re focused
on the impact to demand once interest rates rise and stimulus payments stop.

📎 Sources:
- Motor-vehicle-value-articles-Late-2021.pdf
- Motor-vehicle-value-articles-Late-2021 (1).pdf
- Motor-vehicle-value-articles-Late-2021.pdf
- Motor-vehicle-value-articles-Late-2021 (1).pdf


In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
import textwrap

# Custom strict prompt
# The template has two placeholders: {context} and {question}.
prompt_template = """
You are a helpful assistant that only answers questions based on the provided context.
If the answer cannot be found in the context, respond with "I'm not sure based on the documents."

Context:
{context}

Question:
{question}

Answer only from the context above. If not available, say you don't know.
"""

# Declare the placeholder names explicitly so they match the template text.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Plain top-k similarity retriever (k=5). No relevance threshold or other
# filtering is configured here.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Wrap with Conversational RAG chain; the custom prompt is handed to the
# document-combining step and the source documents are returned with each answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Interactive chat loop: each turn sends the question together with the
# accumulated (question, answer) history to the conversational chain.
chat_history = []
print("🧠 Stricter RAG Chat Ready! Type 'exit' to end the chat.")

while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " still end the chat.
        query = input("\n🧠 You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # An interrupt or a closed input stream ends the chat without a traceback.
        print("\n👋 Chat ended.")
        break

    # Blank input is ignored instead of being sent to the model.
    if not query:
        continue

    if query.lower() in ["exit", "quit", "q"]:
        print("👋 Chat ended.")
        break

    result = qa_chain({"question": query, "chat_history": chat_history})
    answer = result["answer"].strip()

    # Print result nicely
    print("\n🤖 Assistant:\n")
    print(textwrap.fill(answer, width=100))

    # Print sources used: unique file names, sorted
    source_files = set()
    for doc in result["source_documents"]:
        if isinstance(doc, Document) and doc.metadata.get("source"):
            source_files.add(os.path.basename(doc.metadata["source"]))

    if source_files:
        print("\n📎 Sources:")
        for filename in sorted(source_files):
            print(f"- {filename}")

    # Remember this turn so follow-up questions can refer back to it.
    chat_history.append((query, answer))


🧠 Stricter RAG Chat Ready! Type 'exit' to end the chat.

🧠 You: According to the document, what happened when a woman visited Jeff Schrier’s used car lot?

🤖 Assistant:

I'm not sure based on the documents.

📎 Sources:
- Motor-vehicle-value-articles-Late-2021 (1).pdf
- Motor-vehicle-value-articles-Late-2021.pdf

🧠 You: what is the document about?

🤖 Assistant:

J.D. Power Valuation Services (formerly NADA Used Car Guide) J.D. Power Valuation Services, formerly
NADA Used Car Guide, is a leading provider of vehicle valuation products, services and information
to businesses. Its team collects and analyzes over 1 million combined automotive and truck wholesale
and retail transactions per month, and delivers a range of guidebooks, auction data, analysis and
data solutions. J.D. Power acquired NADA Used Car

📎 Sources:
- Motor-vehicle-value-articles-Late-2021 (1).pdf
- Motor-vehicle-value-articles-Late-2021.pdf

🧠 You: according to the document, what does the page 5 talk about?

🤖 Assistant: