In [3]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint  # ✅ NEW package
from langchain.prompts import PromptTemplate

# Load HUGGINGFACEHUB_API_TOKEN from a local .env file (if present) into the
# process environment, then fail fast with a readable message when it is absent.
load_dotenv()
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not hf_token:
    # os.environ only accepts str values; assigning the None returned by
    # os.getenv for a missing variable would raise an opaque TypeError.
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set. Add it to your .env file or "
        "export it in the environment before running this cell."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

# Use a hosted Hugging Face model through the inference endpoint wrapper.
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",  # Replace with your preferred model
    temperature=0.7,
    max_new_tokens=256,  # generation cap, passed directly as a keyword argument
)

# Prompt template with a single `{topic}` placeholder.
# NOTE(review): the recorded output of this cell reads like a continuation of
# the prompt rather than a blog post; this chat-tuned model may need a
# chat-formatted prompt -- confirm before relying on the result.
prompt = PromptTemplate.from_template("Write a short and simple blog about {topic}.")

# Combine prompt and model using the pipe (`|`) syntax.
chain = prompt | llm

# Run the chain; the dict key must match the template placeholder name.
response = chain.invoke({"topic": "What is Generative AI"})

# Print output
print(response)


 Make sure to explain it in a way that is easy to understand for beginners, and provide examples of how it is being used in different industries. Use clear and concise language, and avoid any technical jargon or complex terminology. Additionally, include visuals such as images or videos to help illustrate your points. Lastly, provide a call-to-action at the end of your blog, encouraging readers to learn more about Generative AI and its potential impact on the future of technology.


In [17]:
# !pip install langchain
# !pip install python-dotenv
# !pip install pypdf
# !pip install langchain_community
# !pip install sentence-transformers
# !pip install -U langchain-huggingface
# !pip install faiss-cpu
# !pip install sentence-transformers
# !pip install flask_cors

In [15]:
import os
from dotenv import load_dotenv
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Load the environment: read HUGGINGFACEHUB_API_TOKEN from a local .env file
# (if present) and fail fast with a readable message when it is absent.
load_dotenv()
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not hf_token:
    # os.environ only accepts str values; assigning the None returned by
    # os.getenv for a missing variable would raise an opaque TypeError.
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set. Add it to your .env file or "
        "export it in the environment before running this cell."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

# Step 1: Load the PDF (path is relative to the notebook's working directory).
loader = PyPDFLoader("Generative_AI_RAG_Intro.pdf")
pages = loader.load()

# Step 2: Split the text into overlapping chunks for retrieval.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)
if not docs:
    # Stop here with a clear message rather than letting the vector store
    # build below fail on an empty document list.
    raise ValueError(
        "No text chunks were produced from the PDF; "
        "check that the file contains extractable text."
    )

# Step 3: Embed the chunks and index them in an in-memory FAISS store.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embedding_model)

# Step 4: Set up the Hugging Face LLM.
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    temperature=0.7,
    max_new_tokens=256,
)

# Step 5: Retrieval QA chain.
# When a question is asked:
#   1. It is converted into an embedding.
#   2. FAISS finds similar embedded chunks.
#   3. Those chunks are passed to the LLM to form an answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=False,
)

# Ask a question
query = "What is RAG and how does it works?"
response = qa.invoke(query)

# The chain's answer text is read from the "result" key of the response.
print("Q: ", query)
print("A: ", response["result"])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Q:  What is RAG and how does it works?
A:   RAG (Retrieval-Augmented Generation) is a technique that combines a language model with a retrieval mechanism. When a user asks a question, the system first retrieves relevant documents, then passes them to the LLM (Large Language Model) to generate a more accurate answer. RAG improves factual accuracy, makes the model more reliable, and enables domain-specific responses. This technique is especially useful in domains where accuracy and real-time data matter. RAG helps automate and augment work in various domains like education, marketing, and software development. However, LLMs (Large Language Models) have limitations, such as hallucinating, lacking up-to-date knowledge, and being sensitive to prompt phrasing, which limits their use in these areas.


In [4]:
# query = "What is the capital of india?"
# response = qa.invoke(query)
# print(response["result"])