In [1]:
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

In [2]:
import wikipedia

# Title of the Wikipedia article used as the knowledge source
page_title = "Artificial Intelligence"

# Fetch the article, then keep only its plain-text body
article = wikipedia.page(page_title)
content = article.content

# Write the text to disk as UTF-8; a later cell loads it from "sample.txt"
with open("sample.txt", mode="w", encoding="utf-8") as file:
    file.write(content)

print("Wikipedia page saved as sample.txt")

Wikipedia page saved as sample.txt


In [3]:
# Load and split document
def load_and_split_document(file_path, chunk_size=500, chunk_overlap=50, encoding="utf-8"):
    """Load a text file and split it into overlapping chunks.

    Args:
        file_path: Path of the text file to load.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 50.
        encoding: Text encoding used to read the file. Defaults to "utf-8",
            the encoding this notebook uses when it writes sample.txt.

    Returns:
        The split documents returned by ``split_documents``.
    """
    # Pass the encoding explicitly: the file is written as UTF-8, and relying
    # on the platform default can fail or mis-decode on non-UTF-8 locales.
    loader = TextLoader(file_path, encoding=encoding)
    docs = loader.load()

    # Split documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

In [4]:
# Embed the document chunks and keep them in a FAISS vector store
def create_faiss_index(docs):
    """Build a FAISS index over ``docs`` and save it locally.

    Args:
        docs: Documents to embed and index.

    Returns:
        The FAISS vector store built from ``docs``. Before returning, the
        store is saved with ``save_local("faiss_index")``.
    """
    vector_store = FAISS.from_documents(
        docs,
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    )
    # Persist the index for later use
    vector_store.save_local("faiss_index")
    return vector_store

In [5]:
# Load the Falcon-7B-Instruct model with 4-bit quantization
def load_llm_model():
    """Load the quantized causal LM and wrap it for use with LangChain.

    Builds a 4-bit NF4 ``BitsAndBytesConfig``, loads the tokenizer and model
    for "tiiuae/falcon-7b-instruct", and wraps a ``text-generation`` pipeline
    in ``HuggingFacePipeline``.

    Returns:
        A ``(llm, tokenizer)`` tuple: the ``HuggingFacePipeline`` wrapper and
        the tokenizer that was loaded for the model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float,
        bnb_4bit_use_double_quant=True
    )

    model_name = "tiiuae/falcon-7b-instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=200,
        # temperature only takes effect in sampling mode; without do_sample=True
        # generation is greedy and the temperature setting is ignored.
        do_sample=True,
        temperature=0.7,
        # Set the padding token explicitly instead of relying on the
        # "Setting `pad_token_id` to `eos_token_id`" fallback at generation time.
        pad_token_id=tokenizer.eos_token_id,
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    return llm, tokenizer

In [6]:
# Assemble the retrieval-augmented generation chain
def create_rag_pipeline(retriever, llm):
    """Combine a retriever and an LLM into a retrieval QA chain.

    Args:
        retriever: Retriever passed to ``create_retrieval_chain``.
        llm: Language model passed to ``create_stuff_documents_chain``.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # The system message carries the answering instructions and the
    # {context} placeholder; the human message carries the {input} question.
    qa_prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "Use the given context to answer the question. "
            "If you don't know the answer, say you don't know. "
            "Use three sentences maximum and keep the answer concise. "
            "Context: {context}",
        ),
        ("human", "{input}"),
    ])

    return create_retrieval_chain(
        retriever,
        create_stuff_documents_chain(llm, qa_prompt),
    )


In [13]:
# Initialize components
# Chunk the article text that an earlier cell saved to sample.txt.
docs = load_and_split_document("sample.txt")
# Build the LLM wrapper; load_llm_model returns the tokenizer alongside it.
llm, tokenizer = load_llm_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [14]:
# Embed the chunks into a FAISS index (create_faiss_index also saves it locally),
# then expose the index as a retriever using as_retriever()'s default settings.
faiss_index = create_faiss_index(docs)
retriever = faiss_index.as_retriever()

In [15]:
# Combine the retriever and the LLM into a single retrieval QA chain.
rag_chain = create_rag_pipeline(retriever, llm)

In [16]:
# Question to ask; it is passed to the chain under the "input" key,
# matching the {input} placeholder in the prompt.
user_query="What is Artificial Intelligence?"

response = rag_chain.invoke({"input": user_query})

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


In [17]:
# Display the raw result; the output below shows the 'input' and 'context' entries.
response

{'input': 'What is Artificial Intelligence?',
 'context': [Document(id='64f6ab07-3718-4f6b-b961-7adb1c435887', metadata={'source': 'sample.txt'}, page_content='Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. Such machines may be called AIs.'),
  Document(id='fdd10f74-7d1d-4978-b87c-0f338420407e', metadata={'source': 'sample.txt'}, page_content='rarely described as "artificial intelligence" (a tendency known as the AI effect).'),
  Document(id='9e338950-484f-49cc-bb53-413218d403f9', metadata={'source': 'sample.txt'}, page_content='Artificial intelligent (AI) agents are software entities designed to perceive their environment, make decisions, and take actions autonomou