In [1]:
import faiss  # This will now use the GPU version
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# --- LangChain Imports for RAG ---
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GPT-2 text-generation pipeline on GPU index 0, wrapped for LangChain.
# NOTE(review): `pipe` and `llm` are both reassigned further down (the
# text2text-generation cell) before anything in the visible notebook reads
# them, so this model load looks redundant — confirm before removing.
pipe = pipeline(
    "text-generation",
    model="openai-community/gpt2",
    max_new_tokens=150,
    device=0,
)
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [3]:
# Load the source text file, decoding it as UTF-8.
loader = TextLoader(
    './manageMoney.txt',
    encoding='utf-8',
)
documents = loader.load()

In [4]:
# Cut the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
# NOTE(review): `chunks` is not read again in the visible cells — later cells
# build and use `all_chunks` instead; confirm whether this cell is still needed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [5]:
print("Loading models to GPU... This might take a few minutes the first time.")

# Sentence-embedding model, placed on the CUDA device at construction time.
embedding_model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device='cuda',
)

# The language model and its tokenizer are loaded in the cells that follow.


Loading models to GPU... This might take a few minutes the first time.


In [6]:
# Hugging Face model id used by both the tokenizer load and the model load below.
llm_model_id = "google/flan-t5-base"


In [7]:
# Fetch the tokenizer that belongs to `llm_model_id`.
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
# The model itself is loaded and moved to the GPU in the next cell.


In [8]:
# Load the seq2seq model for `llm_model_id` and move it to the CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_id).to('cuda')



In [9]:
# Build the text2text-generation pipeline around the already-loaded model and
# tokenizer, then expose it to LangChain through HuggingFacePipeline.
pipe = pipeline(
    "text2text-generation",
    tokenizer=tokenizer,
    model=model,
    device=0,  # index 0 = first GPU
    max_length=512,
)
llm = HuggingFacePipeline(pipeline=pipe)

print("Models loaded successfully! ✅")

Device set to use cuda:0


Models loaded successfully! ✅


In [10]:
# Start from an empty chunk list; it is replaced once the splitter has run.
all_chunks = list()

# Load the source text file, decoding it as UTF-8.
loader = TextLoader(
    './manageMoney.txt',
    encoding='utf-8',
)
documents = loader.load()

In [11]:
# Chunk the documents using an explicit separator list, ordered from the
# coarsest break (blank line) down to the finest (comma).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", ","],
)
all_chunks = text_splitter.split_documents(documents)

In [12]:
# Report how many chunks the splitter produced.
print(f"Created {len(all_chunks)} text chunks.")


Created 112 text chunks.


In [13]:
print("Creating embeddings for all chunks...")
# Collect the raw text of every chunk and encode the whole list in one call,
# with a progress bar shown while it runs.
chunk_texts = [doc.page_content for doc in all_chunks]
embeddings = embedding_model.encode(
    chunk_texts,
    show_progress_bar=True,
)
print("Embeddings created successfully.")

Creating embeddings for all chunks...


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.67it/s]

Embeddings created successfully.





In [14]:
# --- Build FAISS index (CPU) ---
# The index created below is a plain faiss.IndexFlatL2 and is not moved to a
# GPU anywhere in the visible cells, so the banner says CPU to agree with the
# later "CPU index is ready!" message.
print("Building FAISS index (CPU)...")
d = embeddings.shape[1]  # Dimension of embeddings

Building GPU-accelerated FAISS index...


In [15]:
# 1. Create a flat L2 index of dimension `d`. This is the index the notebook
#    later adds the embeddings to and searches.
index = faiss.IndexFlatL2(d)


In [16]:
# 2. Load the chunk embeddings into the index.
# reset() empties the index first, so re-running this cell does not append the
# same vectors a second time (which would produce duplicate search hits).
index.reset()
index.add(embeddings)

In [17]:
# Confirm how many vectors the index now holds.
print(f"CPU index is ready! Indexed {index.ntotal} vectors. ✅")


CPU index is ready! Indexed 112 vectors. ✅


In [18]:
def retrieve_and_format_docs(query_text: str, k: int = 4) -> str:
    """Retrieve the chunks nearest to a query and render them as one string.

    The query is embedded with ``embedding_model``, the FAISS ``index`` is
    searched for its ``k`` nearest vectors, and the matching entries of
    ``all_chunks`` are formatted as ``Content: ...`` / ``Source: ...`` pairs
    separated by blank lines.

    Args:
        query_text: The question to find supporting chunks for.
        k: Number of neighbours to request from the index.

    Returns:
        The formatted context string; empty when the search yields no hits.
    """
    query_embedding = embedding_model.encode([query_text])
    _, indices = index.search(query_embedding, k)
    # FAISS pads the result row with -1 when fewer than k vectors are
    # available; without this filter all_chunks[-1] would silently return the
    # last chunk as if it were a real match.
    retrieved_docs = [all_chunks[int(i)] for i in indices[0] if i >= 0]
    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {doc.metadata.get('source', 'N/A')}"
        for doc in retrieved_docs
    )


In [None]:
# Prompt text for the RAG chain: {context} is filled with the retrieved chunks
# and {question} with the user's question.
template = """Answer the question based only on the following context. If you don't know the answer, just say that you don't know.

Context:
{context}

Question: {question}"""

In [None]:
# Turn the template string into a LangChain prompt object.
prompt = ChatPromptTemplate.from_template(template)

In [20]:
# Assemble the RAG chain: retrieval -> prompt -> LLM -> plain string.
rag_chain = (
    # First stage: given the incoming question string, produce the two fields
    # the prompt needs — 'context' from retrieve_and_format_docs and
    # 'question' passed through unchanged.
    {"context": retrieve_and_format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Interactive question-answering loop
import time
while True:
    question = input("Ask a question about your documents (or type 'exit' to quit): ")
    if question.lower() == 'exit':
        break
    
    start_time = time.time()
    
    # Invoke the RAG chain
    answer = rag_chain.invoke(question)
    
    end_time = time.time()
    
    print("\n--- Quation ---")
    print(question)
    print("\n--- Answer ---")
    print(answer)
    print(f"Time taken: {end_time - start_time:.2f} seconds")
    print("----------------\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (695 > 512). Running this sequence through the model will result in indexing errors



--- Answer ---
budgeting becomes a tool of empowerment. The mindset shifts from a reactive "I can't spend on this" to a proactive "I am choosing not to spend on this, so I can achieve my goal of that."
Time taken: 14.29 seconds
----------------


--- Answer ---
i don't know
Time taken: 2.11 seconds
----------------


--- Answer ---
a notebook, a spreadsheet, or a dedicated app
Time taken: 2.92 seconds
----------------

