In [1]:
import faiss
import numpy as np
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS

In [3]:
from langchain.document_loaders import PyPDFLoader

In [4]:
def load_document(file_path):
    """Read a PDF through ``PyPDFLoader`` and return whatever its ``load()`` yields.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The result of ``PyPDFLoader(file_path).load()``; the notebook treats it
        as a sized collection of pages.
    """
    return PyPDFLoader(file_path).load()

In [5]:
# Source PDF for the whole pipeline; `documents` is consumed by the chunking step.
file_path = "genai-principles.pdf"
documents = load_document(file_path=file_path)
print(f"Loaded {len(documents)} pages from {file_path}")

Loaded 12 pages from genai-principles.pdf


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_into_chunks(docs, chunk_size=500, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents handed to ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [8]:
# Chunk the loaded pages with the default size/overlap; `chunks` feeds both indexes.
chunks = split_into_chunks(docs=documents)
print(f"Total Chunks: {len(chunks)}")

Total Chunks: 63


In [10]:
from rank_bm25 import BM25Okapi

# Raw chunk text, in chunk order; also passed to hybrid_search as the BM25 corpus.
corpus = [doc.page_content for doc in chunks]
# Whitespace tokenization only: case and punctuation are kept as-is, so a query
# must be tokenized the same way (query.split()) to match these tokens.
tokenized_corpus = list(map(str.split, corpus))
bm25 = BM25Okapi(tokenized_corpus)
print("BM25 Index Created!")

BM25 Index Created!


In [None]:
import faiss
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS

In [12]:
# Embedding model shared by index construction here and by load_faiss() later.
embedding_model = OllamaEmbeddings(model="mxbai-embed-large")

# Embed every chunk, build the FAISS store, then persist it under ./faiss_store.
faiss_index = FAISS.from_documents(documents=chunks, embedding=embedding_model)
faiss_index.save_local(folder_path="faiss_store")
print("FAISS Index Created & Stored Locally")

FAISS Index Created & Stored Locally


In [16]:
def load_faiss():
    """Reload the FAISS store saved under ``faiss_store``.

    Uses the notebook-level ``embedding_model`` as the embedding function.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # SECURITY: allow_dangerous_deserialization=True opts in to deserializing the
    # stored index files (LangChain documents this as pickle-based). Only load a
    # "faiss_store" directory that this notebook itself produced.
    store = FAISS.load_local(
        "faiss_store",
        embedding_model,
        allow_dangerous_deserialization=True,
    )
    return store

In [17]:
# Rebind `faiss_index` to the copy read back from disk, confirming the saved store loads.
faiss_index = load_faiss()
print("FAISS Index Loaded Successfully!")

FAISS Index Loaded Successfully!


In [18]:
def hybrid_search(query, bm25_model, faiss_index, corpus, top_n=3):
    """Retrieve passages for ``query`` with both BM25 and FAISS.

    Args:
        query: Search string. Split on whitespace for BM25 and passed unchanged
            to the FAISS index.
        bm25_model: Object exposing ``get_top_n(tokens, corpus, n=...)``.
        faiss_index: Object exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        corpus: Passage strings handed to ``bm25_model.get_top_n``.
        top_n: Maximum number of hits requested from each retriever.

    Returns:
        A list of passage strings: BM25 hits first, then FAISS hits, with exact
        duplicates removed (first occurrence kept).
    """
    # Use the model that was passed in. The previous body read the notebook-global
    # `bm25`, which ignored the argument and raised NameError on a fresh kernel.
    bm25_results = bm25_model.get_top_n(query.split(), corpus, n=top_n)
    faiss_results = faiss_index.similarity_search(query, k=top_n)
    combined = list(bm25_results) + [doc.page_content for doc in faiss_results]
    # Both retrievers can return the same chunk; dict.fromkeys drops repeats while
    # preserving order so the LLM context does not contain the passage twice.
    return list(dict.fromkeys(combined))

In [20]:
# Example question run through both retrievers; the results feed generate_response.
query = "what is generative ai."
retrieved_docs = hybrid_search(
    query,
    bm25_model=bm25,
    faiss_index=faiss_index,
    corpus=corpus,
)
print("Retrieved Documents\n", retrieved_docs)

Retrieved Documents
 ['While generative AI models come in many different shapes, utilizing varied statistical and \ncomputational techniques to target various modalities, ranging from code and text to audio and \nvideo, this report focuses almost exclusively on large language models (LLMs) capable of \ngenerating novel text from textual prompts. This choice is partly due to the substantial lead \nLLMs have in driving the overall usage of generative AI models  and partly due to the centrality 5', 'supervised learning requires a well-curated dataset that is closely aligned with the \nprediction task at hand. But, as we will see, language models are trained on vast corpora \nof somewhat ruthlessly collected texts from the internet. Yet, completing a random partial \nsentence from the internet is presumably not what businesses using language models \ncare about. \nDeep Learning as Automated Representation Learning', 'Karan Singh, Assistant Professor of Operations Research \nPurpose and Sco

In [21]:
from langchain.llms import Ollama

# Ollama-backed LLM ("mistral") that generate_response calls through llm.invoke.
llm = Ollama(
    model="mistral",
)

def generate_response(query, retrieved_docs):
    """Ask the notebook-level ``llm`` to answer ``query`` from retrieved passages.

    Args:
        query: The user question, inserted after ``Question:`` in the prompt.
        retrieved_docs: Iterable of passage strings; joined with blank lines to
            form the ``Context:`` section.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    context = "\n\n".join(retrieved_docs)
    prompt = f"Context:\n{context}\nQuestion: {query}\nAnswer:"
    return llm.invoke(prompt)
# Generate the final answer from the hybrid-retrieval context and show it.
response = generate_response(query, retrieved_docs)
# Fixed label: the original printed the literal "Answern" because the backslash
# of the intended "\n" was missing (compare "Retrieved Documents\n" earlier).
print("Answer\n", response)

  llm = Ollama(model="mistral")


Answern  Generative Artificial Intelligence (GenAI) refers to a class of emerging artificial intelligence algorithms that can produce novel content based on user prompts. This content can be in various formats such as text, audio, video, pictures, or code. In the context of this report, the focus is mainly on Large Language Models (LLMs) capable of generating novel text from textual prompts. These models utilize substantial datasets and powerful computational techniques to achieve their tasks.
