In [2]:
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from langchain.llms import Ollama

In [8]:
def load_document(file_path):
    """Load a PDF or plain-text file through the matching LangChain loader.

    Args:
        file_path: Location of the file, as a string or an ``os.PathLike``
            object. The loader is chosen from the file extension, which is
            matched case-insensitively (``report.PDF`` works like
            ``report.pdf``).

    Returns:
        The value returned by the selected loader's ``load()`` method.

    Raises:
        ValueError: If the path ends with neither ``.pdf`` nor ``.txt``.
    """
    # Normalise to a plain string so pathlib.Path inputs are accepted too.
    path = os.fspath(file_path)
    # Compare on a lowercased copy; the loader still receives the real path.
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif lowered.endswith(".txt"):
        loader = TextLoader(path)
    else:
        raise ValueError("Unsupported file type. Use PDF or TXT.")
    return loader.load()
# Path of the document to index; load_document accepts .pdf or .txt files.
file_path = "genai-principles.pdf" 
documents = load_document(file_path)

In [9]:
# Show the loaded Document list as this cell's output.
documents

[Document(metadata={'producer': 'macOS Version 14.0 (Build 23A344) Quartz PDFContext', 'creator': 'Pages', 'creationdate': '2023-10-25T02:42:03+00:00', 'author': 'Karan Singh', 'moddate': '2023-10-25T09:40:35-04:00', 'title': 'GenAI-Principles', 'source': 'genai-principles.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='Karan Singh, Assistant Professor of Operations Research \nPrinciples of Generative AI \nA Technical Introduction \nGenerative artificial intelligence (GenAI) tools are an emerging class of new-age artificial \nintelligence algorithms capable of producing novel content — in varied formats such as text, \naudio, video, pictures, and code — based on user prompts. Recent advances in machine \nlearning (ML), massive datasets, and substantial increases in computing power have propelled \nsuch tools to human-level performance on academic and professional benchmarks , 1\ncomparable to the ninetieth percentile on the SAT and the bar exam. \nThis rapid progr

In [10]:
def split_into_chunks(documents, chunk_size=500, chunk_overlap=100):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split, as accepted by the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)
    return pieces

In [11]:
# Split the loaded documents using the defaults (chunk_size=500, chunk_overlap=100).
chunks = split_into_chunks(documents)

In [12]:
def create_bm25_index(chunks, tokenizer=None):
    """Build a BM25Okapi index over the text of the given chunks.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to whitespace splitting (``str.split``). Queries must be
            tokenized the same way as the corpus, otherwise terms will not
            match at search time.

    Returns:
        A ``(bm25, corpus)`` tuple: the BM25Okapi index and the list of chunk
        texts, in the same order as they were indexed.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if tokenizer is None:
        tokenizer = str.split

    corpus = [chunk.page_content for chunk in chunks]
    if not corpus:
        # BM25Okapi divides by the corpus size when computing the average
        # document length, so an empty corpus would otherwise surface as an
        # opaque arithmetic error. Fail early with a clear message instead.
        raise ValueError("Cannot build a BM25 index from an empty list of chunks.")

    tokenized_corpus = [tokenizer(text) for text in corpus]
    return BM25Okapi(tokenized_corpus), corpus

In [13]:
# Build the BM25 index; `corpus` holds the chunk texts in the order they were indexed.
bm25, corpus = create_bm25_index(chunks)

In [14]:
def bm25_search(query, bm25, corpus, top_n=3, tokenizer=None):
    """Return the ``top_n`` corpus entries that best match ``query``.

    Args:
        query: Search string.
        bm25: Index object exposing ``get_top_n(tokens, documents, n=...)``.
        corpus: The documents to return from, passed to ``get_top_n``.
        top_n: Maximum number of documents to return.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to whitespace splitting (``str.split``). It should match
            the tokenizer used when the index was built.

    Returns:
        A list of matching corpus entries as returned by ``get_top_n``, or an
        empty list when the query yields no tokens.
    """
    if tokenizer is None:
        tokenizer = str.split

    tokenized_query = tokenizer(query)
    if not tokenized_query:
        # With no query terms every document scores the same, so any "top"
        # documents would be arbitrary rather than relevant.
        return []

    return bm25.get_top_n(tokenized_query, corpus, n=top_n)

In [15]:
# Example query: retrieve the best-matching chunks (default top_n=3) and print them.
query = "What is Generative Ai?"
retrieved_docs = bm25_search(query, bm25, corpus)
print("Retrieved Documents\n", retrieved_docs)

Retrieved Documents
 ['Karan Singh, Assistant Professor of Operations Research \nPurpose and Scope  \nWhat are these new-era AI technologies? How do they function? What principles do they \noperate on? What makes them different than already-hyped-up conventional machine learning \n(ML) models? For what tasks is this class of technology most impactful? What future advances \nmight one look forward to? These are the questions this report attempts to shed some light on.', 'Karan Singh, Assistant Professor of Operations Research \nPrinciples of Generative AI \nA Technical Introduction \nGenerative artificial intelligence (GenAI) tools are an emerging class of new-age artificial \nintelligence algorithms capable of producing novel content — in varied formats such as text, \naudio, video, pictures, and code — based on user prompts. Recent advances in machine \nlearning (ML), massive datasets, and substantial increases in computing power have propelled', 'language model, about five days to re

In [16]:
from langchain.llms import Ollama

def generate_response(query, retrieved_docs, model="mistral", llm=None):
    """Answer ``query`` with an LLM, using the retrieved texts as context.

    Args:
        query: The question to answer.
        retrieved_docs: Iterable of strings; they are joined with blank lines
            to form the context section of the prompt.
        model: Name of the Ollama model to use when ``llm`` is not supplied.
        llm: Optional pre-built object exposing ``invoke(prompt)``. Pass one
            to reuse a client across calls or to substitute another backend;
            when omitted, a new ``Ollama(model=model)`` is created.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    if llm is None:
        llm = Ollama(model=model)

    context = "\n\n".join(retrieved_docs)
    prompt = f"Using this information:\n{context}\nAnswer: {query}"
    return llm.invoke(prompt)

# Generate an answer to the query from the retrieved chunks and print it.
response = generate_response(query, retrieved_docs)
print("Response\n", response)

  llm = Ollama(model="mistral")


Response
  Generative Artificial Intelligence (GenAI) is a new class of artificial intelligence algorithms that can create novel content based on user prompts. This content can take various forms, such as text, audio, video, images, and code. The functioning of GenAI relies heavily on recent advancements in machine learning (ML), vast datasets, and increased computing power.

Compared to conventional machine learning models, GenAI is more versatile and dynamic since it generates new content instead of merely recognizing patterns or making predictions based on existing data. Furthermore, these models operate by using statistical patterns within the given dataset to generate new content that resembles but is not exactly the same as the original data points.

GenAI technology is particularly impactful in tasks such as creating personalized content for users, generating high-quality synthetic images or videos for various purposes like gaming, entertainment, and training, writing articles o