## Import necessary packages

In [120]:
import os
import pathlib
import textwrap

from dotenv import load_dotenv
from IPython.display import display
from IPython.display import Markdown
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def to_markdown(text):
    """Render ``text`` as a Markdown block quote.

    Every '•' character is rewritten as an indented '*' list marker, then
    each line (blank lines included) is prefixed with '> ' so the whole
    text displays as a quote.

    Args:
        text: The raw string to format.

    Returns:
        An ``IPython.display.Markdown`` object wrapping the quoted text.
    """
    bulleted = text.replace('•', '  *')
    # The always-true predicate makes textwrap.indent prefix whitespace-only
    # lines too, which it would otherwise leave untouched.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

## Load Model

In [121]:
# Pull settings from a local .env file into the process environment
# (presumably the Google API key used by the Gemini clients; confirm).
load_dotenv()

# Chat model handle for the generation steps of the notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
)

In [122]:
# Location of the PDF to index. Defaults to the original author's local copy;
# set PDF_PATH in the environment (or in the .env file read by load_dotenv()
# in an earlier cell) to run the notebook on another machine.
PDF_PATH = os.environ.get("PDF_PATH") or r"E:\Courses\projects\Google-AI-Hackathon\ch1.pdf"
# Number of leading pages to index (presumably kept small for this experiment).
MAX_PAGES = 3

loader = PyPDFLoader(PDF_PATH)
# load() returns one Document per PDF page. The previous load_and_split() call
# pre-chunked the text with LangChain's default splitter, whose chunks overlap:
# slicing then selected chunks rather than pages, and joining them could repeat
# the overlapping text before the dedicated splitting step later in the notebook.
pages = loader.load()
doc = '\n'.join(page.page_content for page in pages[:MAX_PAGES])

In [123]:
# Break the joined text into chunks of at most 1000 characters (measured with
# len), letting neighbouring chunks share up to 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len)
texts = text_splitter.split_text(doc)

# Show the first chunk as a quick sanity check of the split.
texts[0]

'Chapter 1. The Machine Learning\nLandscape\nNot so long ago, if you had picked up your phone and asked it the way\nhome, it would have ignored you—and people would have questioned your\nsanity . But machine learning is no longer science fiction: billions of people\nuse it every day . And the truth is it has actually been around for decades in\nsome specialized applications, such as optical character recognition (OCR).\nThe first ML application that really became mainstream, improving the\nlives of hundreds of millions of people, took over the world back in the\n1990s: the spam filter . It’s not exactly a self-aware robot, but it does\ntechnically qualify as machine learning: it has actually learned so well that\nyou seldom need to flag an email as spam anymore. It was followed by\nhundreds of ML applications that now quietly power hundreds of products\nand features that you use regularly: voice prompts, automatic translation,\nimage search, product recommendations, and many more.'

In [124]:
# Embedding client used to vectorise the text chunks for similarity search.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [125]:
# Embed every chunk and build a FAISS vector index over the results.
docsearch = FAISS.from_texts(texts=texts, embedding=embeddings)

In [126]:
# Look up the chunk most similar to a sample question and print its text.
query = "examples of ML Applications"
top_hit = docsearch.similarity_search(query)[0]
print(top_hit.page_content)

Chapter 1. The Machine Learning
Landscape
Not so long ago, if you had picked up your phone and asked it the way
home, it would have ignored you—and people would have questioned your
sanity . But machine learning is no longer science fiction: billions of people
use it every day . And the truth is it has actually been around for decades in
some specialized applications, such as optical character recognition (OCR).
The first ML application that really became mainstream, improving the
lives of hundreds of millions of people, took over the world back in the
1990s: the spam filter . It’s not exactly a self-aware robot, but it does
technically qualify as machine learning: it has actually learned so well that
you seldom need to flag an email as spam anymore. It was followed by
hundreds of ML applications that now quietly power hundreds of products
and features that you use regularly: voice prompts, automatic translation,
image search, product recommendations, and many more.


### Next Steps:
1. Write a prompt template that generates a summary of the document.
2. Pass the summary as the query to `docsearch`.
3. Use the similarity-search results to choose which passages to highlight.