## Import necessary packages

In [7]:
import textwrap
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from IPython.display import Markdown, display
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
import pandas as pd
def to_markdown(text):
  """Render `text` as a Markdown block quote.

  Every '•' bullet character is replaced with '  *', then each line
  (blank lines included) is prefixed with '> ' and the result is wrapped
  in an IPython `Markdown` object.
  """
  bulleted = text.replace('•', '  *')
  # Prefix every line, keeping the original line endings intact.
  quoted_lines = ['> ' + line for line in bulleted.splitlines(keepends=True)]
  return Markdown(''.join(quoted_lines))

## Load Model

In [25]:
# Import the Python SDK
import os

import google.generativeai as genai

# Read the API key from the environment (optionally populated from a local
# .env file) instead of writing the secret into the notebook itself.
load_dotenv()
my_key = os.environ.get('GOOGLE_API_KEY')
if not my_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in your shell or add it to a .env file."
    )
genai.configure(api_key=my_key)

# Gemini model handle used by the summarization / highlighting cells below.
model = genai.GenerativeModel('gemini-pro')

In [9]:
# Load variables from a .env file (if present) before building the client.
load_dotenv()

# LangChain chat wrapper for the same Gemini model, authenticated with my_key.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=my_key,
)

In [26]:
# Load the chapter PDF and split it into documents.
loader = PyPDFLoader("ch1.pdf")
pages = loader.load_and_split()

# Only the first three documents are kept; their text is joined with newlines.
first_pages_text = [str(page.page_content) for page in pages[:3]]
doc = '\n'.join(first_pages_text)

In [27]:
# Embedding client shared by the semantic splitter and the FAISS index below.
embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=my_key,
)

In [28]:
# Build the SemanticChunker text splitter from the embedding client created above.
text_splitter = SemanticChunker(embeddings)

In [29]:
# Split the joined page text into chunks.
texts = text_splitter.split_text(doc)

# Only a cell's last expression is displayed, so a bare `len(texts)` in the
# middle of the cell is silently discarded; print the chunk count explicitly.
print(f"Number of chunks: {len(texts)}")

# Preview the first chunk.
texts[0]

'Chapter 1. The Machine Learning\nLandscape\nNot so long ago, if you had picked up your phone and asked it the way\nhome, it would have ignored you—and people would have questioned your\nsanity . But machine learning is no longer science fiction: billions of people\nuse it every day . And the truth is it has actually been around for decades in\nsome specialized applications, such as optical character recognition (OCR). The first ML application that really became mainstream, improving the\nlives of hundreds of millions of people, took over the world back in the\n1990s: the spam filter . It’s not exactly a self-aware robot, but it does\ntechnically qualify as machine learning: it has actually learned so well that\nyou seldom need to flag an email as spam anymore. It was followed by\nhundreds of ML applications that now quietly power hundreds of products\nand features that you use regularly: voice prompts, automatic translation,\nimage search, product recommendations, and many more. Where

In [30]:
# Persist the chunks as a single-column ('Text') CSV, without the index column.
df = pd.DataFrame(data=texts, columns=['Text'])
df.to_csv(path_or_buf='texts.csv', index=False)

In [31]:
# Reload the chunks from the CSV written above.
# dtype=str + keep_default_na=False make the round trip lossless: without them
# an empty chunk (or one that reads like "NA" / a number) would come back as
# NaN or a numeric value and break the string concatenation in later cells.
df = pd.read_csv('texts.csv', dtype=str, keep_default_na=False)
texts = df['Text'].tolist()

In [32]:
# Render the first chunk as a Markdown block quote using the to_markdown helper.
to_markdown(texts[0])

> Chapter 1. The Machine Learning
> Landscape
> Not so long ago, if you had picked up your phone and asked it the way
> home, it would have ignored you—and people would have questioned your
> sanity . But machine learning is no longer science fiction: billions of people
> use it every day . And the truth is it has actually been around for decades in
> some specialized applications, such as optical character recognition (OCR). The first ML application that really became mainstream, improving the
> lives of hundreds of millions of people, took over the world back in the
> 1990s: the spam filter . It’s not exactly a self-aware robot, but it does
> technically qualify as machine learning: it has actually learned so well that
> you seldom need to flag an email as spam anymore. It was followed by
> hundreds of ML applications that now quietly power hundreds of products
> and features that you use regularly: voice prompts, automatic translation,
> image search, product recommendations, and many more. Where does machine learning start and where does it end?

In [33]:
# Embed every chunk and index the vectors in a FAISS store for similarity search.
docsearch = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
)

In [34]:
# Run a similarity search against the index and show the text of the top hit.
query = 'examples of ML Applications'
matches = docsearch.similarity_search(query)
best_match = matches[0]
print(best_match.page_content)

Chapter 1. The Machine Learning
Landscape
Not so long ago, if you had picked up your phone and asked it the way
home, it would have ignored you—and people would have questioned your
sanity . But machine learning is no longer science fiction: billions of people
use it every day . And the truth is it has actually been around for decades in
some specialized applications, such as optical character recognition (OCR). The first ML application that really became mainstream, improving the
lives of hundreds of millions of people, took over the world back in the
1990s: the spam filter . It’s not exactly a self-aware robot, but it does
technically qualify as machine learning: it has actually learned so well that
you seldom need to flag an email as spam anymore. It was followed by
hundreds of ML applications that now quietly power hundreds of products
and features that you use regularly: voice prompts, automatic translation,
image search, product recommendations, and many more. Where does machine 

## Generate Summary

In [35]:
def create_context(idx):
  """Build a context preamble that summarizes every chunk before `texts[idx]`.

  The chunks `texts[0:idx]` are concatenated, the module-level `model` is
  asked to summarize them, and the summary is returned behind a fixed
  instruction line.

  Args:
    idx: Index of the current chunk in the module-level `texts` list.

  Returns:
    '' when there is no preceding text (idx <= 0); otherwise the
    instruction line followed by the model's summary text.
  """
  if idx <= 0:
    return ''

  # Only the summary of the complete prefix is returned, so one request on
  # the joined prefix is enough; issuing a request for every shorter prefix
  # would spend idx - 1 extra API calls whose responses are thrown away.
  past = ''.join(texts[:idx])
  prompt = "Provide a detailed summarize to the following text:\n\n\n" + past
  response = model.generate_content(prompt)
  return """This is a summary of the previous text, use it to help you understand the current text more in order to give better results \n\n\n""" + response.text

In [36]:
# For each chunk, make three model calls:
#   1. pick the key sentences, with a summary of the earlier chunks as context;
#   2. render the chunk as styled HTML;
#   3. highlight the key sentences inside that HTML.
# The final HTML of every chunk is collected in `highlighted_pages`.
idx = len(texts)
highlighted_pages = []
for i, chunk in enumerate(texts):
    # Step 1: key sentences (prompt = context + instruction + chunk text).
    context = create_context(i)
    prompt_important = context + """\n\n Prompt: {Tell me exactly what are the most important 4 to 8 sentences in this text, don't return less than 3 sentences
        don't write anything except for the important sentences please.} \n\n\n""" + chunk
    important_sentences = model.generate_content(prompt_important)

    # Step 2: HTML rendering of the unmodified chunk text.
    prompt_html = """Can you turn the following text into html code, Use styles and change fonts but never change any sentence or add any sentence,
        use only sentences from the text, please don't change any sentence: """ + chunk
    html_response = model.generate_content(prompt_html)

    # Step 3: highlight the key sentences inside the generated HTML.
    prompt_highlight = """I have this HTML code:\n""" + html_response.text + """ Highlight the in the html the following sentences and don't change any sentence, just yellow highlight them:\n\n\n
        """ + important_sentences.text
    highlighted_response = model.generate_content(prompt_highlight)

    highlighted_pages.append(highlighted_response.text)


In [37]:
import os

# Make sure the output directory exists; open(..., "w") does not create it.
os.makedirs("results", exist_ok=True)

for i, page_html in enumerate(highlighted_pages):
    # One HTML file per chunk: results/page<i>.html
    file_path = "results/page"+str(i)+".html"
    # Write UTF-8 explicitly: the text contains non-ASCII characters and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(page_html)
    print("String content saved to:", file_path)


String content saved to: results/page0.html
String content saved to: results/page1.html
String content saved to: results/page2.html
