In [3]:
import os
from dotenv import load_dotenv

import textwrap
import numpy as np
import pandas as pd

import google.generativeai as genai


from IPython.display import Markdown


# Embedding Example

In [6]:
title = "The next genration of AI for developers and Google Workspace"
Sample_text = ("Title: The next generation of AI for developers and Google Workspace",)
model = 'models/embedding-001'
embedding = genai.embed_content(model=model,
                                content=Sample_text,
                                task_type="retrieval_document",
                                title=title)
print(embedding)
print(len(embedding['embedding'][0]))

{'embedding': [[0.042077992, -0.019561278, -0.023592692, -0.040348373, 0.07969901, 0.043683343, 0.00528953, 0.030831655, 0.031621926, 0.08120193, -0.005643414, 0.01766146, -0.058881667, -0.001357631, 0.039107047, -0.032413244, 0.028788012, 0.011042602, -0.040666655, 0.031072233, 0.066110104, -0.008326009, -0.021814477, -0.05020413, -0.028840445, -0.03405121, 0.005174113, -0.009935476, 0.022726668, 0.009107596, -0.049797516, 0.012992686, -0.039540857, 0.02242963, -0.024084723, -0.029931977, -0.01600273, -0.02352084, 0.017464988, -0.0020999317, -0.019449204, -0.07947758, -0.016422233, 0.04702703, 0.044103574, -0.025454935, 0.029373871, 0.023687422, 0.016594011, -0.052016895, 0.03853344, 0.04166182, 0.07072213, -0.018024273, -0.009445107, 0.007088879, 0.018594094, -0.0264267, 0.0034890422, 0.0020487898, -0.0043285703, 0.0225305, -0.024251333, 0.021448512, -0.00617759, -0.044219963, -0.041450296, 0.0020235984, 0.03386156, 0.046009243, 0.027184604, 0.0017384415, 0.019410675, -0.056208737, -

Extraction of text from PDF file

In [7]:
from pdfminer.high_level import extract_text

text = extract_text("Tax Rebate Certificate.pdf")
# split for every 5 sentences
sentences = text.split(".")
sentences=["".join(sentences[i:i+10]).replace('\n','').strip() for i in range(0, len(sentences),10)]
for i in sentences:
    print(i)

Avyansh Tivedi(2023-2034)[202316] K-GARTEN PRE-SCHOOL Pathak Puram colony, Raebareli Road Lucknow, UP(226025)                 Ph No: 8400054584                                    Email ID: kgarten072023@gmailcom                                                              Date:               This is to certify that AVYANSH TRIVEDI S/0 NISHIKANT TRIVEDI is a bonafide student of this institution in Class – Nursery in Session 202-24   Instalments  Tuition Fee - April Tuition Fee – May Tuition Fee – June Tuition Fee – July Tuition Fee – August Tuition Fee – September Tuition Fee – October Tuition Fee – November Tuition Fee – December Tuition Fee – January Tuition Fee – February Tuition Fee - March S No 1 2 3 4 5 6
7 8 9 10 11 12 Admission Fee Received 2000 2000 2000 2000 2000 2000 2000 2000 2000 5000 Total:   23,000 Amount of Rupees 23,000 has been paid by Nishikant Trivedi for the period of July to February  In this session as Tuition fees Principal K-Garten Pre- School


In [8]:
documents = []
for idx, i in enumerate(sentences):
    documents.append({
        'title':f"Document {idx}",
        'content': i
    })
documents

[{'title': 'Document 0',
  'content': 'Avyansh Tivedi(2023-2034)[202316] K-GARTEN PRE-SCHOOL Pathak Puram colony, Raebareli Road Lucknow, UP(226025)                 Ph No: 8400054584                                    Email ID: kgarten072023@gmailcom                                                              Date:               This is to certify that AVYANSH TRIVEDI S/0 NISHIKANT TRIVEDI is a bonafide student of this institution in Class – Nursery in Session 202-24   Instalments  Tuition Fee - April Tuition Fee – May Tuition Fee – June Tuition Fee – July Tuition Fee – August Tuition Fee – September Tuition Fee – October Tuition Fee – November Tuition Fee – December Tuition Fee – January Tuition Fee – February Tuition Fee - March S No 1 2 3 4 5 6'},
 {'title': 'Document 1',
  'content': '7 8 9 10 11 12 Admission Fee Received 2000 2000 2000 2000 2000 2000 2000 2000 2000 5000 Total:   23,000 Amount of Rupees 23,000 has been paid by Nishikant Trivedi for the period of July to February  

In [10]:
df = pd.DataFrame(documents)
df.columns = ['Title','Text']
df

Unnamed: 0,Title,Text
0,Document 0,Avyansh Tivedi(2023-2034)[202316] K-GARTEN PRE...
1,Document 1,7 8 9 10 11 12 Admission Fee Received 2000 200...


In [12]:
# Get the embedding of each text and add to an embeddings columns in the dataframe
def embed_fn(title, text):
    return genai.embed_content(
        model = model,
        content = text,
        task_type="retrieval_document",
        title=title
    )["embedding"]

df['Embeddings'] = df.apply(lambda row: embed_fn(row['Title'],row['Text']),axis=1)
df

Unnamed: 0,Title,Text,Embeddings
0,Document 0,Avyansh Tivedi(2023-2034)[202316] K-GARTEN PRE...,"[-0.00854073, -0.006398982, -0.029598754, 0.00..."
1,Document 1,7 8 9 10 11 12 Admission Fee Received 2000 200...,"[0.013075892, -0.005684731, 0.0009316697, 0.00..."


# Question and its embedding

In [None]:
query = "Responsibility at microsoft"
model ='models/embedding-001'

request = genai.embed_content(
    model=model,
    content=query,
    task_type='retrieval_query'
)
print(request)

In [14]:
def find_best_passage(query, dataframe):
    """
    compute the distances between the query and each  document in the datframe using the dot product.
    """
    query_embedding = genai.embed_content(model=model,
                                          content=query,
                                          task_type="retrieval_query")
    dot_products = np.dot(np.stack(dataframe['Embeddings']),query_embedding["embedding"])
    idx = np.argmax(dot_products)
    return dataframe.iloc[idx]['Text'] # Return text from index with max value


In [16]:
passage = find_best_passage(query,df)
passage

'7 8 9 10 11 12 Admission Fee Received 2000 2000 2000 2000 2000 2000 2000 2000 2000 5000 Total:   23,000 Amount of Rupees 23,000 has been paid by Nishikant Trivedi for the period of July to February  In this session as Tuition fees Principal K-Garten Pre- School'

In [17]:
def make_prompt(query, relevant_passage):
    escaped = relevant_passage.replace("'","").replace('"',"").replace("\n"," ")
    prompt = textwrap.dedent("""You are helpful and informative bot that answers\
                             questions using text from the reference passage included below.\
                             Be sure to respond in a complete sentence, being comprehensive,\
                             including all relevant background information.\
                             However, you are talking to a non-technical audience, \
                             so be sure to break down complicated concepts and \
                             strike a friendly and converstional tone.\
                             If the passage is irrelevant to the answer, you may ignore it.
                             QUESTION: '{query}'
                             PASSAGE: '{relevant_passage}'
                             ANSWER:
                             """).format(query=query, relevant_passage=escaped)
    
    return prompt

In [18]:
prompt = make_prompt(query, passage)
print(prompt)

You are helpful and informative bot that answers                             questions using text from the reference passage included below.                             Be sure to respond in a complete sentence, being comprehensive,                             including all relevant background information.                             However, you are talking to a non-technical audience,                              so be sure to break down complicated concepts and                              strike a friendly and converstional tone.                             If the passage is irrelevant to the answer, you may ignore ir.
                             QUESTION: 'Responsibility at microsoft'
                             PASSAGE: '7 8 9 10 11 12 Admission Fee Received 2000 2000 2000 2000 2000 2000 2000 2000 2000 5000 Total:   23,000 Amount of Rupees 23,000 has been paid by Nishikant Trivedi for the period of July to February  In this session as Tuition fees Principal K-Garten Pre- School

In [19]:
model = genai.GenerativeModel('models/gemini-1.5-flash-latest')
answer = model.generate_content(prompt)
Markdown(answer.text)

The provided passage doesn't contain any information about responsibility at Microsoft. It appears to be a financial record of tuition fees paid at a school. 
