In [1]:
import fitz  # PyMuPDF
import pandas as pd
import google.generativeai as genai
import google.ai.generativelanguage as glm
import API_KEY
import textwrap
import numpy as np
import time

In [2]:
# Open the PDF file
pdf_path = '175_choice_recipes_mainly_furnished_by_members_of_the_chicago_womens_club-1887.pdf'
doc = fitz.open(pdf_path)

# Initialize an empty list to store page text
data = []

# Extract text from each page
for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    text = page.get_text()
    data.append({"Page": page_num + 1, "Text": text})

# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())  # Show the first few rows to verify

   Page                                               Text
0     1                                              ^'^\n
1     2                                                   
2     3  175\nChoice Recipes\nMAINLY FURNISHED\nBY MEMB...
3     4  y^-k^\nCopyrighted by\nCHARLES H. KERR & COMPA...
4     5  CONTENTS.\nPAGE\nBrown Bread\n5\nBreakfast Dis...


In [3]:
df.iloc[79]

Page                                                   80
Text    KINDLINGmMAPLE.\nThe Provident Wood Yard\nof t...
Name: 79, dtype: object

In [4]:
# removing irrelvant pages
df_clean = df.iloc[6:79]
df_clean

Unnamed: 0,Page,Text
6,7,175 CHOICE RECIPES.\nBROWN BREAD.\nSteamed Bro...
7,8,6\n175 CHOICE RECIPES.\nSteamed Brown Bread.\n...
8,9,175 CHOICE RECIPES.\nBREAKFAST\nDISHES.\nCream...
9,10,175 CHOICE RECIPES.\nPop-Overs.\n4 cups of flo...
10,11,175 CHOICE RECIPES.\n9\n6 ounces of grated rye...
...,...,...
74,75,175 (^HOTCE RECIPES.\n7S\nSpiced Grapes.\n7 po...
75,76,"74\n175 CHOICE RECIPES,\nMustard Pickles.\n2 q..."
76,77,175 CHOICE RECIPES.\n75\nJuniper Pickle.\nSoak...
77,78,76\n175 CHOICE RECIPES.\nBEVERAGES.\nCream Nec...


In [5]:
genai.configure(api_key=API_KEY.api_key().GOOGLE_API_KEY)

In [6]:
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

models/embedding-001


In [7]:
def get_embeddings(text):
    return genai.embed_content(model='models/embedding-001',content=text,task_type="RETRIEVAL_QUERY")['embedding']

In [8]:
sample_text = df_clean.iloc[0,1]
print(sample_text)

175 CHOICE RECIPES.
BROWN BREAD.
Steamed Brown Bread.
2
full cups of Indian meal.
3 level cups of rye meal.
I cup of molasses.
I teaspoonful of soda, dissolved in a very little boiling
water.
I quart of milk.
Salt.
Steam four hours.
Mrs. J.
P. Odell.
Graham Bread.
I quart of sour milk.
Soda enough to make
it foam.
I tablespoonful of melted butter.
Salt to taste.
Stir
in graham
flour enough
to make
a thick paste.
This bread
is good
as an
occasional and quickly-baked
loaf.
Flokenck
R. Hartlett.



In [9]:
print(get_embeddings(sample_text))

[-0.02025935, -0.032381907, -0.057473756, -0.029505618, 0.02799025, 0.040712018, -0.0052619847, -0.032409117, 0.026341295, 0.024990464, 0.07154737, 0.0019295504, -0.016857728, -0.0042297677, -0.008394372, 0.008195285, 0.016911259, 0.03738446, -0.017118836, -0.06295037, 0.0106583545, -0.013271434, -0.02340823, -0.02072605, 0.024110608, -0.0027711052, 0.03473185, -0.06739694, -0.04023975, 0.013984203, -0.03605295, 0.014872252, -0.034574084, 0.0077950596, -0.04884154, -0.035041396, -0.008565796, 0.03772345, 0.0034543674, 0.1062395, 0.019663414, 0.020263962, -0.0323412, -0.0108621735, 0.029143639, -0.051025074, -0.019989027, 0.034401268, 0.0037355176, -0.06402009, -0.020602796, -0.013328896, 0.019463617, -0.02012168, 0.0018932369, -0.05494664, 0.059008643, 0.004491697, -0.00690153, 0.026107453, -0.028358513, 0.04373793, 0.033090327, 0.032528207, -0.035760053, -0.0779643, -0.10465884, 0.017219089, 0.108090416, 0.015638666, -0.010602688, -0.064414166, 0.061994605, 0.020174522, -0.030958228, 

In [10]:
# cleaning text
cleaned_txt = []
for i in range(0,df_clean.shape[0]):
    gen_model = genai.GenerativeModel('models/gemini-pro')
    prompt = '''
    The data given to you is old recipes.
    I am planning to clean this text for using as input into LLM. Assume you are an excellent data cleaner and formatter.
    If you find any special characters that doesnt represent text please remove it.
    Please fill any text that you think is missing or incorrect or mispelled etc., and add or modify it.
    But I want you to keep the text I provide as much as possible and format it with markdown so that its easy to make embeddings.
    The text is: 
    ''' + df_clean.iloc[i,1]
    answer = gen_model.generate_content(prompt)
    try:
        cleaned_txt.append(answer.text)
        print("Completed:",i)
        time.sleep(0.8)
    except:
        print("Missed:",i)
        cleaned_txt.append(df_clean.iloc[i,1])

Completed: 0
Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Missed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18
Completed: 19
Completed: 20
Completed: 21
Completed: 22
Completed: 23
Completed: 24
Completed: 25
Completed: 26
Completed: 27
Completed: 28
Missed: 29
Completed: 30
Completed: 31
Completed: 32
Completed: 33
Completed: 34
Completed: 35
Completed: 36
Completed: 37
Completed: 38
Completed: 39
Completed: 40
Completed: 41
Completed: 42
Completed: 43
Completed: 44
Missed: 45
Completed: 46
Completed: 47
Completed: 48
Missed: 49
Completed: 50
Completed: 51
Completed: 52
Completed: 53
Completed: 54
Completed: 55
Completed: 56
Completed: 57
Completed: 58
Completed: 59
Completed: 60
Completed: 61
Completed: 62
Completed: 63
Completed: 64
Completed: 65
Completed: 66
Completed: 67
Completed: 68
Completed: 69
Completed: 70
Completed: 71
Missed: 72


In [11]:
df_clean['cleaned_text'] = np.array(cleaned_txt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['cleaned_text'] = np.array(cleaned_txt)


In [12]:
df_clean

Unnamed: 0,Page,Text,cleaned_text
6,7,175 CHOICE RECIPES.\nBROWN BREAD.\nSteamed Bro...,## 175 CHOICE RECIPES\n\n### Brown Bread\n\n**...
7,8,6\n175 CHOICE RECIPES.\nSteamed Brown Bread.\n...,**Steamed Brown Bread**\n\n- 1 cup flour\n- 1/...
8,9,175 CHOICE RECIPES.\nBREAKFAST\nDISHES.\nCream...,## Breakfast Dishes\n\n### Cream Johnnycake\n\...
9,10,175 CHOICE RECIPES.\nPop-Overs.\n4 cups of flo...,## 175 CHOICE RECIPES\n\n### Pop-Overs\n\n- 4 ...
10,11,175 CHOICE RECIPES.\n9\n6 ounces of grated rye...,# 175 CHOICE RECIPES\n\n## 6 ounces of grated ...
...,...,...,...
74,75,175 (^HOTCE RECIPES.\n7S\nSpiced Grapes.\n7 po...,**Spiced Grapes**\n\n* 7 pounds grapes\n* 1 pi...
75,76,"74\n175 CHOICE RECIPES,\nMustard Pickles.\n2 q...",### Mustard Pickles\n\n**Ingredients:**\n- 2 q...
76,77,175 CHOICE RECIPES.\n75\nJuniper Pickle.\nSoak...,## Juniper Pickle\n\n**Ingredients:**\n\n- 300...
77,78,76\n175 CHOICE RECIPES.\nBEVERAGES.\nCream Nec...,**Beverages**\n\n**Cream Nectar**\n\n- 2 ounce...


In [13]:
pd.DataFrame(df_clean['cleaned_text'].str.len()).describe()

Unnamed: 0,cleaned_text
count,73.0
mean,1155.794521
std,383.043548
min,9.0
25%,980.0
50%,1165.0
75%,1329.0
max,3014.0


In [16]:
df_clean.to_json("cleaned_txt.json")

In [14]:
whole_txt = ''
for i in range(0,df_clean.shape[0]):
    whole_txt = whole_txt + df_clean.iloc[i,2]

In [17]:
# embedding Database logic
# The logic is based on a rolling window with a width of batch_size.
# The step size of rolling is with a step_factor that is step_size = batch_size / step_factor
# Combination of both step and batch shall constitute rolling.
embeddings_db = []
total_chars = len(whole_txt)
batch_size = 1000
step_factor = 8
step_size = batch_size // step_factor
start_pos = 0
while start_pos < total_chars:
    end_pos = start_pos + batch_size
    end_pos = min(end_pos,total_chars)
    text = whole_txt[start_pos:end_pos]
    start_pos = start_pos + step_size
    gen_model = genai.GenerativeModel('models/gemini-pro')
    prompt = '''
    The text provides contains recipes.
    Please provide the a 2 sentence summarization of this text.
    Also list out only the recipe names you find in this text.
    The whole output should not be more than 5 sentences.
    The text is: 
    ''' + text
    answer = gen_model.generate_content(prompt)
    try:
        embeddings_db.append([answer.text,text,get_embeddings(answer.text + " " + text)])
        print(np.round(start_pos / total_chars * 100,3),'%',end="\r")
        time.sleep(1.0)
    except:
        print("Missed:",start_pos)
        print(np.round(start_pos / total_chars * 100,3),'%')
        embeddings_db.append(['',text,get_embeddings(text)])


Missed: 8875
Missed: 21750
100.002 %

In [18]:
embeddings_df = pd.DataFrame(embeddings_db,columns = ['summ_txt','text','embeddings'])
embeddings_df

Unnamed: 0,summ_txt,text,embeddings
0,This text provides 4 recipes: Steamed Brown Br...,## 175 CHOICE RECIPES\n\n### Brown Bread\n\n**...,"[-0.033366084, -0.048817173, -0.05177034, -0.0..."
1,**Summary:** This text provides recipes for th...,cup of molasses\n- 1 teaspoonful of baking sod...,"[-0.026710054, -0.042288337, -0.040157776, -0...."
2,"This text contains recipes for Graham Bread, S...",four hours.\n\n- Mrs. J. P. Odell\n\n**Graham ...,"[-0.025552925, -0.029467182, -0.03180367, -0.0..."
3,**Summary:**\nThis text provides two bread rec...,melted butter\n- Salt to taste\n\nStir in grah...,"[-0.016907776, -0.036389567, -0.04036849, -0.0..."
4,**Summary:**\nThis text provides two recipes f...,ckly-baked loaf.\n\n- Flokenck R. Hartlett**St...,"[-0.029654216, -0.0408084, -0.05168476, -0.025..."
...,...,...,...
670,This text provides a recipe for healthy living...,"fee are\ndiscarded from my\n' bill\nof\nfare,'...","[-0.011222291, -0.040570628, -0.007511709, -0...."
671,**Summary:**\nThis text does not contain any r...,".\nEntire wheat\nflour\nbread, vegetables,\nfr...","[-0.012315712, -0.018118693, 0.0041092755, -0...."
672,"The text contains a recipe for ""Plain living a...","embic\nof\nthe\ndigestive organs\ninto pure,\n...","[-0.01643079, -0.034384463, -0.017329212, -0.0..."
673,The provided text does not contain any recipes...,thoughts\nafter Him\n'\nas\nthey have never y...,"[-0.003992342, -0.026472228, -0.0055747554, -0..."


In [19]:
embeddings_df.to_json('book_embeddings.json')

In [20]:
stop here

SyntaxError: invalid syntax (4067800170.py, line 1)

In [21]:
def do_dot(x,y):
    return np.dot(x,y)

In [24]:
def get_best_match(text,best_num):
    embedding_Q = genai.embed_content(model='models/embedding-001',
                                content=text,
                                task_type="RETRIEVAL_QUERY")['embedding']
    e_df = embeddings_df.copy()
    e_df['score'] = embeddings_df.apply(lambda x:do_dot(x['embeddings'],embedding_Q),axis=1)
    e_df.sort_values(by=['score'],ascending=False,inplace=True)
    relevant_text = ''
    for i in range(0,best_num):
        relevant_text = relevant_text + str(e_df.iloc[i]['text'] + e_df.iloc[i]['summ_txt'])
    return relevant_text

In [26]:
print(get_best_match('cake',3))

baking powder

**Instructions:**

1. Preheat oven to 350°F (175°C).
2. Grease and flour two 9-inch round cake pans.
3. In a large bowl, beat the egg whites until stiff peaks form. Gradually add the granulated sugar, beating until the mixture is glossy and stiff.
4. In a separate bowl, cream together the butter and sugar until light and fluffy.
5. Add the egg yolks one at a time, beating well after each addition.
6. Alternately add the dry ingredients and the milk to the batter, beginning and ending with the dry ingredients.
7. Fold in the stiffly beaten egg whites.
8. Divide the batter between the prepared cake pans and bake for 30-35 minutes, or until a toothpick inserted into the center comes out clean.
9. Let cool completely in the pans before assembling the cake.

**Filling:**

- 1 cup of granulated sugar
- 2 tablespoonfuls of water

**Instructions:**

1. In a small saucepan, combine the sugar and water.
2. Bring the mixture to a boil over medium heat and cook for 2-3 minutes, or u

In [50]:
def make_prompt(query, relevant_text):
  prompt = '''
    You are an informative bot and please use the relevant text provided and answer the question.
    Please provide only the recipe. The question is ''' + query + ''' The relevant text is ''' + relevant_text
  return prompt

In [51]:
query = "i want recipe of pizza"
prompt = make_prompt(query=query,relevant_text=get_best_match(query,3))
print(prompt)


    You are an informative bot and please use the relevant text provided and answer the question.
    Please provide only the recipe. The question is i want recipe of pizza The relevant text is  thoughts
after Him
'
as
they have never yet been thought.
This
is my recipe
:
'Plain living and high thinking^ and this is my warn-
ing:
With high
living yoti
will get exceedingly plain
thinking.
Yours for stomachic rights,
''Frances
E. Willard."
The provided text does not contain any recipes.

**Output:**

This text does not contain any recipes.hem,
stir
well and
let stand
for
24
hours, press out the juice and strain, and let stand over
night,
pour
the
juice
off from
the
sediment and
to
every quart of juice add
i i^ pounds sugar, boil up, let
cool and bottle.
" I have formed a settled conviction
that
the world
is fed too much.
Pastries, cakes, hot bread, rich gra-
vies,
pickles, pepper sauces, salads, tea and coffee are
discarded from my
' bill
of
fare,' and I firmly believe
that they will be

In [52]:
gen_model = genai.GenerativeModel('gemini-1.0-pro')
answer = gen_model.generate_content(prompt)
print(answer.text)

This text does not contain any recipes. Therefore, I cannot provide a recipe for pizza.


In [53]:
query = "i want recipe of cake"
prompt = make_prompt(query=query,relevant_text=get_best_match(query,3))
print(prompt)


    You are an informative bot and please use the relevant text provided and answer the question.
    Please provide only the recipe. The question is i want recipe of cake The relevant text is arge bowl, beat the egg whites until stiff peaks form. Gradually add the granulated sugar, beating until the mixture is glossy and stiff.
4. In a separate bowl, cream together the butter and sugar until light and fluffy.
5. Add the egg yolks one at a time, beating well after each addition.
6. Alternately add the dry ingredients and the milk to the batter, beginning and ending with the dry ingredients.
7. Fold in the stiffly beaten egg whites.
8. Divide the batter between the prepared cake pans and bake for 30-35 minutes, or until a toothpick inserted into the center comes out clean.
9. Let cool completely in the pans before assembling the cake.

**Filling:**

- 1 cup of granulated sugar
- 2 tablespoonfuls of water

**Instructions:**

1. In a small saucepan, combine the sugar and water.
2. Bring 

In [62]:
query = "i want recipe of pudding"
prompt = make_prompt(query=query,relevant_text=get_best_match(query,3))
gen_model = genai.GenerativeModel('gemini-1.0-pro')
answer = gen_model.generate_content(prompt)
answer.text

'**Pudding**\n\nBake a sponge cake in a pan; when cool cut out the center, leaving only a thin shell, and fill with the following mixture:\n\n- 1 pint of milk\n- 2 eggs\n- 1/2 cup of flour\n- pinch of salt\n- 1 cup of sugar\n- 1 lemon juice and zest\n\nCook until thick. Pour into the shell, and before serving cover with whipped cream.\n\n- Mrs. Frank Johnson'

In [32]:
answer.text[8:-3].replace("\n","")

'<div>  <h2>Golden Cream Cake</h2>  <h3>Ingredients:</h3>  <ul>    <li>1 cup sugar</li>    <li>1/2 cup butter</li>    <li>1/2 cup sweet milk</li>    <li>1 1/2 cups flour</li>    <li>Whites of 3 eggs</li>  </ul>  <h3>Instructions:</h3>  <ol>    <li>Cream butter and sugar.</li>    <li>Add milk and flour.</li>    <li>Lastly, whisk in egg whites beaten to a froth.</li>    <li>Bake in 3 layers.</li>  </ol>  <h3>Cream Filling</h3>  <h4>Ingredients:</h4>  <ul>    <li>1 pint thick cream</li>    <li>Vanilla extract (to taste)</li>    <li>1/2 pound blanched and chopped almonds</li>  </ul>  <h4>Instructions:</h4>  <ol>    <li>Beat cream until it resembles ice cream.</li>    <li>Sweeten and flavor with vanilla extract.</li>    <li>Fold in chopped almonds.</li>    <li>Spread between cake layers.</li>  </ol></div>'

In [33]:
json_df = pd.read_json('book_embeddings.json')
json_df

Unnamed: 0,summ_txt,text,embeddings
0,This text provides 4 recipes: Steamed Brown Br...,## 175 CHOICE RECIPES\n\n### Brown Bread\n\n**...,"[-0.033366084000000004, -0.048817173000000005,..."
1,**Summary:** This text provides recipes for th...,cup of molasses\n- 1 teaspoonful of baking sod...,"[-0.026710054, -0.042288337, -0.040157776, -0...."
2,"This text contains recipes for Graham Bread, S...",four hours.\n\n- Mrs. J. P. Odell\n\n**Graham ...,"[-0.025552925, -0.029467182, -0.03180367, -0.0..."
3,**Summary:**\nThis text provides two bread rec...,melted butter\n- Salt to taste\n\nStir in grah...,"[-0.016907776, -0.036389567000000005, -0.04036..."
4,**Summary:**\nThis text provides two recipes f...,ckly-baked loaf.\n\n- Flokenck R. Hartlett**St...,"[-0.029654216, -0.0408084, -0.05168476, -0.025..."
...,...,...,...
670,This text provides a recipe for healthy living...,"fee are\ndiscarded from my\n' bill\nof\nfare,'...","[-0.011222291, -0.040570628000000004, -0.00751..."
671,**Summary:**\nThis text does not contain any r...,".\nEntire wheat\nflour\nbread, vegetables,\nfr...","[-0.012315712000000001, -0.018118693, 0.004109..."
672,"The text contains a recipe for ""Plain living a...","embic\nof\nthe\ndigestive organs\ninto pure,\n...","[-0.01643079, -0.034384463000000004, -0.017329..."
673,The provided text does not contain any recipes...,thoughts\nafter Him\n'\nas\nthey have never y...,"[-0.003992342, -0.026472228, -0.0055747554, -0..."
