In [13]:
import hashlib
import time

def generate_document_id(doc, index=None):
    # Check if the required keys exist in the document and handle empty fields
    course = doc.get('book_name', doc.get('title', 'unknown_document'))  # Bible verse or YouTube video title
    chapter_or_author = doc.get('chapter', doc.get('author', 'unknown'))  # Chapter for Bible or author for YouTube
    text = doc.get('text', '')  # Text for either document
    timestamp = str(time.time())  # Add current timestamp for uniqueness
    
    # Use the first 10 characters of text, safely handling shorter strings
    text_preview = text[:10] if len(text) >= 10 else text
    
    # Combine fields (with index and timestamp for extra uniqueness)
    combined = f"{course}-{chapter_or_author}-{text_preview}-{index}-{timestamp}"
    
    # Generate the MD5 hash from the combined string
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()

    # Extract the first 8 characters of the hash to use as the document ID
    document_id = hash_hex[:8]

    return document_id

In [14]:
import json
with open("document.json", 'r') as f_in:
    documents = json.load(f_in)

In [15]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [16]:
documents[1]

{'book_name': 'Genesis',
 'book': 1,
 'chapter': 1,
 'verse': 2,
 'text': 'And the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.',
 'id': 'e0b90de2'}

In [17]:
from collections import defaultdict

In [18]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [19]:
len(hashes), len(documents)

(36755, 36755)

In [21]:
# to check if all the ids are unique
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [23]:
with open('documents_with_ids.json', 'w') as f_out:
    json.dump(documents, f_out, indent=4)

In [63]:
prompt_template = f"""
You emulate a person engaging in Bible study.
Formulate 2 insightful questions this person might ask based on the provided Bible passage and relevant video transcript. 
The questions should encourage deeper understanding of the Bible passage, its teachings, and their application in life situations, incorporating insights from the video content where applicable. 
Each question should be complete and distinct from the provided text or video transcript.

The records:

Bible Passage:
Book: {{book_name}}
Chapter: {{chapter}}
Verse: {{verse}}
Text: {{text}}

Video:
Title: {{title}}
Author: {{author}}
Published Date: {{publish_date}}
Transcript: {{video_text}}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [64]:
from openai import OpenAI
client = OpenAI()

In [55]:
from tqdm.auto import tqdm

results = {}

In [65]:
def generate_questions(doc):
    # Prepare a dictionary with default values for safe formatting
    prompt_data = {
        "section": doc.get("section", "General"),
        "question": doc.get("question", "What would you like to know?"),
        "text": doc.get("text", "No content available."),
        "book_name": doc.get("book_name", "Unknown Book"),
        "chapter": doc.get("chapter", "Unknown Chapter"),
        "verse": doc.get("verse", "Unknown Verse"),
        "title": doc.get("title", "No title provided."),
        "author": doc.get("author", "Unknown Author"),
        "publish_date": doc.get("publish_date", "Unknown Date"),
        "video_text": doc.get("video_text", "No video transcript provided.")
    }

    # Format the prompt using the prepared dictionary
    prompt = prompt_template.format(**prompt_data)

    # Call the model to get the response
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    # Extract the JSON response
    json_response = response.choices[0].message.content
    return json_response


In [66]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/36755 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [67]:
results

{'512fda27': '[\n    "What does the phrase \'In the beginning\' signify about time and existence in relation to God as the creator?",\n    "How does the creation of \'heaven and earth\' in Genesis 1:1 establish God\'s sovereignty and authority over all creation?",\n    "In what ways can we reflect on our daily lives to recognize and acknowledge God\'s creative power as depicted in this verse?",\n    "How does understanding the context of this verse inform our beliefs about the relationship between faith and science regarding the origins of the universe?",\n    "What implications does Genesis 1:1 have for our understanding of purpose and meaning in life, as suggested by the patterns of creation that follow in the chapter?"\n]',
 'e0b90de2': '[\n    "How does the description of the earth as \'without form and void\' in Genesis 1:2 reflect the concept of chaos and the need for divine order in our lives today?",\n    "In what ways can we interpret the movement of the \'Spirit of God\' as a

In [72]:
len(results)

1322

In [71]:
with open('/root/practice/logos/data/video_data.json', 'r') as f:
    video_json = json.load(f)

In [74]:
video_json[0]

{'video_id': 'nRTy2olkOOQ',
 'title': 'Basileia = Kingdom 🏰 (which includes people, land, and king or queen who rules)',
 'publish_date': '2024-10-03T00:00:00',
 'author': 'BibleProject',
 'text': "basala may your basala come may your will be done on the land as it is in the skies basala means Kingdom which includes a people a land and a king or queen who rules in the Bible God is the cosmic King and all creation is his kingdom his basa God also appoints humans as co-rulers but when they refuse to lead with God's wisdom they always spiral into corruption violence and death when Jesus says the bosela of God is near he's declaring that God's kingdom is here to challenge all impostor kingdoms offering hope for everyone suffering under the brutality and greed of human Empires Jesus proclaims that God's basala is breaking in as a kingdom of Justice empowered by love rather than experiencing corruption and death its citizens find true peace and the good life the Bosa of the world has become 

In [92]:
for doc in tqdm(video_json): 
    doc_id = doc['video_id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

100%|██████████| 5653/5653 [26:59<00:00,  3.49it/s]  


In [93]:
import pickle

with open('results2.bin', 'wb') as f_in:
    pickle.dump(results, f_in)

In [78]:
import pickle

with open('results.bin', 'wb') as f_in:
    pickle.dump(results, f_in)

In [95]:
import pickle

# Load results from both pickle files
with open('results.bin', 'rb') as f:
    results = pickle.load(f)

with open('results2.bin', 'rb') as f:
    results2 = pickle.load(f)

# If both are dictionaries, you can merge them
combined_results = {**results, **results2}

# Save the combined results to a new file
with open('combined_results.bin', 'wb') as f:
    pickle.dump(combined_results, f)

print("Pickle files combined successfully! 🎉")

Pickle files combined successfully! 🎉


In [96]:
parsed_result = {}

for doc_id, json_questions in combined_results.items():
    parsed_result[doc_id] = json.loads(json_questions)

In [97]:
doc_index = {d['id']: d for d in documents}

In [99]:
doc_index['512fda27']

{'book_name': 'Genesis',
 'book': 1,
 'chapter': 1,
 'verse': 1,
 'text': '¶ In the beginning God created the heaven and the earth.',
 'id': '512fda27'}

In [100]:
final_results = []

for doc_id, questions in parsed_result.items():
    for q in questions:
        final_results.append((q, doc_id))

In [103]:
final_results

[("How does the concept of 'creation' in Genesis 1:1 shape our understanding of God's nature and His relationship with the universe, and what implications does this have for our daily lives and our view of the world around us?",
  '512fda27'),
 ("Considering the phrase 'In the beginning,' how can we apply the foundational idea of beginnings in our personal journeys of faith, particularly in moments of transition or new challenges?",
  '512fda27'),
 ("How does the imagery of the earth being 'without form and void' inform our understanding of God's creative process, and what might this tell us about the importance of order and purpose in our own lives?",
  'e0b90de2'),
 ("In what ways can we draw parallels between the 'darkness' mentioned in Genesis 1:2 and the challenges or uncertainties we face today, and how can we seek to invite the 'Spirit of God' into those situations for guidance and clarity?",
  'e0b90de2'),
 ('questions', 'f57f830a'),
 ("How does the distinction made between lig

In [101]:
import pandas as pd

In [104]:
df = pd.DataFrame(final_results, columns=['question', 'document'])

In [105]:
df.to_csv('questions.csv', index=False)

In [106]:
!head questions.csv

question,document
"How does the concept of 'creation' in Genesis 1:1 shape our understanding of God's nature and His relationship with the universe, and what implications does this have for our daily lives and our view of the world around us?",512fda27
"Considering the phrase 'In the beginning,' how can we apply the foundational idea of beginnings in our personal journeys of faith, particularly in moments of transition or new challenges?",512fda27
"How does the imagery of the earth being 'without form and void' inform our understanding of God's creative process, and what might this tell us about the importance of order and purpose in our own lives?",e0b90de2
"In what ways can we draw parallels between the 'darkness' mentioned in Genesis 1:2 and the challenges or uncertainties we face today, and how can we seek to invite the 'Spirit of God' into those situations for guidance and clarity?",e0b90de2
questions,f57f830a
"How does the distinction made between light and darkness in Genesis 1: