In [1]:
import os
import json
import base64
import openai
from openai import OpenAI
from dotenv import find_dotenv, load_dotenv
from uuid import uuid4
from random import randint
from utils.get_openai_api_key import get_test_key
from utils.get_postgres_connection import _conn_open
from utils.load_json import load_json
from rich.console import Console
import psycopg2

In [2]:
print("Connecting to PostgreSQL...")
# Connection settings are taken from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST) when they are set, so real credentials
# never need to be written into the notebook. The previous local-development
# values remain as fallbacks, so an unconfigured environment behaves as before.
conn = psycopg2.connect(
    database=os.getenv("PGDATABASE", "postgres"),
    user=os.getenv("PGUSER", "postgres"),
    password=os.getenv("PGPASSWORD", "postgres"),
    host=os.getenv("PGHOST", "localhost"),
)

# Single cursor shared by the cells below.
cur = conn.cursor()

print("Successfully connected to PostgreSQL.")

Connecting to PostgreSQL...
Successfully connected to PostgreSQL.


In [3]:
# Shared rich console used by later cells for styled output.
console = Console()

In [4]:
# Load variables from the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())
# Prefer the key from the environment; only an absent variable (None) triggers
# the get_test_key() fallback.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_KEY = get_test_key() if OPENAI_API_KEY is None else OPENAI_API_KEY
# Also expose the key through the openai module-level setting.
openai.api_key = OPENAI_API_KEY

In [5]:
# Hand the resolved key to the client explicitly. OpenAI() with no arguments
# reads only the OPENAI_API_KEY environment variable, so a key obtained from the
# get_test_key() fallback would otherwise never reach this client.
client = OpenAI(api_key=OPENAI_API_KEY)
# Choose a model
# NOTE(review): get_embedding() defaults to "text-embedding-3-small" and this
# variable is not passed to it in the visible code — confirm which model is intended.
model = "text-embedding-ada-002"
# Shows only the first 12 characters of the key as a sanity check; clear this
# output before sharing the notebook.
console.print(f"[dark_orange bold]{OPENAI_API_KEY[:12]}[/]")

In [6]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` through the module-level OpenAI ``client``.

    Newline characters in ``text`` are replaced with single spaces before the
    request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first entry in the response's ``data``.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[flattened], model=model)
    first_entry = response.data[0]
    return first_entry.embedding

In [7]:
def decode_image(encoded_image_data, image_fimename):
    """Decode base64 image data and write the bytes to a file in the working directory.

    Args:
        encoded_image_data: Base64-encoded payload (anything ``base64.b64decode`` accepts).
        image_fimename: Name of the output file; it is created as ``./<name>``.

    Returns:
        None. The decoded bytes are written to disk as a side effect.
    """
    # Decode before opening the file so a bad payload does not leave an empty file behind.
    raw_bytes = base64.b64decode(encoded_image_data)
    target_path = "./{}".format(image_fimename)
    with open(target_path, "wb") as out_file:
        out_file.write(raw_bytes)

In [8]:
# Default JSON file to process; also used as the default `filename` argument of embed_and_load.
FILENAME = "meetup.json"

In [9]:
def embed_and_load(filename=FILENAME, dry_run=True):
    """Embed every element of a JSON document export and stage it for insertion.

    Loads ``filename`` with ``load_json``, requests an embedding for each
    element's text via ``get_embedding``, and builds one parameterized
    ``INSERT`` into ``tbl_doc_elements`` per element. All rows produced by a
    single call share one freshly generated ``file_id``.

    Args:
        filename: Path passed to ``load_json``. Defaults to the module-level
            ``FILENAME`` as it was when this function was defined.
        dry_run: When True (the default, matching the previous behaviour of
            this function) each statement is only rendered and printed and
            nothing is written to the database. When False the statement is
            executed on the module-level cursor and committed row by row.

    Returns:
        None.

    Raises:
        KeyError: If an element lacks ``element_id``, ``type`` or ``text``, or
            an Image/Table element lacks ``metadata["image_base64"]``.
    """
    json_data = load_json(filename)
    print(f"Number of items: {len(json_data)}")
    file_id = str(uuid4())

    # Placeholders let psycopg2 do the quoting, so apostrophes (or anything
    # else) in the element text can no longer break or alter the statement.
    insert_sql = """
            INSERT INTO tbl_doc_elements (
                file_id, element_id, element_text, element_type, parent_id, page_number, image_base64, embedding)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s)
            RETURNING id;
                """

    for i, element in enumerate(json_data):
        element_id = element["element_id"]
        raw_type = element["type"]
        element_type = raw_type.upper()
        # NOTE(review): parent_id and page_number are looked up at the top level
        # of each element; if the export nests them under "metadata" these
        # always fall back to "" — confirm against the JSON.
        parent_id = element.get("parent_id", "")
        page_number = element.get("page_number", "")

        element_text = element["text"]
        if element_text is not None:
            # Sent in its "[v1, v2, ...]" text form, i.e. str() of the returned list.
            element_embedding = str(get_embedding(element_text))
        else:
            element_embedding = ""

        if raw_type in ("Image", "Table"):
            console.print(
                f"[dark_orange]--------- {element_type} Item Number  {i + 1}--------[/]"
            )
            el_image_b64 = element["metadata"]["image_base64"]
        else:
            el_image_b64 = "NONE"

        params = (
            file_id,
            element_id,
            element_text,
            element_type,
            parent_id,
            page_number,
            el_image_b64,
            element_embedding,
        )

        try:
            # Render the statement exactly as it would be sent. markup=False
            # keeps rich from treating bracketed document text as style tags.
            rendered_sql = cur.mogrify(insert_sql, params).decode(errors="replace")
            console.print(rendered_sql, markup=False)
            if not dry_run:
                cur.execute(insert_sql, params)
                row_id = cur.fetchone()[0]
                console.print(f"{i} - ID is {row_id}\nFILE_ID: {file_id}")
                conn.commit()
        except psycopg2.Error as e:
            # Reset the failed transaction so the remaining elements can still
            # be processed; the error is reported and the loop continues.
            conn.rollback()
            console.print(e)

In [10]:
# JSON files to run through embed_and_load, in order.
files = ["meetup.json"]

In [11]:
# Run each configured JSON file through embed_and_load, announcing it before and after.
for file in files:
    print(file)
    # Rebind the module-level FILENAME to the file currently being processed.
    FILENAME = file
    embed_and_load(filename=file)
    print("file {} loaded".format(file))

meetup.json
Number of items: 20


file meetup.json loaded


In [12]:
# Release the cursor first, then the connection it was created from.
cur.close()
conn.close()