In [None]:
import os
import json
import psycopg2
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import PGVector

In [None]:
# Read the source PDF and split it into documents via the loader's load_and_split()
PDF_PATH = "Atomic habits.pdf"
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()
print("PDF loaded successfully!")

In [None]:
# Splitter producing chunks of at most 1000 characters (measured with len),
# with 100 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=1000,
)

# Translation table that turns every tab and newline into a single space.
_WHITESPACE_TO_SPACE = str.maketrans("\t\n", "  ")

# Normalise each page (tabs/newlines flattened, trimmed, lower-cased), chunk it,
# and gather the chunks from all pages into one flat list.
texts = [
    chunk
    for page in pages
    for chunk in text_splitter.create_documents(
        [page.page_content.translate(_WHITESPACE_TO_SPACE).strip().lower()]
    )
]

print("Text split successfully!")

In [None]:
# Instantiate the Instructor embedding model, configured to run on the CPU
EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl"
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=dict(device="cpu"),
)
print("HuggingFaceInstructEmbeddings loaded successfully!")

In [None]:
# Save Embeddings to File
# NOTE: Embeddings are written to disk so that embedding generation can be
# validated and the vectors can be reused by later cells.
output_dir = 'embeddings_output'
# exist_ok=True creates the directory only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(output_dir, exist_ok=True)
embeddings_file = os.path.join(output_dir, 'embeddings.txt')

# The file holds one JSON object per line. The keys (including their trailing
# " - ") are read back verbatim by the cells that load this file, so they must
# not be changed here alone.
with open(embeddings_file, 'w', encoding='utf-8') as file:
    for idx, text_chunk in enumerate(texts, start=1):
        # embed_documents takes a list and returns a list; one chunk in, one vector out.
        embedding = instructor_embeddings.embed_documents([text_chunk.page_content])[0]
        embedding_data = {
            "id - ": idx,
            "text - ": text_chunk.page_content,
            "embedding - ": embedding
        }
        file.write(json.dumps(embedding_data) + '\n')

print(f"Embeddings saved successfully in {embeddings_file}")

In [None]:
# Connect to PostgreSQL and Create Tables
# NOTE: If PGVector installation issues persist, store embeddings directly in PostgreSQL using JSON format.
#
# Connection settings may be overridden with the PGDATABASE / PGUSER / PGPASSWORD /
# PGHOST / PGPORT environment variables so real credentials never need to live in
# the notebook. The fallbacks are the notebook's original local-development values.
# Keep these settings in sync with the PGVector connection string defined further down.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "Sample_DataBase"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "root"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)
cursor = conn.cursor()

# Create table for JSON storage
create_table_query = """
CREATE TABLE IF NOT EXISTS embeddings (
    id SERIAL PRIMARY KEY,
    text TEXT NOT NULL,
    embedding JSONB NOT NULL
);
"""
cursor.execute(create_table_query)
conn.commit()

# Insert Embeddings into PostgreSQL
# The statement is the same for every row, so it is built once outside the loop.
# NOTE: the table has no uniqueness constraint on text, so re-running this cell
# inserts the same rows again.
insert_query = """
INSERT INTO embeddings (text, embedding)
VALUES (%s, %s);
"""
try:
    with open(embeddings_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing in json.loads.
                continue
            embedding_data = json.loads(line)
            text = embedding_data["text - "]
            embedding = embedding_data["embedding - "]
            cursor.execute(insert_query, (text, json.dumps(embedding)))
    # A single commit makes the load all-or-nothing and avoids one round-trip
    # per row.
    conn.commit()
except Exception:
    # Leave the shared connection usable for later cells: without a rollback,
    # every following statement would fail with "current transaction is aborted".
    conn.rollback()
    raise

print("Embeddings inserted into PostgreSQL successfully!")

In [None]:
# Create Table for PGVector
# NOTE: The PGVector table is used for vector-based storage if PGVector installation issues are resolved.
# The `vector` column type only exists once the pgvector extension is enabled in
# this database, so enable it first (a no-op when it is already enabled).
create_extension_query = "CREATE EXTENSION IF NOT EXISTS vector;"
# NOTE(review): 768 must equal the embedding model's output dimension —
# presumably 768 for hkunlp/instructor-xl; confirm if the model changes.
create_table_query_for_pgvector = """
CREATE TABLE IF NOT EXISTS embeddings_using_pgvector (
    id SERIAL PRIMARY KEY,
    text TEXT NOT NULL,
    embedding vector(768)
);
"""
try:
    cursor.execute(create_extension_query)
    cursor.execute(create_table_query_for_pgvector)
    conn.commit()
except psycopg2.Error:
    # A failed statement aborts the open transaction; roll back so the shared
    # connection stays usable by later cells, then surface the original error.
    conn.rollback()
    raise

In [None]:
# User-Defined Document Class
# NOTE: Created to handle embedding data for PGVector, as it requires specific document structures.
class Document:
    """Lightweight container pairing a text chunk with its precomputed embedding.

    Attributes:
        page_content: The chunk text, exactly as passed in ``text``.
        metadata: A dict with the single key ``"embedding"`` mapped to ``embedding``.
    """

    def __init__(self, text, embedding):
        self.metadata = dict(embedding=embedding)
        self.page_content = text

In [None]:
# Load Embeddings from File and Create PGVector Documents
# Each line of the file is one JSON object holding a chunk's text and embedding.
docs = []
with open(embeddings_file, 'r', encoding='utf-8') as file:
    # enumerate supplies the 1-based progress counter, so no manually
    # incremented global is needed.
    for doc_number, line in enumerate(file, start=1):
        embedding_data = json.loads(line)
        docs.append(Document(embedding_data["text - "], embedding_data["embedding - "]))
        print(f"No - {doc_number}")

In [None]:
# Initialize PGVector
COLLECTION_NAME = "Atomic Habits PDF"
# Connection settings may be overridden with the PGHOST / PGPORT / PGDATABASE /
# PGUSER / PGPASSWORD environment variables so real credentials never need to live
# in the notebook. The fallbacks are the notebook's original local-development values.
# Keep these settings in sync with the psycopg2 connection opened earlier.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    host=os.environ.get("PGHOST", "localhost"),
    # Environment values are strings; the port is passed as an int as before.
    port=int(os.environ.get("PGPORT", "5432")),
    database=os.environ.get("PGDATABASE", "Sample_DataBase"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "root"),
)

In [None]:
# Create PGVector Store
# NOTE: PGVector requires the documents list and embeddings to be set up properly.
# Every entry in `docs` already carries its precomputed vector in
# metadata["embedding"], so the store is built with from_embeddings instead of
# from_documents: the latter would ignore those vectors and re-embed every chunk
# with the model. The same metadata dicts are still passed, so what is stored
# per document is unchanged.
db = PGVector.from_embeddings(
    text_embeddings=[(doc.page_content, doc.metadata["embedding"]) for doc in docs],
    # Still required: the store uses it to embed queries at search time.
    embedding=instructor_embeddings,
    metadatas=[doc.metadata for doc in docs],
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    # Drop any existing collection of this name so re-runs do not duplicate rows.
    pre_delete_collection=True,
)

# Initialize PGVector for Querying
pgvector_docsearch = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=instructor_embeddings,
)

def run_query_pgvector(docsearch, query, top_k=4):
    """Return the text of the stored chunks that best match ``query``.

    Args:
        docsearch: Object exposing ``similarity_search(query, k=...)``.
        query: The search text.
        top_k: Number of matches to request (default 4).

    Returns:
        list: The ``page_content`` of each match, in the order the search
        returned them.
    """
    return [hit.page_content for hit in docsearch.similarity_search(query, k=top_k)]

In [None]:
# Ask the vector store a sample question and print each hit, numbered from 1
query = "How to build better habits?"
results = run_query_pgvector(pgvector_docsearch, query)
for rank, passage in enumerate(results, start=1):
    print(f"Result {rank}: {passage}\n")

In [None]:
# Fetch Data from PostgreSQL by ID
user_input_id = input("Enter the ID of the embedding you want to fetch: ")

try:
    try:
        # The id column is an integer; validate here rather than letting the
        # database reject arbitrary text with a type error.
        embedding_id = int(user_input_id.strip())
    except ValueError:
        print(f"Invalid ID {user_input_id!r}: please enter a whole number.")
    else:
        fetch_query = "SELECT text, embedding FROM embeddings WHERE id = %s;"
        cursor.execute(fetch_query, (embedding_id,))

        result = cursor.fetchone()

        if result:
            text, embedding = result
            print(f"Text: {text}")
            print(f"Embedding: {embedding}")
        else:
            print(f"No data found for ID {user_input_id}")
finally:
    # Always release the cursor and connection, even if the lookup failed.
    cursor.close()
    conn.close()
    print("Cursor successfully closed!")