In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Read the PDF as one Document per page.
# `load()` is used rather than `load_and_split()`: the latter first runs
# LangChain's default text splitter over the pages, and this notebook
# re-chunks every page with its own splitter in a later cell, so the
# extra pre-split would only chunk the text twice.
loader = PyPDFLoader("Atomic habits.pdf")
pages = loader.load()
print("PDF loaded successfully!")

PDF loaded successfully!


In [3]:
# Chunking parameters, gathered in one place so they are easy to tune:
# chunks of at most 1000 units (measured with `len`, i.e. characters),
# with 100 units shared between neighbouring chunks.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 100,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [4]:
# Maps tab and newline characters to a single space each.
whitespace_to_space = str.maketrans({'\t': ' ', '\n': ' '})

# For every loaded page: flatten tabs/newlines to spaces, trim the ends,
# lowercase the text, then split it into chunks. The chunks of all pages
# are collected, in page order, into one flat list.
texts = [
    chunk
    for page in pages
    for chunk in text_splitter.create_documents(
        [page.page_content.translate(whitespace_to_space).strip().lower()]
    )
]

print("Text split successfully!")

Text split successfully!


In [5]:
# Sanity check: display the cleaned, lowercased text of the first chunk in `texts`.
texts[0].page_content

'an imprint of penguin random house llc 375 hudson street new york, new york 10014 copyright © 2018 by james clear penguin supports copyright. copyright fuels creativity, encourages diverse voices, promotes free speech, and creates a vibrant culture. thank you for buying an authorized edition of this book and for complying with copyright laws by not reproducing, scanning, or distributing any part of it in any form without permission. you are supporting writers and allowing penguin to continue to publish books for every reader.'

In [6]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [7]:
# Embedding model configuration: the Instructor-XL checkpoint, run on CPU.
embedding_model_name = "hkunlp/instructor-xl"
embedding_model_kwargs = {"device": "cpu"}

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
)
print("HuggingFaceInstructEmbeddings loaded successfully!")

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512
HuggingFaceInstructEmbeddings loaded successfully!


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


In [11]:
import os
import json 
# Directory that receives the exported embeddings.
output_dir = 'embeddings_output'
# `exist_ok=True` creates the directory when missing and is a no-op when it
# already exists, avoiding the check-then-create race of a separate
# `os.path.exists` test and keeping the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [12]:
# Path of the export file inside the output directory.
embeddings_filename = 'embeddings.txt'
embeddings_file = os.path.join(output_dir, embeddings_filename)

In [13]:
# Embed every chunk and append it to the export file as one JSON object per
# line. Each record carries a 1-based running ID, the chunk text, and the
# embedding vector returned for that text.
with open(embeddings_file, 'w') as file:
    for chunk_id, text_chunk in enumerate(texts, start=1):
        chunk_text = text_chunk.page_content
        # `embed_documents` takes a list of texts and returns one vector per
        # text; a single-item list is passed, so take the only result.
        vectors = instructor_embeddings.embed_documents([chunk_text])
        record = {
            "id - ": chunk_id,
            "text - ": chunk_text,
            "embedding - ": vectors[0],
        }
        file.write(f"{json.dumps(record)}\n")

print(f"Embeddings saved successfully in {embeddings_file}")

In [1]:
# One-time setup (kept commented out): %pip install psycopg2-binary==2.9.9

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl.metadata (4.6 kB)
Downloading psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.2 MB ? eta -:--:--
   ------------------------------------ --- 1.0/1.2 MB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 3.0 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.9


In [9]:
import psycopg2

import os
from getpass import getpass

# Connection settings come from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT). The non-secret values
# fall back to this notebook's local defaults; the password is never stored
# in the notebook and is prompted for when PGPASSWORD is not set.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "Sample_DataBase"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)
# Single cursor shared by the cells that follow.
cursor = conn.cursor()

In [3]:
# Table layout: auto-incrementing integer ID, the chunk text, and the
# embedding vector stored as a JSONB value.
create_table_query = """
CREATE TABLE IF NOT EXISTS embeddings (
    id SERIAL PRIMARY KEY,
    text TEXT NOT NULL,
    embedding JSONB NOT NULL
);
"""
# Using the connection as a context manager commits when the block succeeds
# and rolls back when it raises, so a failed statement does not leave the
# connection stuck in an aborted transaction. It does NOT close the
# connection.
with conn:
    cursor.execute(create_table_query)

In [5]:
# Location of the JSON-lines export that the import step reads back.
# NOTE(review): these repeat values assigned earlier in the notebook,
# presumably so this part can run after a kernel restart; confirm.
output_dir = 'embeddings_output'
embeddings_filename = 'embeddings.txt'
embeddings_file = os.path.join(output_dir, embeddings_filename)

In [7]:
# The statement text is the same for every row, so build it once.
insert_query = "INSERT INTO embeddings (text, embedding) VALUES (%s, %s);"

# Load the whole export in a single transaction: `with conn` commits once
# when every row has been inserted and rolls back if anything fails, so an
# interrupted run leaves no partial data behind (and a retry does not
# duplicate the rows that had already been committed one by one).
with conn, open(embeddings_file, 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate stray blank lines in the export
        embedding_data = json.loads(line)
        text = embedding_data["text - "]
        embedding = embedding_data["embedding - "]
        # The vector is re-serialised to JSON text for the JSONB column.
        cursor.execute(insert_query, (text, json.dumps(embedding)))

In [11]:
# Look up one stored row by its integer ID and print its text and embedding.
user_input_id = input("Enter the ID of the embedding you want to fetch : ")

fetch_query = "SELECT text, embedding FROM embeddings WHERE id = %s;"

result = None
try:
    # Validate the ID before it reaches the database: the `id` column is an
    # integer, so non-numeric input would otherwise surface as a database
    # error.
    try:
        embedding_id = int(user_input_id.strip())
    except ValueError:
        embedding_id = None
        print(f"Invalid ID {user_input_id!r}: expected a whole number")

    if embedding_id is not None:
        cursor.execute(fetch_query, (embedding_id,))
        result = cursor.fetchone()

        if result:
            text, embedding = result
            print(f"Text : {text}")
            print(f"Embedding : {embedding}")
        else:
            print(f"No data found for ID {user_input_id}")
finally:
    # Release the database resources even when the query raises.
    cursor.close()
    conn.close()

Text : an imprint of penguin random house llc 375 hudson street new york, new york 10014 copyright © 2018 by james clear penguin supports copyright. copyright fuels creativity, encourages diverse voices, promotes free speech, and creates a vibrant culture. thank you for buying an authorized edition of this book and for complying with copyright laws by not reproducing, scanning, or distributing any part of it in any form without permission. you are supporting writers and allowing penguin to continue to publish books for every reader.
Embedding : [0.027030829340219498, 0.02395225688815117, 0.05573349818587303, -0.08260665088891983, -0.059488993138074875, -0.04456246271729469, -0.03062821924686432, -0.011582834646105766, -0.03949225693941116, -0.05209291726350784, 0.02694796770811081, 0.0699966624379158, -0.046558182686567307, -0.10709135979413986, 0.000963636499363929, 0.0133952172473073, 0.030984269455075264, -0.03768974542617798, 0.002005885588005185, -0.016284575685858727, -0.03526075

In [None]:
# Release the database handles, cursor first and then the connection,
# before printing the final status message.
for db_resource in (cursor, conn):
    db_resource.close()
print("Embeddings successfully stored in PostgreSQL!")