In [None]:
import os
# Sanity check of where the kernel is running: relative paths used later
# in this notebook (e.g. '../data') are resolved against this directory.
print(os.getcwd())  # shows current working directory
print(os.listdir()) # shows files/folders in this directory

In [10]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [15]:
def load_and_split_documents(data):
    """Load the PDF files found in the ``data`` directory.

    A ``DirectoryLoader`` is configured with the glob pattern ``"*.pdf"`` and
    ``PyPDFLoader`` as the per-file loader class. Despite the function's
    name, no splitting is performed here: whatever ``loader.load()`` returns
    is handed back unchanged.

    Args:
        data: Directory path passed straight to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` (presumably a list of
        LangChain ``Document`` objects -- confirm against the loader docs).
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [None]:
# Load the PDFs under ../data (the path is relative to the kernel's working directory).
extracted_data = load_and_split_documents('../data')
# Preview only the first few items: echoing the whole collection would flood
# the notebook output and bloat the saved file.
extracted_data[:3]

In [21]:
# Number of items returned by the loader (637 in the recorded run).
len(extracted_data)

637

In [22]:
from typing import List
from langchain.schema import Document

def filter_min_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document keeping only its text and its ``source`` metadata.

    Every input document is copied into a fresh ``Document`` whose
    ``page_content`` is unchanged and whose metadata dict contains a single
    ``"source"`` key. When the original metadata has no ``"source"`` entry,
    an empty string is stored instead. The input documents are not modified.

    Args:
        docs: Documents to slim down.

    Returns:
        A new list, in the same order as ``docs``, of the rebuilt documents.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source", "")},
        )
        for original in docs
    ]

In [23]:
# Drop every metadata field except "source" before the documents are chunked.
minimal_docs = filter_min_docs(extracted_data)

In [43]:
def test_split(minimal_docs, chunk_size=1000, chunk_overlap=20):
    """Split documents into chunks with a ``RecursiveCharacterTextSplitter``.

    Chunk length is measured with ``len`` (i.e. in characters), because that
    is the ``length_function`` handed to the splitter.

    Args:
        minimal_docs: Documents to split; passed to ``split_documents``.
        chunk_size: Maximum chunk length forwarded to the splitter.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks forwarded to the
            splitter. Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks produced by ``text_splitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [42]:
# Chunk the slimmed-down documents and report how many chunks were produced
# (3006 in the recorded run).
chunked_texts = test_split(minimal_docs)
print(f"Chunk Number: {len(chunked_texts)}")

Chunk Number: 3006


In [6]:
import torch
from transformers import AutoTokenizer, AutoModel
import warnings
# NOTE(review): with no category/module filter this silences *every* warning
# for the rest of the kernel session, including deprecation notices that may
# matter -- consider narrowing it to the specific warnings being hidden.
warnings.filterwarnings('ignore')

class SimpleEmbeddings:
    """Mean-pooled text embeddings computed with a Hugging Face model.

    The tokenizer and model are loaded once in ``__init__``. An embedding is
    the average of the model's ``last_hidden_state`` over the token axis,
    weighted by the tokenizer's attention mask so that padding positions do
    not contribute. ``embed_query`` and ``embed_documents`` return plain
    Python lists of floats.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load tokenizer and model for ``model_name`` and put the model in eval mode.

        The model is moved to ``"cuda"`` when ``torch.cuda.is_available()``
        is true, otherwise it stays on ``"cpu"``.
        """
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        print("✓ Model loaded successfully!")

    def _encode_batch(self, texts):
        """Embed ``texts`` (one string or a list of strings) in a single forward pass.

        Returns:
            A CPU tensor with one pooled embedding row per input text.
        """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)

        token_embeddings = outputs.last_hidden_state
        # Broadcast the attention mask over the hidden axis so padded
        # positions are zeroed out before summing along the token axis.
        input_mask_expanded = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp guards against division by zero for an all-padding row.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return (sum_embeddings / sum_mask).cpu()

    def _encode_text(self, text):
        """Embed ``text`` and return the result as a flattened NumPy array."""
        return self._encode_batch(text).numpy().flatten()

    def embed_query(self, text):
        """Return the embedding of a single string as a list of floats."""
        return self._encode_text(text).tolist()

    def embed_documents(self, texts, batch_size=32):
        """Return one embedding (list of floats) per entry of ``texts``.

        Texts are encoded ``batch_size`` at a time instead of one forward
        pass per text. Padding inside a batch is masked out during pooling,
        so results match per-text encoding up to floating-point round-off.

        Args:
            texts: Iterable of strings; an empty iterable yields ``[]``.
            batch_size: Number of texts per forward pass (must be >= 1).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        texts = list(texts)  # accept any iterable, as the per-item loop did
        vectors = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            vectors.extend(self._encode_batch(batch).tolist())
        return vectors

def create_embeddings():
    """Build the embedding wrapper used by this notebook.

    Returns:
        A ``SimpleEmbeddings`` instance constructed for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return SimpleEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; later cells reuse this module-level object.
embeddings = create_embeddings()

Loading model: sentence-transformers/all-MiniLM-L6-v2
✓ Model loaded successfully!


In [None]:
# Smoke test: embed one sentence and inspect the vector length
# (384 in the recorded run).
vector = embeddings.embed_query("Test sentence for embedding")
len(vector)

384

In [10]:
import os
from dotenv import load_dotenv
# python-dotenv: read a .env file (if one is found) into the process
# environment so the os.getenv() lookups that follow can see its values.
load_dotenv()

True

In [11]:
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is not set. Without this check the
            ``os.environ[...] = None`` assignment below would fail with an
            opaque ``TypeError`` that does not say which key is missing.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; add it to your .env "
            "file or export it before running this notebook."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
GEMINI_API_KEY = _require_env("GEMINI_API_KEY")

# Write the validated values back into the process environment
# (os.environ only accepts str values, hence the check above).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [None]:
from pinecone import Pinecone
