In [None]:
from dotenv import load_dotenv
import os

# Pull the variables defined in the .env file into the process environment.
load_dotenv()

# Despite its name, `openai_key` holds the value of GITHUB_TOKEN
# (None when that variable is not set).
openai_key = os.environ.get("GITHUB_TOKEN")

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# PDF base names to ingest from ./data/. Names previously listed here:
# "Kos", "Kreta", "Rodas".
arr = ["Kos"]
merged_documents = []

for i in arr:
    # Each source is expected at ./data/<name>.pdf.
    file_path = f"./data/{i}.pdf"
    loader = PyPDFLoader(file_path)
    # NOTE(review): presumably one Document per PDF page (loader default) - confirm.
    document = loader.load()
    merged_documents.extend(document)

# Sanity check: preview the first 100 characters of the fourth loaded document.
# Short inputs fall back to the last available one instead of raising IndexError.
if merged_documents:
    preview_index = min(3, len(merged_documents) - 1)
    print(merged_documents[preview_index].page_content[:100])
else:
    print("No documents were loaded.")

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, both measured in characters.
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(merged_documents)

# Report what was actually split: the documents loaded from the PDFs.
print(
    f"Split {len(merged_documents)} loaded documents "
    f"into {len(all_splits)} sub-documents."
)

In [None]:
### My Inserted code  - Do not run###

from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS  # or Chroma, or other vector stores
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def create_vectorstore(documents, embedding_model_name='all-MiniLM-L6-v2'):
    """Build a FAISS vector store from ``documents``.

    Args:
        documents: Documents handed to ``FAISS.from_documents``.
        embedding_model_name: Model name forwarded to
            ``SentenceTransformerEmbeddings``.

    Returns:
        The created vector store, or ``None`` if any exception was raised
        while building it (the error is printed, not re-raised).
    """
    try:
        # Create the embedder first so a bad model name fails before indexing.
        embedder = SentenceTransformerEmbeddings(model_name=embedding_model_name)
        store = FAISS.from_documents(documents, embedder)
    except Exception as exc:
        print(f"Error creating vector store: {exc}")
        return None
    return store

def query_vectorstore(vectorstore, query_text, k=5):
    """Run a similarity search and print the content of every hit.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
            A falsy value is reported as an uninitialized store.
        query_text: Text to search for.
        k: Number of results requested from the store.

    Returns:
        None. Results, the "no results" notice, and any error are printed.
    """
    try:
        if not vectorstore:
            print("Vector store is not initialized.")
            return

        hits = vectorstore.similarity_search(query_text, k=k)
        if not hits:
            print("No results found.")
            return

        for hit in hits:
            print(hit.page_content)
    except Exception as exc:
        # Best-effort helper: report the failure instead of raising.
        print(f"Error querying vector store: {exc}")

# NOTE(review): inside a notebook kernel `__name__` is normally "__main__",
# so this guard does not stop the cell from running - confirm if that matters.
if __name__ == "__main__":
    # Declared but not used by the steps below.
    file_paths = ["documentas-1.docx", "documentas-2.docx"]

    # Index the chunks already held in `all_splits`.
    vectorstore = create_vectorstore(all_splits)

    # Run one sample similarity search and print the top five matches.
    query_vectorstore(vectorstore, "Kuriame mieste daugiau ežerų?", 5)


In [None]:
from langchain_openai import OpenAIEmbeddings

from langchain_core.vectorstores import InMemoryVectorStore

# Embedding client; `openai_key` is sent as the API key to the endpoint below.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=openai_key,
    base_url="https://models.inference.ai.azure.com",
)

# Embed every chunk and keep the vectors in memory.
vector_store = InMemoryVectorStore(embeddings)
document_ids = vector_store.add_documents(all_splits)

# Everything up to this point is the ingestion pipeline: it prepares the
# documents for analysis.

In [None]:
from langchain import hub

# Fetch the "rlm/rag-prompt" template from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to inspect what it produces.
example_inputs = {
    "context": "(context goes here)",
    "question": "(question goes here)",
}
example_messages = prompt.invoke(example_inputs).to_messages()

# The rendered prompt is expected to consist of exactly one message.
assert len(example_messages) == 1
print(example_messages[0].content)

In [None]:
from langchain.llms import OpenAI

# gpt-4o is a chat model, so it has to be called through the chat client.
# The completion-style `OpenAI` LLM class sends requests to a different
# endpoint than the one chat models are served from.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o",
    api_key=openai_key,
    base_url="https://models.inference.ai.azure.com",
)

question = "Kuo įdomus krokosas?"

# Retrieve the chunks most similar to the question and merge them into one
# context string, separated by blank lines.
retrieved_docs = vector_store.similarity_search(question)
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Fill the RAG prompt with the question and the retrieved context.
promptAnswer = prompt.invoke({"question": question, "context": docs_content})

# The chat client returns a message object; keep `answer` as plain text,
# which is what the completion-style client returned.
answer = llm.invoke(promptAnswer).content
print(answer)