### Working with External Documents in Python

In [None]:
# Dependency installation commands, kept commented out so that re-running the
# notebook does not reinstall anything. Uncomment and run once in a fresh
# environment if the imports below fail.
# NOTE(review): `%pip install ...` with pinned versions would target the
# kernel's own environment and make the setup reproducible.
# NOTE(review): the maintained PyPI distribution is `pdfminer.six`; confirm
# that plain `pdfminer` is really what is wanted here.
# !pip install langchain-community
# !pip install unstructured
# !pip install pdfminer
# !pip install --upgrade unstructured[pdf]

In [None]:
import os
from langchain_community.document_loaders import UnstructuredPDFLoader

# Folder that is scanned for PDFs; every loaded Document is appended to
# `documents`, which is rebuilt from scratch each time this cell runs.
docs_folder = "./docs"
documents = []

# os.listdir() returns entries in arbitrary order, so sort them to get the
# same load order on every run.
for filename in sorted(os.listdir(docs_folder)):
    filepath = os.path.join(docs_folder, filename)
    # Skip sub-directories and anything that is not a PDF (e.g. .DS_Store);
    # those entries must not be handed to the PDF loader.
    if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
        continue
    print(f"Loading {filepath}...")
    loader = UnstructuredPDFLoader(filepath)
    documents.extend(loader.load())


##### Chunking text into small parts can be useful for processing large documents. Here's how you can do it using Python:

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every loaded document into chunks of up to 300 units with a 50-unit
# overlap between neighbouring chunks (units are whatever the splitter's
# length function counts).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50)
chanks = splitter.split_documents(documents)

# Show only a short preview: printing every chunk floods the cell output
# once the input is more than a page or two.
PREVIEW_COUNT = 5
print(f"Created {len(chanks)} chunks; showing the first {min(PREVIEW_COUNT, len(chanks))}.")

for i, chank in enumerate(chanks[:PREVIEW_COUNT], start=1):
    print(f"--- Chank {i} ---")
    print(chank.page_content)
    print("--------------")


### Embedding Documents

In [None]:
import os

from langchain_community.vectorstores import Chroma
# OllamaEmbeddings is imported from langchain_ollama (the same package that
# provides ChatOllama further down) instead of the deprecated
# `langchain.embeddings` path.
from langchain_ollama import OllamaEmbeddings

# On-disk location of the Chroma collection.
PERSIST_DIR = "./qa_db"

embedding = OllamaEmbeddings(model="nomic-embed-text")

# Chroma.from_documents() adds to the persisted collection on every call, so
# re-running this cell would store each chunk again and the retriever would
# return duplicates. Reuse the store when it already exists; delete
# PERSIST_DIR to force a rebuild after the source documents change.
if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding)
else:
    db = Chroma.from_documents(chanks, embedding, persist_directory=PERSIST_DIR)

retriever = db.as_retriever()


In [None]:
# Bare last expression: the notebook displays the repr of the embedding
# object, which is a quick way to check how it was configured.
embedding

In [None]:
from langchain_ollama import ChatOllama

# Chat model reached through the Ollama HTTP endpoint on this machine.
llm = ChatOllama(
    model="qwen2.5:7b",
    base_url="http://localhost:11434",
    temperature=0.5,
)


#### Retrieval QA

In [None]:
from langchain.chains import RetrievalQA
# Build a question-answering chain that passes the retriever's results to
# the chat model.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
)

question = "Explain system architecture of Netflix?"

# Chain.run() is deprecated; invoke() takes the question under the chain's
# "query" input key and returns a dict whose "result" entry is the answer.
answer = qa_chain.invoke({"query": question})
print(answer["result"])



