### RAG - Loaders practice

##### Boilerplate code

In [None]:
import langchain
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Pull variables from a local .env file into the process environment so the
# API keys below can be read without hard-coding them in the notebook.
load_dotenv()

# Either key is None when the corresponding variable is not set.
google_api_key = os.environ.get("GOOGLE_API_KEY")
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Gemini chat model: temperature 0, responses capped at 200 tokens.
google_llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=google_api_key,
    temperature=0,
    max_tokens=200,
)

# OpenAI chat model: temperature 0, no explicit token cap.
openai_llm = ChatOpenAI(
    model="gpt-4",
    api_key=openai_api_key,
    temperature=0,
)

##### TextLoader

In [None]:
from langchain_community.document_loaders import TextLoader

# Load a plain-text file as LangChain Document objects.
loader = TextLoader('../docs_for_rag/coolie_large.txt')

# Read the file exactly once. The previous version called load() and then
# lazy_load(), reading the file twice, and bound the list to `document`, which
# the loop variable of the same name overwrote immediately.
documents = loader.load()

# After the loop `document` holds the last Document, as it did before.
for document in documents:
    print(document)

##### CSVLoader

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load the CSV file; `data` is the list of Documents returned by load().
loader = CSVLoader(file_path='../docs_for_rag/cars.csv')
data = loader.load()

# Print each Document's text content followed by a blank line.
# (Use print(data) instead to inspect the full Document objects.)
for document in data:
    print(f"{document.page_content} \n")

##### WebBaseLoader

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# --- Example: a single HTML page ---------------------------------------------
# loader = WebBaseLoader("https://www.orkut.com/")
# docs = loader.load()

# --- Example: several HTML pages ---------------------------------------------
# loader_multiple_pages = WebBaseLoader(
#     ["https://www.orkut.com/", "https://google.com", "https://facebook.com", "https://linkedin.com", "https://x.com"]
# )
# # Set the limit on the loader that is actually used below (the earlier
# # version set it on `loader`). NOTE(review): requests_per_second appears to
# # cap concurrent requests on the async fetch path (e.g. aload()), not the
# # sequential lazy_load() -- confirm against the installed version.
# loader_multiple_pages.requests_per_second = 5
# docs = loader_multiple_pages.lazy_load()  # a generator: iterate it to fetch

# --- PDF served over HTTP ----------------------------------------------------
# WebBaseLoader parses the response as HTML with BeautifulSoup, so pointing it
# at a PDF yields undecoded PDF bytes rather than readable text. PyPDFLoader
# accepts a URL, downloads the file and extracts the text page by page.
# It requires the `pypdf` package to be installed.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("https://ijcrt.org/papers/IJCRT2104426.pdf")
docs = loader.load()

# Use print(docs[0]) to inspect only the first Document.
for doc in docs:
    print(doc, "\n")

USER_AGENT environment variable not set, consider setting it to identify your requests.


##### UnstructuredLoader - Loading Images and PDFs

In [None]:
from langchain_unstructured import UnstructuredLoader

# Files of different types (an image and a PDF) handled by one loader.
file_paths = [
    '../docs_for_rag/images.jpeg',
    '../docs_for_rag/nexon_brochure.pdf',
]

try:
    loader = UnstructuredLoader(file_paths)
    docs = loader.load()

    if not docs:
        print("No documents were loaded")

    for doc in docs:
        if doc.page_content:
            print(doc.page_content, "\n")
        else:
            # An empty element can come from either input file, so name the
            # actual source; the old message always said "the image".
            source = doc.metadata.get("source", "unknown source")
            print(f"No text content found in an element from {source}")
except Exception as e:
    # Notebook-level boundary: report the failure instead of aborting the
    # cell with a traceback.
    print(f"Error: {e}")



### And much more - refer to the LangChain document loaders documentation