In [None]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain langchain-cohere langchain-upstage langchain-google-genai langchain-huggingface

In [None]:
import os


def set_api_key(name, value=''):
    """Export `value` as the environment variable `name`.

    A non-empty `value` is always written. A blank placeholder is written only
    when `name` is not set yet, so a real key that is already exported (in the
    shell or by an earlier run of this cell) is never replaced by an empty string.

    Args:
        name: Environment variable to populate.
        value: The key to store; leave as '' to keep whatever is already set.
    """
    if value or name not in os.environ:
        os.environ[name] = value


# LangSmith tracing configuration
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ["LANGCHAIN_PROJECT"] = 'default'

# Provider credentials: fill in only the ones you need.
set_api_key('LANGCHAIN_API_KEY', '')  # put your LangChain (LangSmith) API key
set_api_key('GOOGLE_API_KEY', '')  # put your Google API key
set_api_key('OPENAI_API_KEY', '')  # put your OpenAI API key
set_api_key('UPSTAGE_API_KEY', '')
set_api_key('PPLX_API_KEY', '')
set_api_key('COHERE_API_KEY', '')


Part 1: Overview

In [None]:
!pip install sentence-transformers

In [None]:
import bs4
from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from langchain_cohere import ChatCohere
from langchain_cohere import CohereEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_upstage import ChatUpstage
from langchain_upstage import UpstageEmbeddings

import os
import getpass

#### INDEXING ####

# LLM
# Select the provider that backs both the chat model (`llm`) and the embedding
# model (`embeddings`) used by the later cells.
# NOTE: `HuggingFacePipeline` is the class imported from `langchain_huggingface`
# at the top of this cell; the extra `langchain_community` import that used to
# sit here rebound the name to the older community implementation.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Same loader class as before, imported from `langchain_community` instead of
# the legacy `langchain.document_loaders` path.
from langchain_community.document_loaders import PyPDFLoader

mode = "upstage" # openai, hf, cohere, google, upstage


def _ensure_api_key(env_var, prompt_text):
    """Ask for an API key when `env_var` is missing or blank.

    The setup cell may have stored an empty-string placeholder, so the value is
    tested for truthiness rather than mere presence in `os.environ`.
    """
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass.getpass(prompt_text)


if mode == "hf":
    # Local Hugging Face model: no API key required.
    model_id = "skt/A.X-4.0-Light"
    tokenizer = AutoTokenizer.from_pretrained(
        model_id
    )
    model = AutoModelForCausalLM.from_pretrained(model_id)
    pipe = pipeline("text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=512)

    llm = HuggingFacePipeline(pipeline=pipe)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

elif mode == "cohere":
    # Keys are issued at https://dashboard.cohere.com/api-keys
    _ensure_api_key("COHERE_API_KEY", "Enter your Cohere API key: ")
    llm = ChatCohere()
    embeddings = CohereEmbeddings(model="embed-english-v3.0",)

elif mode == "openai":
    _ensure_api_key("OPENAI_API_KEY", "Enter your OpenAI API key: ")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

elif mode == "google":
    _ensure_api_key("GOOGLE_API_KEY", "Enter your Google AI API key: ")
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro-preview-03-25",
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2,
              )
    embeddings = GoogleGenerativeAIEmbeddings(model='embedding-001',
                                              output_dimensionality=1024)

elif mode == "upstage":
    _ensure_api_key("UPSTAGE_API_KEY", "Enter your Upstage API key: ")
    llm = ChatUpstage()
    embeddings = UpstageEmbeddings(model="embedding-query", dimensions=1024)

else:
    # ValueError is still an Exception, so existing `except Exception` handlers keep working.
    raise ValueError(f"no matched mode: {mode!r}")



# Load Documents
# Fetch the news pages and keep only the <div> elements whose class matches one
# of the values listed in `article_filter`.
article_urls = [
    "https://n.news.naver.com/article/437/0000378416",
    "https://n.news.naver.com/mnews/hotissue/article/092/0002340014?type=series&cid=2000063",
]
article_filter = bs4.SoupStrainer(
    "div",
    attrs={"class": ["newsct_article _article_body", "media_end_head_title"]},
)
# Request headers sent with each page fetch (a desktop-browser User-Agent string).
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
}

loader = WebBaseLoader(
    web_paths=article_urls,
    bs_kwargs={"parse_only": article_filter},
    header_template=request_headers,
)
docs = loader.load()

# Split
# Character-based chunks of up to 1000 characters, overlapping by 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed
# Index the chunks in Chroma with the selected embedding model, then expose
# the store through the retriever interface.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)
retriever = vectorstore.as_retriever()

from langsmith import Client

# No explicit `api_key`: the client then falls back to the key stored in the
# environment (LANGCHAIN_API_KEY is set in the setup cell). Passing a blank
# string here would take precedence over that environment value.
client = Client()
# Pull the shared RAG prompt from the LangSmith prompt hub.
prompt = client.pull_prompt("rlm/rag-prompt", include_model=True)

#### RETRIEVAL and GENERATION ####


# Post-processing
def format_docs(docs):
    """Join the `page_content` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` is empty.
    """
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)


# Chain
# The incoming question feeds two inputs: "context" sends it through the
# retriever and `format_docs`, while "question" passes it through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Question
rag_chain.invoke("저출생 정책?")

Part 2: Indexing

In [None]:
# Documents
# A sample question and a single sample document, used by the cells below.
question, document = (
    "What kinds of pets do I like?",
    "My favorite pet is a cat.",
)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to `tiktoken.get_encoding`.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

num_tokens_from_string(question, "cl100k_base")

In [None]:
from langchain_openai import OpenAIEmbeddings

# Fall back to OpenAI embeddings only when no embedding model has been set up.
# `globals().get` also covers the case where `embeddings` was never defined.
if globals().get("embeddings") is None:
    embeddings = OpenAIEmbeddings()

# Embed the question and the document with the same model so the vectors are comparable.
query_result = embeddings.embed_query(question)
document_result = embeddings.embed_query(document)
len(query_result)


In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Computed as the dot product divided by the product of the two Euclidean norms.

    Args:
        vec1: First vector (any sequence accepted by numpy).
        vec2: Second vector, same length as `vec1`.
    """
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / magnitude_product

# Score the question embedding against the document embedding and report it.
similarity = cosine_similarity(vec1=query_result, vec2=document_result)
print("Cosine Similarity:", similarity)

In [None]:
# Document loader -- reuses `docs`, the output of `loader.load()` in Part 1

# Split
# Imported from `langchain_text_splitters`, the same package the Part 1 cell uses.
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunk sizes here are measured in tiktoken tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(docs)

In [None]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
# NOTE: a dedicated collection name keeps this index separate from the one
# built in Part 1. In-memory Chroma stores created without a name in the same
# kernel share one default collection, so chunks from earlier cells would
# otherwise be mixed into this index.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embeddings,
                                    collection_name="rag_part2_indexing")

retriever = vectorstore.as_retriever()

Part 3: Retrieval

In [None]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
# NOTE: a dedicated collection name keeps this index separate from the ones
# built by earlier cells. In-memory Chroma stores created without a name in
# the same kernel share one default collection, so the single hit requested
# below could otherwise come from an older, differently chunked index.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embeddings,
                                    collection_name="rag_part3_retrieval")


# Return only the single closest chunk.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`.
docs = retriever.invoke("저출생 정책?")
len(docs)

Part 4: Generation

In [None]:
# `langchain_core.prompts` is where ChatPromptTemplate is defined; the
# `langchain.prompts` path is a legacy re-export.
from langchain_core.prompts import ChatPromptTemplate

# Prompt
# Two placeholders: {context} takes the retrieved text, {question} the user query.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

# NOTE: this rebinds `prompt`, replacing the hub prompt pulled in Part 1.
prompt = ChatPromptTemplate.from_template(template)
prompt

In [None]:

# Chain
chain = prompt | llm
# Run
# `format_docs` (defined in Part 1) turns the retrieved documents into plain
# text, so the prompt receives the article text instead of the Document reprs.
chain.invoke({"context": format_docs(docs), "question": "저출생 정책?"})


In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# The question goes to the retriever, whose documents are flattened to plain
# text by `format_docs` (defined in Part 1, and used the same way by the
# Part 1 chain); the question itself is passed through unchanged.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("저출생 정책?")