In [1]:
!pip install langchain_community langchain_pinecone langchain_openai langchain-text-splitters langchain-cli[serve] langserve[all] pinecone-client langchain-community cohere openai markdownify langchain-upstage rank_bm25 python-dotenv langchain-chroma langchain

Collecting langchain_community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.1.3-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.1.21-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting cohere
  Downloading cohere-5.8.1-py3-none-any.whl.metadata (3.4 kB)
Collecting openai
  Downloading openai-1.40.6-py3-none-any.whl.metadata (22 kB)
Collecting markdownify
  Downloading markdownify-0.13.1-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-upstage
  Downloading langchain_upstage-0.1.7-py3-none-any.whl.metadata (3.3 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting python-dotenv
  Downloadi

In [3]:
#@title set API key
from pprint import pprint
import os

import warnings
warnings.filterwarnings('ignore')

from IPython import get_ipython

# Name of the environment variable (and of the Colab secret) holding the Upstage API key.
upstage_api_key_env_name = 'UPSTAGE_API_KEY'


def load_env():
    """Make the Upstage API key available in ``os.environ`` and return it.

    In Google Colab the key is read from the notebook secrets (``userdata``)
    and stored in the process environment unless a value is already set there.
    Anywhere else ``load_dotenv()`` is called and the key is read back from
    the environment.

    Returns:
        The value of the environment variable named by
        ``upstage_api_key_env_name``, or ``None`` when no value is available.
    """
    if 'google.colab' in str(get_ipython()):
        # Running in Google Colab
        from google.colab import userdata
        upstage_api_key = userdata.get(upstage_api_key_env_name)
        if upstage_api_key is None:
            # os.environ only accepts strings: setdefault(name, None) would raise
            # TypeError when the variable is unset, so fall back to a plain lookup.
            return os.environ.get(upstage_api_key_env_name)
        # Use the shared constant (not a repeated literal) so the secret name and
        # the environment variable name cannot drift apart.
        return os.environ.setdefault(upstage_api_key_env_name, upstage_api_key)
    else:
        # Running in local Jupyter Notebook
        from dotenv import load_dotenv
        load_dotenv()
        return os.environ.get(upstage_api_key_env_name)


UPSTAGE_API_KEY = load_env()

In [8]:
# Pinecone settings this notebook expects to find in os.environ.
pinecone_env_names = ('PINECONE_API_KEY', 'PINECONE_ENVIRONMENT', 'PINECONE_INDEX_NAME')

# Same environment detection as load_env(): the google.colab import is only
# attempted inside Colab, so the cell no longer fails on a local Jupyter kernel.
if 'google.colab' in str(get_ipython()):
    # Running in Google Colab: copy the notebook secrets into the process environment.
    from google.colab import userdata
    os.environ.update({name: userdata.get(name) for name in pinecone_env_names})
else:
    # Running in local Jupyter Notebook: let python-dotenv populate the environment.
    from dotenv import load_dotenv
    load_dotenv()

# Plain indexing on purpose: a missing setting raises KeyError here instead of
# surfacing later as an obscure client error.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENVIRONMENT = os.environ["PINECONE_ENVIRONMENT"]
PINECONE_INDEX_NAME = os.environ["PINECONE_INDEX_NAME"]

In [14]:
from langchain_upstage import UpstageLayoutAnalysisLoader


# 1. Load: run Upstage layout analysis on the PDF, requesting HTML output.
# The file name is relative, so the PDF is expected in the kernel's working directory.
# NOTE(review): presumably the loader picks up UPSTAGE_API_KEY from os.environ — confirm.
layzer = UpstageLayoutAnalysisLoader("kim-tse-2008.pdf", output_type="html")
# For improved memory efficiency, consider using the lazy_load method to load documents page by page.
docs = layzer.load()  # or layzer.lazy_load()

In [15]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# 2. Split
# Break the loaded documents into overlapping chunks using the HTML-aware
# separators (chunk_size=1000, chunk_overlap=100), then report the chunk count.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML,
    chunk_size=1000,
    chunk_overlap=100,
)
splits = text_splitter.split_documents(docs)
print("Splits:", len(splits))

Splits: 130


In [35]:
from langchain_chroma import Chroma
# Imported here because this cell is the first (top-to-bottom) user of
# UpstageEmbeddings; relying on the import in a later cell makes
# "Restart Kernel -> Run All" fail with NameError.
from langchain_upstage import UpstageEmbeddings

# 3. Embed & indexing
# Embed every chunk in `splits` with the Upstage embedding model and index
# the vectors in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=UpstageEmbeddings(model="solar-embedding-1-large"),
)

In [36]:
# 4. Retrieve: wrap the vector store as a retriever and fetch the chunks for a sample question.
retriever = vectorstore.as_retriever()
# `result_docs` is reused later as the context passed to the answer chain.
result_docs = retriever.invoke("What is Bug Classification?")
# Sanity check: number of retrieved chunks and the first 100 characters of the top hit.
print(len(result_docs))
print(result_docs[0].page_content[:100])

4
<p id='49' data-category='paragraph' style='font-size:16px'>Similar in spirit to change classificati


In [27]:
# NOTE(review): ChatOpenAI, CohereEmbeddings, StrOutputParser, ChatPromptTemplate and the
# Runnable* helpers are not used by this cell — presumably leftovers; confirm before removing.
from langchain_upstage import UpstageEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import CohereEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from pinecone import Pinecone as PineconeClient

# Pinecone client built from the PINECONE_API_KEY / PINECONE_ENVIRONMENT values.
# NOTE(review): `pinecone` is not passed to anything below — presumably the vector
# store obtains its credentials from os.environ on its own; confirm.
pinecone = PineconeClient(api_key=PINECONE_API_KEY,
                         environment=PINECONE_ENVIRONMENT)

# Same embedding model name as the one used for the Chroma index.
embeddings = UpstageEmbeddings(model="solar-embedding-1-large")
# Attach to the already-populated Pinecone index named by PINECONE_INDEX_NAME.
# NOTE: this rebinds `vectorstore` and `retriever`, the same names the Chroma cells
# assign, so whichever cell ran last decides which store later cells query.
vectorstore = Pinecone.from_existing_index(index_name=PINECONE_INDEX_NAME, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [30]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage

# 5. Generate
# Prompt with two placeholders: the user's {question} and the retrieved {context}.
prompt_template = PromptTemplate.from_template(
    """
    Please provide most correct answer from the following context.
    If the answer is not present in the context, please write "The information is not present in the context."
    ---
    Question: {question}
    ---
    Context: {context}
    """
)

# Upstage chat model with its default settings.
llm = ChatUpstage()

# Pipeline: fill the prompt -> call the chat model -> reduce the reply to a plain string.
chain = (
    prompt_template
    | llm
    | StrOutputParser()
)

In [37]:
# Ask the question against the retrieved chunks. Only the chunk text is placed in
# the prompt: passing the list of Document objects directly would interpolate
# their Python repr (metadata and escaped text) into the context.
chain.invoke({
    "question": "What is bug classficiation?",
    "context": "\n\n".join(doc.page_content for doc in result_docs),
})

'Bug classification is a process that involves extracting keywords from bug reports or software maintenance requests and using them as features to train a machine learning classifier. The goal of this classification is to place a bug report into a specific category or to find the developer best suited to fix a bug. This work, along with change classification, highlights the potential of using machine learning techniques in software engineering.'