## Development with Large Language Models Tutorial – OpenAI, Langchain, Agents, Chroma

> Notes on the freeCodeCamp tutorial: https://www.youtube.com/watch?v=xZDB1naRUlk
---


In [1]:
import os
import getpass
import tiktoken
import openai
import chainlit as cl
from chatgpt_clone.creds import AZURE_API_BASE, AZURE_API_KEY, AZURE_API_VERSION

In [3]:
# Prompt (without echoing the input) for the three Azure OpenAI settings.
# The generator is consumed left to right, so the prompts appear in this order:
# key, base URL, API version.
AZURE_API_KEY, AZURE_API_BASE, AZURE_API_VERSION = (
    getpass.getpass(prompt_text)
    for prompt_text in (
        "Enter Azure API Key: ",
        "Enter Azure Base URL: ",
        "Enter Azure API version: ",
    )
)

### 1. Chainlit ChatGPT Clone

- see `LLM Concepts/Langchain Tutorials/chatgpt_clone`


### 2. Building a Doc QnA System

![](https://imgur.com/5EL92Tx.png)

#### Some experiments with `chromadb`

In [2]:
import chromadb

In [7]:
chroma_client = chromadb.Client()

2023-09-16 13:28:46 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [8]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# if "my_collection" already exists on the client (e.g. when the cell is run twice).
# NOTE(review): the name `collections` shadows the stdlib module of the same name;
# it is kept because the following cells reference it.
collections = chroma_client.get_or_create_collection(name="my_collection")

In [9]:
# Seed the collection with three toy facts. The three lists are parallel:
# entry i of each list describes the same document.
sample_docs = ["my name is mini", "I am 28 years old", "I like to eat paneer"]  # the actual document texts
# Per-document metadata; one use case is reporting the source of each returned doc.
sample_metadatas = [{"source": topic} for topic in ("name", "age", "food")]
sample_ids = ["id1", "id2", "id3"]

collections.add(documents=sample_docs, metadatas=sample_metadatas, ids=sample_ids)

/Users/shaunaksen/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [01:18<00:00, 1.06MiB/s]


In [10]:
# Display the collection object; its repr shows the collection name.
collections

Collection(name=my_collection)

In [15]:
# Query the collection with a natural-language question and keep only one result.
results = collections.query(n_results=1, query_texts=["what is my favorite dish?"])

In [16]:
# Inspect the raw query response: ids, distances, metadatas, embeddings and documents.
results

{'ids': [['id3']],
 'distances': [[1.0386817455291748]],
 'metadatas': [[{'source': 'food'}]],
 'embeddings': None,
 'documents': [['I like to eat paneer']]}

#### Chat based on a PDF

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import  OpenAIEmbeddings
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.vectorstores import  Chroma
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI



In [4]:
# Embedding client for the Azure-hosted "text-embedding-ada-002" deployment,
# configured from the credentials collected earlier in the notebook.
embeddings = OpenAIEmbeddings(
    deployment="text-embedding-ada-002",
    model="text-embedding-ada-002",
    openai_api_type='azure',
    openai_api_base=AZURE_API_BASE,
    openai_api_key=AZURE_API_KEY,
    openai_api_version=AZURE_API_VERSION,
    # NOTE(review): chunk_size=1 presumably limits each embedding request to a
    # single text -- confirm against the Azure deployment's batch limit.
    chunk_size=1,
    # A retry count is an integer; the original passed the float literal 1e0 (== 1.0).
    max_retries=1,
)

In [5]:
# Settings shared by both Azure GPT-4 clients below, so they cannot drift apart.
_gpt4_32k_settings = dict(
    deployment_name='gpt-4-32k',
    model='gpt-4-32k',
    openai_api_type='azure',
    openai_api_base=AZURE_API_BASE,
    openai_api_key=AZURE_API_KEY,
    openai_api_version=AZURE_API_VERSION,
    max_retries=2,
    temperature=0,
    streaming=True,
)

# Chat-model wrapper around the 'gpt-4-32k' deployment.
llm_chat_gpt_4 = AzureChatOpenAI(**_gpt4_32k_settings)

# Completion-style (non-chat) wrapper around the same deployment.
# NOTE(review): GPT-4 deployments are typically served through the chat endpoint only --
# confirm this wrapper actually works against 'gpt-4-32k' before relying on it.
llm_gpt_4 = AzureOpenAI(**_gpt4_32k_settings)

In [6]:
# Resolve the PDF relative to the current user's home directory instead of
# hard-coding one machine's absolute path; for the original author this expands
# to the same /Users/shaunaksen/Downloads/... location.
loader = PyPDFLoader(
    os.path.expanduser("~/Downloads/AIOPs RCA - Company level analysis.pdf")
)

In [7]:
# Read the PDF into a list of Document objects; in the output shown below each
# entry carries `source` and `page` metadata.
documents = loader.load()

In [8]:
# Display the loaded documents.
# NOTE(review): the rendered output embeds the absolute local file path in each
# document's metadata -- consider clearing it before sharing the notebook.
documents

[Document(page_content='AIOPs Root Cause Analysis Analysis and findings ', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 0}),
 Document(page_content='Root Cause Analysis Workflow and Objectives \n', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 1}),
 Document(page_content='Alert details: Slack message \n', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 2}),
 Document(page_content='Finding the top companies which contribute to the alert -Company: 1077776  had significant contribution to total transactions, avg_tx_bytes, avg_rx_bytes \n-Company: 7626333 also contributed significantly to tx_bytes \n-No other company contributes significantly to the metrics \n', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 3}),
 Document(page_content='Finding the top companies which contribute to the a

In [9]:
# Splitter configured with a chunk size of 1000 and an overlap of 100
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)

In [11]:
# Run the loaded documents through the splitter to obtain the chunked Document list.
split_docs = text_splitter.split_documents(documents)

[Document(page_content='AIOPs Root Cause Analysis Analysis and findings', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 0}),
 Document(page_content='Root Cause Analysis Workflow and Objectives', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 1}),
 Document(page_content='Alert details: Slack message', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 2}),
 Document(page_content='Finding the top companies which contribute to the alert -Company: 1077776  had significant contribution to total transactions, avg_tx_bytes, avg_rx_bytes \n-Company: 7626333 also contributed significantly to tx_bytes \n-No other company contributes significantly to the metrics', metadata={'source': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf', 'page': 3}),
 Document(page_content='Finding the top companies which contribute to the alert', met

In [47]:
# Embed the chunks and index them in a Chroma vector store.
# Index `split_docs` (the splitter's output) rather than the raw `documents`:
# the original indexed the unsplit pages, so the splitting step was never used.
docsearch = Chroma.from_documents(
    split_docs, embeddings
)

2023-09-17 10:42:22 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [54]:
# Build a "stuff" question-answering-with-sources chain on top of the Chroma retriever.
# max_tokens_limit is a setting of the chain, not of the retriever: passed to
# as_retriever() (as in the original) it had no effect. It is only applied when
# reduce_k_below_max_tokens=True, which drops retrieved documents until their
# combined token count fits within the limit.
# NOTE(review): 4097 is carried over from the original -- confirm it is the intended
# budget for a gpt-4-32k deployment.
chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm_chat_gpt_4,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        reduce_k_below_max_tokens=True,
        max_tokens_limit=4097,
        verbose=True
    )

In [55]:
# Ask a question against the indexed PDF; the output below is a dict with
# 'question', 'answer' and 'sources' keys.
chain("Which company id contributes most to the alert?")



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m


{'question': 'Which company id contributes most to the alert?',
 'answer': 'Company 1077776 contributes most to the alert, with significant contributions to total transactions, avg_tx_bytes, and avg_rx_bytes.\n',
 'sources': '/Users/shaunaksen/Downloads/AIOPs RCA - Company level analysis.pdf'}