<a href="https://colab.research.google.com/github/RERobbins/data_science_266_sandbox/blob/main/inspecting_prompts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [1]:
!pip install --quiet python-dotenv
!pip install --quiet langchain
!pip install --quiet lark
!pip install --quiet chromadb
!pip install --quiet openai
!pip install --quiet cohere
!pip install --quiet tiktoken
!pip install --quiet pypdf
!pip install --quiet unstructured

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.8/402.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m3.6 MB/s[0m eta 

In [2]:
import os
import textwrap
import openai
import cohere

import pandas as pd

from dotenv import load_dotenv, find_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms import Cohere
from langchain.document_loaders import PyPDFLoader, UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [6]:
load_dotenv('/content/drive/MyDrive/.env')
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
COHERE_API_KEY = os.environ["COHERE_API_KEY"]

In [7]:
openai_embeddings_model = OpenAIEmbeddings ()
cohere_embeddings_model = CohereEmbeddings(truncate='None')
cohere_multilingual_embeddings_model = CohereEmbeddings(model="embed-multilingual-v2.0", truncate='None')

In [8]:
LLM_SOURCE="OpenAI"  #set this to either "OpenAI" or "Cohere"

if LLM_SOURCE=="OpenAI":
  embeddings_model=openai_embeddings_model
else:
  embeddings_model=cohere_embeddings_model

if LLM_SOURCE=="OpenAI":
  llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
  # llm = ChatOpenAI(model="gpt-4", temperature=0)

else:
  llm = Cohere(model="command", temperature=0)

In [9]:
def get_chunks (url, organization, title, chunk_size=1000, chunk_overlap=100):

    """
    This function takes a url to an organization's web page, organization name,
    and document title and returns chunks constructed from the target url.
    The function adds the url, the organization name and the document title
    as metadata to the chunks.

    Parameters:
    url (string): Target page.
    organization (string): Organization name.
    title: Document title.
    chunk_size (int, optional): Chunk size, default is 1000 characters.
    chunk_overlap (int, optional): Chunk overlap, default is 10 characters.

    Returns:
    list of chunks
    """

    # Use PyPDFLoader for pdf targets, otherwise UnstructuredURLLoader
    if os.path.splitext(url)[1] == ".pdf":
      loader = PyPDFLoader(url)
    else:
      loader = UnstructuredURLLoader(urls=[url])

    # Load the documents and add organization metadata field.
    # Increment page count metadata by one so it's not zero-based.

    documents = loader.load()
    for document in documents:
      metadata = document.metadata
      metadata['url'] = url
      metadata['organization'] = organization
      metadata['title'] = title
      if metadata.get('page', None) is not None:
        metadata['page'] += 1

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)

    return text_splitter.split_documents(documents)

def explore_documents (documents):
  block_indent = "   "
  metadata=documents[0].metadata
  content=documents[0].page_content[:200] + ". . ."
  print(f"{metadata['organization']} {metadata['title']} {len(documents)} chunks")
  print("Truncated First chunk:")
  print(textwrap.fill(content,
                      initial_indent=block_indent,
                      subsequent_indent=block_indent,
                      replace_whitespace=True))
  print()

In [10]:
policy_data = [
    ("Apple", "Privacy Policy", "https://www.apple.com/legal/privacy/pdfs/apple-privacy-policy-en-ww.pdf"),
    ("Google", "Privacy Policy", "https://static.googleusercontent.com/media/www.google.com/en//intl/en/policies/privacy/google_privacy_policy_en.pdf"),
    ("Meta", "Privacy Policy", "https://about.fb.com/wp-content/uploads/2022/07/Privacy-Within-Metas-Integrity-Systems.pdf"),
    ("TikTok", "Privacy Policy", "https://www.tiktok.com/legal/page/us/privacy-policy/en"),
    ("Threads", "Privacy Policy", "https://terms.threads.com/privacy-policy")
]

columns = ['organization', 'title', 'url']

policy_df = pd.DataFrame(policy_data, columns=columns)

In [11]:
chunks=[]

for row in policy_df.itertuples(index=False):
  policy_chunks = get_chunks(row.url, row.organization, row.title)
  explore_documents(policy_chunks)
  chunks += policy_chunks

Apple Privacy Policy 33 chunks
Truncated First chunk:
   Apple Privacy Policy Apple’s Privacy Policy describes how Apple
   collects, uses, and shares your personal data. Updated December 22,
   2022 In addition to this Privacy Policy, we provide data and
   privacy. . .

Google Privacy Policy 32 chunks
Truncated First chunk:
   Privacy Policy Last modified: December 18, 2017 ( view archived
   versions ) (The hyperlinked examples are available at the end of
   this document.) There are many different ways you can use our
   services . . .

Meta Privacy Policy 78 chunks
Truncated First chunk:
   July 2022   Privacy within Meta’s   Integrity Systems   Why user
   rights are at the center   of our safety and security approach. . .



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


TikTok Privacy Policy 41 chunks
Truncated First chunk:
   How TikTok is supporting our community through COVID-19  U.S.
   Privacy Policy  Last updated: May 22, 2023  This Privacy Policy
   applies to TikTok services (the “Platform”), which include TikTok
   apps, w. . .

Threads Privacy Policy 26 chunks
Truncated First chunk:
   🤝Legal  Privacy Policy  Effective date: April 17, 2023  At Threads,
   we take your privacy seriously. Please read this Privacy Policy to
   learn how we treat your personal data. By using or accessing Thre.
   . .



In [12]:
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "rating": 9.9,
            "director": "Andrei Tarkovsky",
            "genre": "science fiction",
            "rating": 9.9,
        },
    ),
]
vectorstore = Chroma.from_documents(docs, embeddings_model)

In [13]:
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="A 1-10 rating for the movie", type="float"
    ),
]
document_content_description = "Brief summary of a movie"
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)

In [14]:
# This example only specifies a relevant query
retriever.get_relevant_documents("What are some movies about dinosaurs")



query='dinosaurs' filter=None limit=None


[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'genre': 'science fiction', 'rating': 7.7, 'year': 1993}),
 Document(page_content='Toys come alive and have a blast doing so', metadata={'genre': 'animated', 'year': 1995}),
 Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'rating': 9.9, 'year': 1979}),
 Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006})]

In [15]:
# This example only specifies a filter
retriever.get_relevant_documents("I want to watch a movie rated higher than 8.5")

query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None


[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'rating': 9.9, 'year': 1979}),
 Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006})]

In [16]:
# This example specifies a query and a filter
retriever.get_relevant_documents("Has Greta Gerwig directed any movies about women")

query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None


[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'director': 'Greta Gerwig', 'rating': 8.3, 'year': 2019})]

In [17]:
# This example specifies a composite filter
retriever.get_relevant_documents(
    "What's a highly rated (above 8.5) science fiction film?"
)

query='science fiction' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None


[Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006}),
 Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'rating': 9.9, 'year': 1979})]

In [18]:
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [19]:
# This example only specifies a relevant query
retriever.get_relevant_documents("what are two movies about dinosaurs")

query='dinosaurs' filter=None limit=2


[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'genre': 'science fiction', 'rating': 7.7, 'year': 1993}),
 Document(page_content='Toys come alive and have a blast doing so', metadata={'genre': 'animated', 'year': 1995})]

In [20]:
vectorstore.delete_collection()
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings_model)
assert vectorstore._collection.count() == len(chunks)

In [21]:
metadata_field_info=[
    AttributeInfo(
        name="organization",
        description="The organization, company, entity or institution that the document relates to",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="title",
        description="The title of the document",
        type="string",
    ),
    AttributeInfo(
        name="url",
        description="The url for the document",
        type="string",
    ),
]
document_content_description = "A policy"
retriever = SelfQueryRetriever.from_llm(llm, vectorstore, document_content_description, metadata_field_info, verbose=True)

In [None]:
retriever.get_relevant_documents("Does Apple use cookies?")

query='Apple cookies' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='organization', value='Apple') limit=None


[Document(page_content="To exercise privacy rights for your child's information, visit the Apple Data and Privacy page at privacy.apple.com and sign in to their account. Cookies and Other Technologies Apple’s websites, online services, interactive applications, and advertisements may use “cookies” and other technologies such as web beacons. These technologies help us to better understand user behavior including for security and fraud prevention purposes, tell us which parts of our websites people have visited, and facilitate and measure the effectiveness of advertisements and web searches. •Communications Cookies. These cookies are used to enable network traffic to and from Apple’s systems, including by helping us detect any errors. •Strictly Necessary Cookies. These cookies are set as required to provide a specific feature or service that you have accessed or requested. For example, they allow us to display our websites in the proper format and language, to authenticate and verify you

In [None]:
retriever.get_relevant_documents("Do Apple or Microsoft use cookies?")

query='cookies' filter=Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='organization', value='Apple'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='organization', value='Microsoft')]) limit=None


[Document(page_content='websites in the proper format and language, to authenticate and verify your transactions, and to preserve the contents of your Bag when shopping online at apple.com.  •Other Cookies. These cookies are used to understand how visitors interact with our websites and online services, including by helping us to assess the effectiveness of advertisements and web searches. Apple also uses these cookies to remember choices you make while browsing, so we can provide you with a customized experience.  If you prefer that Apple not use cookies, we provide you with the means to disable their use. If you want to disable cookies and you’re using the Safari web browser, choose “Block all cookies” in Safari’s privacy settings. If you are using a different browser, check with your provider to find out how to disable cookies. Certain features of the Apple website may not be available if all cookies are disabled. In addition to cookies, Apple uses other technologies that help us ac

In [22]:
retriever.get_relevant_documents("Does Microsoft use cookies?")

query='Microsoft cookies' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='organization', value='Microsoft') limit=None


[]

In [27]:
from langchain import PromptTemplate
import inspect

In [25]:
PromptTemplate

langchain.prompts.prompt.PromptTemplate

In [30]:
print(inspect.getdoc(PromptTemplate))

A prompt template for a language model.

A prompt template consists of a string template. It accepts a set of parameters
from the user that can be used to generate a prompt for a language model.

The template can be formatted using either f-strings (default) or jinja2 syntax.

Example:

    .. code-block:: python

        from langchain import PromptTemplate

        # Instantiation using from_template (recommended)
        prompt = PromptTemplate.from_template("Say {foo}")
        prompt.format(foo="bar")

        # Instantiation using initializer
        prompt = PromptTemplate(input_variables=["foo"], template="Say {foo}")


In [34]:
help(PromptTemplate.save)

Help on function save in module langchain.schema.prompt_template:

save(self, file_path: 'Union[Path, str]') -> 'None'
    Save the prompt.
    
    Args:
        file_path: Path to directory to save prompt to.
    
    Example:
    .. code-block:: python
    
        prompt.save(file_path="path/prompt.yaml")



In [36]:
PromptTemplate.save("pt.json")

TypeError: ignored