# Setup

In [1]:
import numpy as np
from groq import Groq
import os

from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [2]:
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")

client = Groq(api_key=groq_api_key)
model = "mixtral-8x7b-32768"

loader = DirectoryLoader("./data/", use_multithreading=True, loader_cls=TextLoader)
documents = loader.load()

# V1

In [4]:
# Extract text from each document using the 'page_content' attribute
texts = [doc.page_content for doc in documents]

text_splitter = TokenTextSplitter(
    chunk_size=500,  # 500 tokens is the max
    chunk_overlap=20  # Overlap of N tokens between chunks (to reduce chance of cutting out relevant connected text like middle of sentence)
)

# Join texts into a single string
joined_text = " ".join(texts)
chunks = text_splitter.split_text(joined_text)

chunk_embeddings = []
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
for chunk in chunks:
    chunk_embeddings.append(embedding_function.embed_query(chunk))

ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /gpt-2/encodings/main/vocab.bpe (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x32c68dd00>: Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno 8] nodename nor servname provided, or not known)"))

In [107]:
def contrary_memo_chat(client, model, user_question, relevant_excerpts):
    # Convert the list of relevant excerpts to a single string
    relevant_excerpts_str = "\n\n".join(relevant_excerpts)
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are an expert venture analyst. Given the user's question and relevant excerpts from venture research memos, answer the question by including direct quotes from the memos. When using a quote, site the memo that it was from (ignoring the chunk)." 
            },
            {
                "role": "user",
                "content": f"User Question: {user_question}\n\nRelevant Memo Excerpt(s):\n\n{relevant_excerpts_str}",
            }
        ],
        model=model
    )

    response = chat_completion.choices[0].message.content
    return response

user_question = "What does Calm do?"
prompt_embeddings = embedding_function.embed_query(user_question) 
similarities = cosine_similarity([prompt_embeddings], chunk_embeddings)[0] 
closest_similarity_index = np.argmax(similarities) 
most_relevant_chunk = chunks[closest_similarity_index]
display(HTML(most_relevant_chunk))
contrary_memo_chat(client, model, user_question, most_relevant_chunk)

'Calm is a digital health and wellness company that offers various features to support user\'s mental and emotional well-being. The "for work" section of the Calm app includes content aimed at supplementing employee wellness programs. This includes features like sleep aids, meditation sessions, and relaxation exercises (Memo 1). Calm has secured several sessions centered around "chief purpose officer" Jay Shetty, who joined Calm as a popular mindfulness influencer and is now a popularity metric for the app (Memo 1). Calm\'s sessions aim to improve feelings of wellness, reduce stress, and help users relax (Memo 1).\n\nIn addition to these features, Calm offers masterclasses led by experts, sleep aids, and guided meditations, as well as various programs for users to engage with in order to promote mental well-being (Memo 2). Calm has also developed partnerships with companies to offer services to their employees, and has integrated its services into workplace wellness programs (Memo 2).\

# V2

In [3]:
documents = []
for filename in os.listdir("./data/"):
    if filename.endswith(".txt"):
        with open(os.path.join("./data/", filename), 'r', encoding='utf-8') as file:
            content = file.read()
            chunks = text_splitter.split_text(content)
            total_chunks = len(chunks)
            for chunk_num in range(1,total_chunks+1):
                chunk = chunks[chunk_num-1]
                documents.append(Document(page_content=chunk, metadata={"source": "local"}))

print(len(documents))
docsearch = Chroma.from_documents(documents, embedding_function)

NameError: name 'text_splitter' is not defined

In [None]:
def contrary_memo_chat(client, model, user_question, relevant_excerpts):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are an expert venture analyst. Given the user's question, use the relevant excerpts to answer the question in two sentences."
            },
            {
                "role": "user",
                "content": f"User Question: {user_question}. Excerpts: {relevant_excerpts}",
            }
        ],
        model=model
    )

    response = chat_completion.choices[0].message.content
    return response

user_question = "What does Ramp do?"
relevant_excerpts = docsearch.similarity_search(user_question)
# print(relevant_excerpts)
contrary_memo_chat(client, model, user_question, relevant_excerpts)

'Ramp is a company that offers automated expense management, procurement, and payment solutions to businesses of all sizes. It provides features such as complex approval chains, automated memo suggestions, receipt handling, and time-saving tools. Ramp has experienced significant growth in recent years, with a large increase in accounts payable spend, purchase volume, revenue, and customer base. It has also launched a corporate travel tool and announced the forthcoming launch of an AI agent for customer education.'