In [2]:
!pip -q install langchain openai tiktoken PyPDF2 faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
import os
from getpass import getpass

import pandas as pd
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

#### Set up the environment

In [10]:
# Provide the OpenAI API key without writing it into the notebook source.
# A key already exported as OPENAI_API_KEY is reused as-is; otherwise it is
# requested interactively (input is not echoed), so the secret never ends up
# in the saved notebook or in version control.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [6]:
# Read the meetings CSV and take one transcript out of it as a single string
TRANSCRIPTS_CSV = '/content/sugarwork_production_public_meetings.csv'
df = pd.read_csv(TRANSCRIPTS_CSV)
conversation = df.iat[0, 1]  # first row, second column

# Preview: first 1000 characters of the transcript
conversation[:1000]

"nicole: Hello.\nAdeesha Ekanayake-Weber: Good. How are you? I can hear myself. Can you hear me. I can hear you, but can I still hear me? No, I can't hear myself anymore. Cool. I am well. Good.\nnicole: I message you the topics that I want to make sure that we get through. Did you put them in? I actually don't have prod open right now. I did. And in fact, I made sure we\nAdeesha Ekanayake-Weber: do it right. Because. We should be experimenting. We should use this to experiment with new ways of summarizing. So here it is all the questions. So I think we can also do is we can have a kind of summary. Where. We organize all of the things that we discussed. Or I guess all of the items we discussed, we should organize them under the questions. And I think that would be one good way of summarizing, don't you think? Yeah, that sounds great.\nnicole: I want to just log in prod. What is the organization name again? Is it just sugar, work.\nAdeesha Ekanayake-Weber: Sugarwork hyphen SSO but you sh

#### Chunk the text

In [7]:
# Cut the transcript into overlapping chunks so each one can be embedded and indexed
splitter_options = dict(
    separator=".",        # split on full stops
    chunk_size=1000,      # target chunk size, as measured by length_function
    chunk_overlap=500,    # stride: neighbouring chunks may share up to 500 units
    length_function=len,  # sizes are counted in characters
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(conversation)

#### Making Embeddings

In [11]:
# Create the OpenAI embeddings client; nothing is embedded until the index is built
embeddings = OpenAIEmbeddings()

In [24]:
# Embed every chunk and store the vectors in a FAISS index for similarity lookup
docsearch = FAISS.from_texts(texts=texts, embedding=embeddings)

#### Setup the Chain

In [19]:
# Custom QA prompt: tells the model the context comes from a knowledge-transfer
# conversation and that it should admit when it does not know the answer.
# {context} and {question} are the two placeholders filled in by the chain.
prompt_template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "The contexts are based on a conversation of a knowledge transfer between two colleagues.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [20]:
# Candidate questions for a quick check of the retrieval + QA pipeline
sample_queries = (
    "What is the organization name",
    "Why do we need a docker file?",
    "how to find the url for the database?",
)

# Change the index to try one of the other questions
query = sample_queries[0]


In [21]:
# Retrieve the 4 chunks from the FAISS index that are most similar to the query
docs = docsearch.similarity_search(query, k=4)

In [22]:
# Build a "stuff" QA chain around a temperature-0 OpenAI model with the custom prompt
llm = OpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

# Run it on the retrieved chunks; return_only_outputs=False keeps the inputs in the result
chain_inputs = {"input_documents": docs, "question": query}
result = chain(chain_inputs, return_only_outputs=False)

# The model's answer
result["output_text"]

' The organization name is Sugarwork hyphen SSO.'

In [23]:
# Show the retrieved chunks that were passed to the LLM to produce the answer above
result["input_documents"]

[Document(page_content="We should use this to experiment with new ways of summarizing. So here it is all the questions. So I think we can also do is we can have a kind of summary. Where. We organize all of the things that we discussed. Or I guess all of the items we discussed, we should organize them under the questions. And I think that would be one good way of summarizing, don't you think? Yeah, that sounds great.\nnicole: I want to just log in prod. What is the organization name again? Is it just sugar, work.\nAdeesha Ekanayake-Weber: Sugarwork hyphen SSO but you should be able to log. In through the SSO link that you get from email, but yeah, you can just type in sugar work if an SSO. Continue.\nnicole: Microsoft. Last time I had everything logged in. It was on my phone. And so I'm not because I took that call when we did the recording last time. Wait. It didn't log me in.\nAdeesha Ekanayake-Weber: What do you see. Yeah, let me check. Do you have an sso user? I thought\nnicole: we 

Future Experiments:


1.   Preprocess the data
2.   Tune the OpenAI model's hyperparameters
3.   Add memory to the QA chain so a conversation can continue across questions
4.   Use an open-source LLM (Llama 2)
5.   Host the model on an AWS SageMaker endpoint
