# Chat with coffee review text data

In [4]:
import os
import openai
import sys
import nltk
from nltk.corpus import stopwords
import numpy as np
sw2 = stopwords.words("english")
from collections import Counter
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load variables from the `.env` file located by `find_dotenv()` into the
# process environment.
_ = load_dotenv(find_dotenv())

# Read the API key from the environment (populated by the `.env` file above)
# instead of assigning a literal placeholder string, which can never
# authenticate. `os.getenv` yields None when the variable is missing.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [6]:
# Create a PDF loader for the `coffee_review` file, load it into `pages`,
# and show how many entries were loaded as the cell output.
loader = PyPDFLoader("coffee_review.PDF")
pages = loader.load()
len(pages)

105

In [7]:
def get_patterns(corpus, stop_words=None, num_top=20):
    """Compute descriptive token statistics for a collection of strings.

    The strings in `corpus` are joined with single spaces and split again on
    whitespace. Tokens whose lower-cased form is a stop word are dropped; the
    remaining tokens are counted exactly as written, so case and attached
    punctuation are preserved (e.g. 'Sweet' and 'sweet' are distinct).

    Args:
        corpus: Iterable of strings (whole documents or single words).
        stop_words: Optional collection of lower-case words to exclude.
            When None, the module-level `sw2` stop-word list is used.
        num_top: Number of most frequent tokens to report (default 20).

    Returns:
        A dict with the keys:
            'tokens'            -- number of tokens kept,
            'unique_tokens'     -- number of distinct tokens kept,
            'avg_token_length'  -- mean token length, rounded to 2 places,
            'lexical_diversity' -- unique / total, rounded to 2 places,
            'Top_n'             -- list of (token, count) pairs, most
                                   frequent first.
        When no tokens remain, the counts are 0, both ratios are 0.0 and
        'Top_n' is an empty list.
    """
    if stop_words is None:
        stop_words = sw2
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    excluded = set(stop_words)

    clean = [w for w in " ".join(corpus).split() if w.lower() not in excluded]

    total_tokens = len(clean)
    unique_tokens = len(set(clean))

    if total_tokens:
        avg_token_len = round(np.mean([len(w) for w in clean]), 2)
        lex_diversity = round(unique_tokens / total_tokens, 2)
    else:
        # Nothing left after filtering: avoid dividing by zero and the NaN
        # that a mean over an empty list would produce.
        avg_token_len = 0.0
        lex_diversity = 0.0

    return {
        'tokens': total_tokens,
        'unique_tokens': unique_tokens,
        'avg_token_length': avg_token_len,
        'lexical_diversity': lex_diversity,
        'Top_n': Counter(clean).most_common(num_top),
    }

In [8]:
# Collect every whitespace-separated word from every page of the PDF.
# `extend` grows the list in place; rebuilding it with `+` on each page would
# copy all the words gathered so far, which is quadratic in the word count.
words = []
for page in pages:
    words.extend(page.page_content.split())

In [9]:
# Compute the token statistics for the words extracted from the PDF with
# `get_patterns`. As the last expression in the cell, the returned dictionary
# is displayed as the cell output.
get_patterns(words)

{'tokens': 41064,
 'unique_tokens': 3070,
 'avg_token_length': 6.45,
 'lexical_diversity': 0.07,
 'Top_n': [('cup.', 1240),
  ('aroma', 1230),
  ('mouthfeel.', 1072),
  ('acidity;', 969),
  ('structure', 903),
  ('finish', 839),
  ('notes', 613),
  ('sweet', 581),
  ('cocoa', 568),
  ('Sweet', 476),
  ('chocolate,', 468),
  ('syrupy', 451),
  ('-toned', 441),
  ('-toned.', 352),
  ('-toned,', 337),
  ('nib,', 327),
  ('chocolate', 325),
  ('fruit', 322),
  ('dark', 307),
  ('Sweetly', 280)]}

In [10]:
# Name of the OpenAI chat model to use: "gpt-3.5-turbo".
# `load_db` reads this module-level variable when it creates its `ChatOpenAI` instance.

llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [11]:
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF file.

    The PDF is loaded, split into overlapping chunks, embedded, stored in an
    in-memory vector store, and wrapped in a `ConversationalRetrievalChain`.

    Args:
        file: Path of the PDF file to load.
        chain_type: Chain type forwarded to
            `ConversationalRetrievalChain.from_llm` (callers in this file
            pass "stuff").
        k: Value passed to the retriever as `search_kwargs={"k": k}`, i.e.
            how many chunks the similarity search returns per query.

    Returns:
        The configured chain. It returns the source documents and the
        generated question alongside the answer. No memory object is
        attached; the caller supplies the chat history on every call.
    """
    # Take the API key from the environment rather than passing a hard-coded
    # placeholder string, which the OpenAI API would reject.
    api_key = os.getenv("OPENAI_API_KEY")

    # load documents
    documents = PyPDFLoader(file).load()

    # split documents into 1000-character chunks that overlap by 150
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    # define embedding
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)

    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0, openai_api_key=api_key),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa


In [12]:
import panel as pn
import param

# Chatbot state holder: a Parameterized class with four parameters plus the
# methods that the dashboard binds to its widgets.
class cbfs(param.Parameterized):
    """State and callbacks for the PDF question-answering chatbot.

    Several methods read the module-level widgets `file_input`, `button_load`
    and `inp`, which are created after this class is defined; they must exist
    by the time those methods are called.
    """

    chat_history = param.List([])  # (question, answer) tuples of the conversation so far
    answer = param.String("")  # answer to the most recent question
    db_query = param.String("")  # question generated by the chain for the last lookup
    db_response = param.List([])  # source documents returned for the last question

    def __init__(self, **params):
        """Initialise the parameters and build the chain for the default PDF."""
        super().__init__(**params)
        # Panel rows making up the rendered conversation.
        self.panels = []
        # NOTE(review): the earlier loader cell opens "coffee_review.PDF"
        # (upper-case extension). On a case-sensitive filesystem only one of
        # the two spellings can exist -- confirm the real file name.
        self.loaded_file = "coffee_review.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Load a newly selected PDF and report which file is active.

        Args:
            count: Click count of the load button. Zero means the dashboard
                is only being initialised.

        Returns:
            A Markdown pane naming the currently loaded file. When `count` is
            zero or no file has been selected, nothing is reloaded. Otherwise
            the selected file is saved as "temp.pdf", the chain is rebuilt
            from it, and the chat history is cleared.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

        file_input.save("temp.pdf")  # local copy of the selected file
        button_load.button_style = "outline"  # visual cue while the database is rebuilt
        try:
            self.qa = load_db("temp.pdf", "stuff", 4)
        finally:
            # Restore the button even if loading fails, so it is never left
            # in the "outline" state.
            button_load.button_style = "solid"
        # Record the new file name only once its chain has been built, so
        # `loaded_file` always matches the chain in `self.qa`.
        self.loaded_file = file_input.filename
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Answer `query` and return the rendered conversation.

        The query and the current chat history are passed to the chain. The
        answer, generated question and source documents are stored on the
        instance, and one user row plus one chatbot row are appended to
        `self.panels`.

        Args:
            query: The user's question. An empty value produces an empty
                placeholder box without calling the chain.

        Returns:
            A scrollable WidgetBox containing all conversation rows so far.
        """
        # if the user did not provide a query then return an empty markdown.
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)

        # get the result from the language model for this query.
        result = self.qa({"question": query, "chat_history": self.chat_history})

        # add the query and its answer to the chat history
        self.chat_history.extend([(query, result["answer"])])
        # keep the generated question and the retrieved documents for the Database tab
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']

        # `styles` (not `style`) is the keyword used by every other pane in
        # this class; keep the chatbot row consistent with them.
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator
        return pn.WidgetBox(*self.panels, scroll=True)

    # The dependency name must match the parameter name exactly; a trailing
    # space would refer to an attribute that does not exist.
    @param.depends('db_query')
    def get_lquest(self):
        """Return a layout showing the last question sent to the database.

        When `db_query` is empty, a note that there were no database accesses
        is shown instead.
        """
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Return a box listing the documents of the last database response.

        Returns None when `db_response` is empty. Otherwise returns a
        scrollable WidgetBox with a header row followed by one row per
        document.
        """
        if not self.db_response:
            return None
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(doc)) for doc in self.db_response)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    # Declares dependencies on the `convchain` and `clr_history` methods.
    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Return a box showing the current chat history.

        Shows "No History Yet" when `chat_history` is empty. Otherwise shows
        a header row followed by one row per (question, answer) exchange.
        """
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(exchange)) for exchange in self.chat_history)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Clear the chat history.

        Args:
            count: Unused. It lets the method be registered as a button
                click callback, which is called with one argument.
        """
        self.chat_history = []
        return


In [13]:
# Assemble the Panel dashboard for the chatbot application. It has four tabs:
# "Conversation", "Database", "Chat History" and "Configure", holding the
# components for user input, conversation display, database queries and
# configuration options respectively.

pn.extension()

# Chatbot state object whose methods back the widgets below.
cb = cbfs()

# --- widgets -----------------------------------------------------------------
file_input = pn.widgets.FileInput(accept='.pdf')  # file picker for a new PDF
button_load = pn.widgets.Button(name="Load DB", button_type='primary')  # rebuilds the database
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')  # resets the chat history
inp = pn.widgets.TextInput(placeholder='Enter text here…')  # question entry box
jpg_pane = pn.pane.Image('./img/convchain.jpg')  # illustration shown on the Configure tab

# --- wiring ------------------------------------------------------------------
# Clicking "Clear History" empties the chat history.
button_clearhistory.on_click(cb.clr_history)
# The load callback is driven by the click count of the "Load DB" button.
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
# The conversation view is driven by the text input.
conversation = pn.bind(cb.convchain, inp)

# --- tabs --------------------------------------------------------------------
# Tab 1: question input above the running conversation.
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation, loading_indicator=True, height=300),
    pn.layout.Divider(),
)

# Tab 2: last question sent to the database and the documents it returned.
tab2 = pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources),
)

# Tab 3: the current chat history.
tab3 = pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)

# Tab 4: load a different PDF or clear the chat history.
tab4 = pn.Column(
    pn.Row(file_input, button_load, bound_button_load),
    pn.Row(button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic")),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400)),
)

# --- main layout -------------------------------------------------------------
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(
        ('Conversation', tab1),
        ('Database', tab2),
        ('Chat History', tab3),
        ('Configure', tab4),
    ),
)
dashboard