In [1]:
import os
import numpy as np 
import pandas as pd 
from langchain import HuggingFaceHub
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter
# Credentials must never be committed with the notebook: the Hugging Face token
# is read from the environment and only requested interactively when missing.
# NOTE(review): the token that used to be hardcoded here is exposed in the file
# history and should be revoked on the Hugging Face account.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    from getpass import getpass
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass('Hugging Face Hub API token: ')

# Hub repository of the model queried later in the notebook.
repo_id = "tiiuae/falcon-7b"
# Spreadsheet read into `excel_data` below.
excel_path = "../data/Data_Mortgage.xlsx"

In [2]:
# Remote LLM served through the Hugging Face Hub; the API token is read from
# the environment and the generation settings are forwarded as model kwargs.
llm = HuggingFaceHub(
    repo_id=repo_id,
    huggingfacehub_api_token=os.environ.get('HUGGINGFACEHUB_API_TOKEN'),
    model_kwargs={"temperature": 0.6, "max_new_tokens": 500},
)


In [None]:

# Load the spreadsheet and keep only the rows that have an 'Opportunity' value.
# The frame is built in a single expression (no inplace mutation), so re-running
# the cell always produces the same result. `subset` is passed as a list, the
# form accepted by every pandas version.
excel_data = pd.read_excel(excel_path).dropna(subset=['Opportunity'])
excel_data.head()

### Data Loader 

In [None]:

def data_loader(tmp_path, chunk_size=1000, overlap=0):
    """Load conversation records from a JSON file and split them into chunks.

    Each element selected by the jq schema ``.data[]`` becomes one document
    whose content is the record's ``text`` field; the record's ``customer``,
    ``language`` and ``call duration`` fields are copied into the metadata.

    Args:
        tmp_path: Path of the JSON file to load.
        chunk_size: Value passed as ``chunk_size`` to ``CharacterTextSplitter``.
        overlap: Value passed as ``chunk_overlap`` to ``CharacterTextSplitter``.

    Returns:
        The documents returned by ``CharacterTextSplitter.split_documents``.
    """
    # (metadata key, record field) pairs copied onto every loaded document.
    copied_fields = (
        ('customer', 'customer'),
        ('language', 'language'),
        ('duration', 'call duration'),
    )

    def metadata_func(record: dict, metadata: dict) -> dict:
        # Missing record fields end up as None because of dict.get.
        for meta_key, record_field in copied_fields:
            metadata[meta_key] = record.get(record_field)
        return metadata

    conversation_docs = JSONLoader(
        file_path=tmp_path,
        jq_schema='.data[]',
        content_key="text",
        metadata_func=metadata_func,
    ).load()

    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_documents(documents=conversation_docs)

# Chunk the processed audio-call data with the default chunk size and a
# chunk overlap of 50.
doc = data_loader(
    tmp_path="../data/Processed_data/Audio_data.json",
    overlap=50,
)


### Creating Vector Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from typing import List

class HuggingFaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a Sentence Transformer model.

    Both embedding methods return plain Python lists of floats, matching their
    annotations, rather than the array object produced by ``encode``.
    """

    def __init__(self, model_id='multi-qa-mpnet-base-dot-v1'):
        """Load the Sentence Transformer model identified by ``model_id``.

        Args:
            model_id: Model name or path handed to ``SentenceTransformer``.
        """
        # Original author's note: should use the GPU by default.
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using a locally running
           Hugging Face Sentence Transformer model
        Args:
            texts: The list of texts to embed.
        Returns:
            List of embeddings, one for each text.
        """
        # encode() hands back an array; convert it so callers really receive
        # the List[List[float]] promised by the Embeddings interface.
        embeddings = self.model.encode(texts)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a locally running HF
        Sentence Transformer.
        Args:
            text: The text to embed.
        Returns:
            Embeddings for the text.
        """
        embedding = self.model.encode(text)
        return embedding.tolist()
    
def save_to_local_vectorstore(docs, embedding):
    """Build a FAISS vector store from ``docs``.

    Despite its name, this function does not write anything to disk; it only
    returns the store created by ``FAISS.from_documents``.

    Args:
        docs: Documents to index.
        embedding: Embedding object passed to ``FAISS.from_documents``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ImportError: If the FAISS integration cannot be imported.
    """
    try:
        from langchain.vectorstores import FAISS
        return FAISS.from_documents(documents=docs, embedding=embedding)
    except ImportError as err:
        # Raise a real exception type: raising a bare string is itself a
        # TypeError and would hide the underlying import problem.
        raise ImportError(
            "{} no module FAISS found. use pip install faiss-cpu".format(err)
        ) from err
    
# Embedding model shared by the index-building and index-loading cells;
# overrides the class default with 'all-mpnet-base-v2'.
huggingface_embeddings = HuggingFaceEmbeddings(
    model_id='all-mpnet-base-v2',
)

#### Save Vector Embeddings in FAISS 

In [None]:
from langchain.vectorstores import FAISS

# Turn the document chunks into vectors held in an in-memory FAISS index
# (their metadata is stored alongside), then persist the index to disk.
vectorstore = FAISS.from_documents(documents=doc, embedding=huggingface_embeddings)
vectorstore.save_local('../data/faiss_dmac_gpt_falcon')

### Prompt 

In [None]:
from langchain.prompts import PromptTemplate
def create_prompt():
    """Build the lead-analysis prompt.

    Returns:
        A ``PromptTemplate`` with the input variables ``context``,
        ``question`` and ``chat_history``.
    """
    # The template text is sent to the model verbatim, whitespace included.
    lead_analysis_template = """
    Analyze conversations between customer and sales executive from context.
    If customer shows interest in service or Property , conversation is a potential lead for business.  
    Always answer point wise with person names. Don't make up answers
   
    {context}
   
    {chat_history}
   
    Question: {question}
    Answer stepwise: 
    """
    return PromptTemplate(
        template=lead_analysis_template,
        input_variables=["context", "question", "chat_history"],
    )


#### Load Embeddings 

In [None]:
# Reload the FAISS index from disk with the same embedding object used to
# build it.
# NOTE(review): load_local presumably unpickles part of the saved store, so
# only load directories this notebook created - confirm against the installed
# LangChain version.
vectordb = FAISS.load_local(
    '../data/faiss_dmac_gpt_falcon/',
    embeddings=huggingface_embeddings,
)

### Query With Falcon Model

In [None]:
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain

# Two standalone chains on the same LLM: a map_reduce question-answering chain
# and an LLMChain driven by the custom prompt.
doc_chain = load_qa_chain(llm, chain_type='map_reduce')
question_generator = LLMChain(prompt=create_prompt(), llm=llm)


In [None]:
# Conversational retrieval chain: documents come from the FAISS retriever and
# the custom prompt is handed to the combine-documents step. The chain's
# default question generator is used (no question_generator is passed).
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(),
    combine_docs_chain_kwargs=dict(prompt=create_prompt()),
)
# Running conversation log as (question, answer) tuples, oldest first.
chat_history = list()

### Question & Answer

In [None]:
# Turn 1: ask for a summary, print the exchange, then record it so that later
# turns receive it as chat history.
query = "Summarize conversations  ? "
result = chain(dict(question=query, chat_history=chat_history))

print(
    "query:{}".format(result['question']),
    "response:\n{}".format(result['answer']),
    sep="\n",
)
chat_history.append((query, result['answer']))

In [None]:
# Follow-up turn: ask which customers look like potential leads, print the
# exchange, then append it to the running chat history.
query = "Names of customers who can be potential lead ? "
result = chain(dict(question=query, chat_history=chat_history))

print(
    "query:{}".format(result['question']),
    "response:\n{}".format(result['answer']),
    sep="\n",
)
chat_history.append((query, result['answer']))