In [14]:
import os
import numpy as np 
import pandas as pd 
from langchain import HuggingFaceHub
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter
# Never commit API tokens to the notebook. The token is taken from the
# environment; when it is missing, it is requested interactively so that it
# never ends up in the saved file.
# NOTE(review): a token was previously hardcoded on this line - treat it as
# leaked and revoke it in the Hugging Face account settings.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    from getpass import getpass
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass('Hugging Face Hub API token: ')

# Hub model queried by the notebook (alternative kept in the trailing comment).
repo_id =  "tiiuae/falcon-7b" # "medmac01/moroccan-qa-falcon-7b-v3"
# Spreadsheet read into `excel_data` further down.
excel_path = "../data/Data_Mortgage.xlsx"

In [15]:
# LLM handle for the Hub-hosted model named by `repo_id`; the token is read
# from the environment variable configured in the first cell.
llm = HuggingFaceHub(
    repo_id=repo_id,
    huggingfacehub_api_token=os.getenv('HUGGINGFACEHUB_API_TOKEN'),
    model_kwargs=dict(temperature=0.6, max_new_tokens=250),
)

In [16]:

# Read the spreadsheet and keep only the rows that have an 'Opportunity' value.
# The filtered frame is assigned directly instead of mutating with
# inplace=True, and `subset` is given as a list, which every pandas release
# accepts (a bare string is only accepted by newer ones - TODO confirm the
# minimum version in use).
excel_data = pd.read_excel(excel_path).dropna(subset=['Opportunity'])
excel_data.head()

Unnamed: 0,Date,Company / Account,Opportunity,Unnamed: 3,Lead,Assigned,Priority,Status,Task,Ameyo Recording URL,Call Type,CallDurationInSeconds
1,4/4/2023,Mohammed Jaffer,Mohammed Jaffer,Mohammed Jaffer,,Yaseen Syed Ali,Low,Completed,True,https://prypto-api.aswat.co/surveillance/recor...,Outbound,422
2,4/4/2023,G Abbas,G Abbas,G Abbas,,Yaseen Syed Ali,Low,Completed,True,https://prypto-api.aswat.co/surveillance/recor...,Outbound,237
6,4/4/2023,Ahsan Khan,Ahsan Khan,Ahsan Khan,,Yaseen Syed Ali,Low,Completed,True,https://prypto-api.aswat.co/surveillance/recor...,Outbound,74
11,4/5/2023,Fayiqa Iftikhar,Fayiqa Iftikhar,Fayiqa Iftikhar,,Yaseen Syed Ali,Low,Completed,True,https://prypto-api.aswat.co/surveillance/recor...,Outbound,481
13,4/5/2023,Smith Suresh Shetty,Smith Suresh Shetty,Smith Suresh Shetty,,Yaseen Syed Ali,Low,Completed,True,https://prypto-api.aswat.co/surveillance/recor...,Outbound,269


### Data Loader 

In [17]:

def data_loader(tmp_path, chunk_size=1000, overlap=0):
    """Load conversation records from a JSON file and split them into chunks.

    Args:
        tmp_path: Path of the JSON file handed to ``JSONLoader``.
        chunk_size: Value passed as ``chunk_size`` to ``CharacterTextSplitter``.
        overlap: Value passed as ``chunk_overlap`` to ``CharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
    """
    def attach_call_metadata(record: dict, metadata: dict) -> dict:
        # Copy the per-call fields from the raw record onto the document
        # metadata; keys missing from the record are stored as None.
        metadata.update(
            customer=record.get('customer'),
            language=record.get('language'),
            duration=record.get('call duration'),
        )
        return metadata

    # Each element of the top-level "data" array becomes one document whose
    # content is taken from its "text" key.
    loaded_docs = JSONLoader(
        file_path=tmp_path,
        jq_schema='.data[]',
        content_key="text",
        metadata_func=attach_call_metadata,
    ).load()

    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_documents(documents=loaded_docs)

# Chunked conversation documents (default chunk size, 50 overlap).
doc = data_loader(
    tmp_path="../data/Processed_data/Audio_data.json",
    overlap=50,
)


### Creating Vector Embeddings

In [18]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from typing import List

class HuggingFaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a locally loaded Sentence Transformer."""

    def __init__(self, model_id='multi-qa-mpnet-base-dot-v1'):
        """Load the Sentence Transformer model identified by ``model_id``."""
        # Device selection is left to SentenceTransformer (original note:
        # "Should use the GPU by default" - TODO confirm on the target machine).
        self.model = SentenceTransformer(model_id)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using a locally running
           Hugging Face Sentence Transformer model
        Args:
            texts: The list of texts to embed.
        Returns:
            List of embeddings, one for each text.
        """
        embeddings = self.model.encode(texts)
        # encode() does not return plain Python lists (a numpy array by
        # default, per the sentence-transformers docs). Convert each vector
        # the same way embed_query does so the result matches the annotation.
        return [list(map(float, vector)) for vector in embeddings]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using a locally running HF
        Sentence transformer.
        Args:
            text: The text to embed.
        Returns:
            Embeddings for the text.
        """
        embedding = self.model.encode(text)
        return list(map(float, embedding))
    
def save_to_local_vectorstore(docs, embedding):
    """Build a FAISS vector store from documents.

    Note: despite its name, this function only builds the store in memory and
    returns it; it does not write anything to disk.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        embedding: Embedding object passed to ``FAISS.from_documents``.

    Returns:
        The vector store created by ``FAISS.from_documents``.

    Raises:
        ImportError: If importing FAISS (or building the store) raises
            ImportError; the original error is chained as the cause.
    """
    try:
        from langchain.vectorstores import FAISS
        return FAISS.from_documents(documents=docs, embedding=embedding)
    except ImportError as err:
        # Raising a bare string is itself a TypeError and would hide the real
        # problem, so raise a proper ImportError chained to the original one.
        raise ImportError(
            "{} no module FAISS found. use pip install faiss".format(err)
        ) from err
    
# Single embedding-model instance shared by the index build and load cells.
huggingface_embeddings = HuggingFaceEmbeddings(model_id='multi-qa-mpnet-base-dot-v1')

#### Save Vector Embeddings in FAISS 

In [19]:
from langchain.vectorstores import FAISS

# One path for both the existence check and the save. The original check had
# a stray quote inside the string, so it never matched the saved directory
# and the index was rebuilt on every run.
faiss_index_dir = '../data/faiss_dmac_gpt_falcon'

if not os.path.isdir(faiss_index_dir):
    # Turn the docs into vectors (metadata included), hold them in RAM, then
    # persist the index to disk.
    vectorstore = FAISS.from_documents(documents=doc,
                                       embedding=huggingface_embeddings)
    vectorstore.save_local(faiss_index_dir)

### Prompt 

In [33]:
from langchain.prompts import PromptTemplate
def create_prompt():
    """Build the question-answering prompt.

    Returns:
        A ``PromptTemplate`` whose input variables are ``context`` and
        ``question``.
    """
    template_text = """
    Analyze conversations between customer and sales executive from context.
    If customer shows interest in service or Property , conversation is a potential lead for business.  
    Always answer point wise with person names. Don't make up answers
   
    {context}
   
    Question: {question}
    Answer stepwise: 
    """
    return PromptTemplate(template=template_text, input_variables=["context", "question"])


#### Load Embeddings 

In [34]:
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Reload the index from disk using the same embedding object.
# NOTE(review): load_local presumably unpickles files from this folder - only
# point it at index directories you created yourself; confirm against the
# installed LangChain version.
vectordb = FAISS.load_local(
    '../data/faiss_dmac_gpt_falcon/',
    embeddings=huggingface_embeddings,
)

### Query With Falcon Model

In [35]:
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain,RetrievalQA

# Stand-alone chains built on the same LLM: a prompt-driven LLMChain and a
# map_reduce question-answering chain.
question_generator = LLMChain(prompt=create_prompt(), llm=llm)
doc_chain = load_qa_chain(llm, chain_type='map_reduce')


In [53]:
# Retrieval QA chain: the FAISS index supplies the context and the custom
# prompt from create_prompt() is forwarded through chain_type_kwargs.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs=dict(prompt=create_prompt()),
)
chat_history = list()

### Question & Answer

In [56]:
query = "summarize provided conversations ? "
result = chain({"query": query})

# Print the question the chain received, then the generated answer.
for line_format, result_key in (("query:{}", 'query'), ("response:\n{}", 'result')):
    print(line_format.format(result[result_key]))

query:summarize provided conversations ? 
response:
 [email, Can I want. And he. Can you want. Yes, [email] [email]? Right. And go. The email. At. Okay.
# # I don's. So, You. Send. Send. Send. (At the. This is it. Email. A Email. Email. Email. If you can write to email. I's the details, So. You email. It's. So, So. I's. So instead of. The rest of the of the same. You have you's. Is it. You can have you can. Please. Your name? 4- You's and. Please. I's. Thank you. The name. The other one. You can I's. Please give us. The name my name. So, you's.





In [57]:
query = "customers who are potential lead ? "
result = chain({"query": query})

# Print the question the chain received, then the generated answer.
for line_format, result_key in (("query:{}", 'query'), ("response:\n{}", 'result')):
    print(line_format.format(result[result_key]))
# chat_history.append((query,result['answer']))

query:customers who are potential lead ? 
response:
 Hi. Talk to the bank. Hi. Have you have you are not. I's. Sir. Hi. Can you. . Good morning. Hi. I's and can. You would you have a mortgage, how is it. You. Or to you is it. I's. I am. Okay. I's. And you have you are mortgage you's. Please. Can I have you are you? And I's. I's. And I's you are you't 200, and. I's. I's the.

This. Thank you. And I's. As I's. You's. Sir. Okay. Sir. And you's. And you. I's. Okay. Sir. You. And then. Or the house. You's. Sir. Your name? I's two names. And you. Come and on the key. And your leave the bank. I need you are you. The same. You's The. I need to the. Please. So the other , and I need to them. The time. I and I's. Okay. Meant


### Not working well; we might need to fine-tune the model on the provided data