In [None]:
# Core libraries
!pip install langchain
!pip install langchain-core
!pip install sentence-transformers
!pip install tiktoken
! pip install -U google-generativeai

# Large language models and vector stores
!pip install openai  # Upgrade later if needed
!pip install groq  # If using Groq for acceleration
!pip install pinecone-client
!pip install chromadb

# Langchain extensions and integrations
!pip install langchain-community
!pip install langchain-pinecone
!pip install langchain-groq  # If using Groq

# Additional dependencies
!pip install huggingface_hub

# Upgrades
!pip install -U langchain-community
!pip install --upgrade openai
!pip install --upgrade langchain-pinecone
!pip install -qU langchain-groq  # Quiet upgrade





In [None]:
from google.colab import userdata
KEY_groq = userdata.get('GROQ_API_KEY')
KEY_openai = userdata.get('OPENAI_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [None]:
import os
import json
import datetime
import pandas as pd
import warnings

# NLP Libraries
import transformers
import torch

import os
import google.generativeai as genai

# Langchain Core
from langchain.schema import HumanMessage, AIMessage, ChatMessage
from langchain_core.tools import tool
from langchain_core.pydantic_v1 import BaseModel, Field

# Langchain LLMs and Agents
from langchain.llms import OpenAI, HuggingFacePipeline, CTransformers
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, load_tools, initialize_agent

# Langchain Chains and Prompts
from langchain.chains import LLMChain, SequentialChain, RetrievalQA, ConversationChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_core.messages import SystemMessage

# Langchain Memory
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

# Langchain Document Processing
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Langchain Embeddings and Vector Stores
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Groq Integration (if used)
from groq import Groq
from langchain_groq import ChatGroq

# External Libraries
import pinecone

In [None]:
transcript_df = pd.read_csv('/content/drive/MyDrive/DataSets/Earnings_21.csv')

In [None]:
transcript_df.head(11)

Unnamed: 0,file_id,audio_length,sample_rate,company_name,financial_quarter,sector,speaker_switches,unique_speakers,curator_id,transcription
0,4320211,3285.848,24000,Monro Inc,3,Consumer Goods,82,10,1,"Good morning ladies and gentlemen, and welcome..."
1,4330115,2458.904,24000,Culp Inc,3,Industrial Goods,43,8,1,Good day and welcome to Culp's third quarter 2...
2,4341191,5740.64,24000,General Electric,1,Conglomerate,147,14,1,Good morning and welcome to the first quarter ...
3,4344338,2721.169,44100,Danaher Corp,1,Conglomerate,51,7,1,My name is Christelle and I will be your confe...
4,4344866,3275.456,24000,Spire Inc,2,Utilities,82,10,8,"Good morning, and welcome to the Spire Second ..."
5,4346818,3972.022,11025,Ingersoll Rand,1,Industrial Goods,99,14,0,Ladies and gentlemen thank you for standing by...
6,4346923,4709.418,16000,Cementos Argos,1,Industrial Goods,120,20,1,"Hello gentlemen, gent- ladies and gentlemen, a..."
7,4359732,4887.498,44100,Kuehne Nagel International,2,Services,114,13,9,"Ladies and gentlemen, welcome to Kuehne + Nage..."
8,4359971,3759.944,24000,Constellium,2,Industrial Goods,116,10,8,"Ladies and gentlemen, thank you for standing b..."
9,4360366,3906.752,24000,Travelers Companies Inc,2,Financial,104,15,8,"Good morning, ladies and gentlemen. Welcome to..."


In [None]:
summary_df = pd.read_csv('/content/drive/MyDrive/DataSets/Earnings_21_summarized_transcriptions.csv')

In [None]:
summary_df

Unnamed: 0,file_id,summary
0,4320211,Monro Inc. held its third-quarter earnings con...
1,4330115,"Culp, Inc. hosted a conference call to discuss..."
2,4341191,Given the extensive nature of the text provide...
3,4344338,Danaher Corporation's first quarter 2020 earni...
4,4344866,The Spire Second Quarter Earnings call began w...
5,4346818,The conference call transcript is from Ingerso...
6,4346923,The conference call discussed the first quarte...
7,4359732,Here's a 400-500 word summary of Kuehne + Nage...
8,4359971,Constellium reported its second-quarter earnin...
9,4360366,"The Travelers Companies, Inc. held a conferenc..."


In [None]:
# Load the DataFrame from the pickle file
chunks_df = pd.read_pickle('/content/drive/MyDrive/DataSets/Earnings_21_chunks_with_embeddings.pkl')

In [None]:
chunks_df.head()

Unnamed: 0,file_id,chunk,embedding
0,4320211,Monro Inc. held its third-quarter earnings con...,"[0.030996237, 0.007431771, -0.00041755062, 0.0..."
1,4320211,The company's store rebranding and re-imaging ...,"[-0.031461723, -0.025675885, 0.028074343, -0.0..."
2,4320211,Monro Inc. has also made significant progress ...,"[0.050692648, -0.07287536, -0.02692151, -0.042..."
3,4320211,The company's guidance for fiscal 2020 has bee...,"[-0.009417127, 0.008365667, 0.057727903, 0.065..."
4,4320211,"The company's CEO, Brett Ponton, stated that t...","[-0.062053658, -0.005355529, 0.004473756, 0.01..."


In [None]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

model_embedding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# embeddings = model_embedding.encode(sentences)
# print(embeddings)


In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("earningscall")

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer


def query_pinecone(query_text, top_k=5):
    # Generate the embedding for the query text
    query_embedding = model_embedding.encode(query_text).tolist()

    # Query the index
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    return results

# Example usage
query = "what did Monro Inc. reported  due to mild winter weather conditions"
results = query_pinecone(query)

# Print results
for match in results['matches']:
    print(f"Score: {match['score']:.2f}")
    print(f"Text: {match['metadata']}")
    print("Other Metadata:", {k:v for k,v in match['metadata'].items() if k != 'text'})
    print("---")

Score: 0.70
Text: {'text': 'Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.'}
Other Metadata: {}
---
Score: 0.62
Text: {'text': "* Monro Inc. reported a 0.9% decline in comparable store sales due to mild winter weather conditions.\n* The company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales.\n* The company has made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model."}
Other Metadata: {}
---
S

In [None]:
# model_Llama = "llama-3.1-70b-versatile"
# model="llama-3.1-8b-instant"
# # model_Llama="llama3-70b-8192",
# groq_chat = ChatGroq(groq_api_key=KEY_groq, model_name=model,temperature=0.25)
# llm = groq_chat

In [None]:
model_Llama = "llama-3.1-70b-versatile"
# model_Llama="llama3-70b-8192",
model="llama-3.1-8b-instant"
groq_chat_1 = ChatGroq(groq_api_key=KEY_groq, model_name=model,temperature=0.25)
llm_1 = groq_chat_1

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import BaseRetriever
from langchain.docstore.document import Document
from typing import List

retrieved_docs_list = []  # To store retrieved documents for each query
predictions = []  # To store final answers provided by the system

def query_pinecone(query_text, top_k=5):
    # Generate the embedding for the query text
    query_embedding = model_embedding.encode(query_text).tolist()

    # Query the index
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    return results

class CustomPineconeRetriever(BaseRetriever):
    def get_relevant_documents(self, query: str) -> List[Document]:
        results = query_pinecone(query)
        docs = []
        for match in results['matches']:
            metadata = match['metadata']
            text = metadata.pop('text', '')  # Remove 'text' from metadata and use it as the main content
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        return self.get_relevant_documents(query)

# Initialize the language model
llm = groq_chat_1

# Set up a custom prompt template
prompt_template="""
Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

Context: {context}
Question: {question}

Provide a clear and helpful answer below:
Answer:
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chain_type_kwargs = {"prompt": PROMPT}

# Create the custom retriever
custom_retriever = CustomPineconeRetriever()

# Set up the RetrievalQA chain with the custom retriever
qa = RetrievalQA.from_chain_type(
    llm=groq_chat_1,
    chain_type="stuff",
    retriever=custom_retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

# Interactive QA loop
while True:
    user_input = input("Ask a question (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break

    result = qa({"query": user_input})

    # Store retrieved documents
    source_docs = result["source_documents"]
    retrieved_docs = [{"text": doc.page_content, "metadata": doc.metadata} for doc in source_docs]
    retrieved_docs_list.append(retrieved_docs)

    # Store the prediction
    predictions.append(result["result"])

    # Print the response and source documents
    print("\nResponse:", result["result"])
    print("\nSource Documents:")
    for doc in source_docs:
        print(f"- {doc.page_content[:400]}...")  # Print first 100 chars of each source

    print("\n" + "-"*50 + "\n")

# After the QA loop, you can evaluate the system using retrieved_docs_list and predictions.


Ask a question (or type 'exit' to quit): what did monro incs ceo and cfo stated


  warn_deprecated(



Response: Monro Inc.'s CEO, Brett Ponton, stated that the company is committed to driving the necessary changes to improve its business and is confident in its ability to execute its strategy.

Monro Inc.'s CFO, Brian D'Ambrosia, noted that the company's guidance assumes relatively stable overall tire and oil costs for the balance of fiscal 2020.

Source Documents:
- Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire cate...
- Monro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisitions to add approximately $45

ServiceException: (504)
Reason: Gateway Time-out
HTTP response headers: HTTPHeaderDict({'Server': 'awselb/2.0', 'Date': 'Wed, 21 Aug 2024 17:09:29 GMT', 'Content-Type': 'text/html', 'Content-Length': '132', 'Connection': 'keep-alive'})
HTTP response body: <html>
<head><title>504 Gateway Time-out</title></head>
<body>
<center><h1>504 Gateway Time-out</h1></center>
</body>
</html>



In [None]:
# import pinecone
# from sentence_transformers import SentenceTransformer
# from langchain.llms import OpenAI
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain.schema import BaseRetriever
# from langchain.docstore.document import Document
# from typing import List

# # Initialize the embedding model
# model_embedding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# # Initialize Pinecone (make sure Pinecone is properly initialized)
# index = pinecone.Index('your-index-name')

# def query_pinecone(query_text, top_k=5):
#     # Generate the embedding for the query text
#     query_embedding = model_embedding.encode(query_text).tolist()

#     # Query the index
#     results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

#     return results

# class CustomPineconeRetriever(BaseRetriever):
#     def get_relevant_documents(self, query: str) -> List[Document]:
#         results = query_pinecone(query)
#         docs = []
#         for match in results['matches']:
#             metadata = match['metadata']
#             text = metadata.pop('text', '')  # Remove 'text' from metadata and use it as the main content
#             docs.append(Document(page_content=text, metadata=metadata))
#         return docs

#     async def aget_relevant_documents(self, query: str) -> List[Document]:
#         return self.get_relevant_documents(query)

# # Initialize the language model
# llm = OpenAI()

# # Set up a custom prompt template
# prompt_template="""
# Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

# Context: {context}
# Question: {question}

# Provide a clear and helpful answer below:
# Answer:
# """
# PROMPT = PromptTemplate(
#     template=prompt_template, input_variables=["context", "question"]
# )

# chain_type_kwargs = {"prompt": PROMPT}

# # Create the custom retriever
# custom_retriever = CustomPineconeRetriever()

# # Set up the RetrievalQA chain with the custom retriever
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=custom_retriever,
#     return_source_documents=True,
#     chain_type_kwargs=chain_type_kwargs
# )

# # Initialize lists to store retrieved docs and predictions
# retrieved_docs_list = []
# predictions = []

# # Interactive QA loop
# while True:
#     user_input = input("Ask a question (or type 'exit' to quit): ")
#     if user_input.lower() == 'exit':
#         break

#     result = qa({"query": user_input})

#     # Store retrieved documents
#     source_docs = result["source_documents"]
#     retrieved_docs = [{"text": doc.page_content, "metadata": doc.metadata} for doc in source_docs]
#     retrieved_docs_list.append(retrieved_docs)

#     # Store the prediction
#     predictions.append(result["result"])

#     # Print the response and source documents
#     print("\nResponse:", result["result"])
#     print("\nSource Documents:")
#     for doc in source_docs:
#         print(f"- {doc.page_content[:100]}...")  # Print first 100 chars of each source

#     print("\n" + "-"*50 + "\n")

# # After the QA loop, you can evaluate the system using retrieved_docs_list and predictions.


In [None]:
retrieved_docs_list

[[{'text': 'Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.',
   'metadata': {}},
  {'text': 'Monro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisitions to add approximately $45 million in annualized sales.',
   'metadata': {}},
  {'text': "* Monro Inc. reported a 0.9% decline in comparable store sales due to mild winter weather conditions.\n* The company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales

In [None]:
predictions

["Monro Inc.'s CEO, Brett Ponton, stated that the company is committed to driving the necessary changes to improve its business and is confident in its ability to execute its strategy.\n\nMonro Inc.'s CFO, Brian D'Ambrosia, noted that the company's guidance assumes relatively stable overall tire and oil costs for the balance of fiscal 2020.",
 'Monro Inc. reported a 0.9% decline in comparable store sales due to mild winter weather conditions in the Northeast and Midwest.',
 'Monro Inc. announced the acquisition of three companies with a total of 23 locations, which the company expects to add approximately $45 million in annualized sales.',
 'The net loss for Culp, Inc. was $5.1 million for the third quarter.',
 'The Danaher Business System (DBS) is the driving force of Danaher Corporation.',
 "I don't know.\n\nThe provided information is about Danaher Corporation's first quarter 2020 earnings conference call, but it does not mention the actual first quarter results. It only provides a 

In [None]:
def calculate_accuracy(predictions, ground_truths):
    correct_predictions = sum([1 for pred, true in zip(predictions, ground_truths) if pred == true])
    accuracy = correct_predictions / len(ground_truths)
    return accuracy


In [None]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    retrieved_k = retrieved_docs[:k]
    # Extract text content from the retrieved documents for comparison
    relevant_in_k = sum([1 for doc in retrieved_k if doc['text'] in relevant_docs])
    precision_k = relevant_in_k / k
    return precision_k


In [None]:
def recall(retrieved_docs, relevant_docs):
    # Extract text content from the retrieved documents for comparison
    relevant_retrieved = sum([1 for doc in retrieved_docs if doc['text'] in relevant_docs])
    recall_score = relevant_retrieved / len(relevant_docs)
    return recall_score


In [None]:
def mean_reciprocal_rank(retrieved_docs_list, relevant_docs_list):
    reciprocal_ranks = []

    for retrieved_docs, relevant_docs in zip(retrieved_docs_list, relevant_docs_list):
        for rank, doc in enumerate(retrieved_docs, start=1):
            if doc["text"] in relevant_docs:  # Extract text for comparison
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)  # No relevant document found in retrieved_docs

    return sum(reciprocal_ranks) / len(reciprocal_ranks)


In [None]:
# summary_df['summary'][0]

In [None]:
queries = [" what did monro incs ceo and cfo stated",
           'what did Monro Inc. reported  due to mild winter weather conditions',
           'what did monro incs announced about acquisitions',
           'how much a net loss for Culp, Inc',
           'what is driving force of danaher corp',
           'give Danaher Corporation first quarter results'
]

           # Sample queries
ground_truths = ["he company's CEO, Brett Ponton, stated that the company is committed to driving the necessary changes to improve its business and is confident in its ability to execute its strategy. The company's CFO, Brian D'Ambrosia, noted that the company's guidance assumes relatively stable overall tire and oil costs for the balance of fiscal 2020.",
                 'The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest',
                 'he company has announced the acquisition of three companies with a total of 23 locations, which are expected to add approximately $45 million in annualized sales.',
                 'The company reported a net loss of $5.1 million for the third quarter, compared to a pre-tax income of $4.3 million for the same period last year. The results were affected by a reversal of a $6.1 million recorded contingent earn-out liability and non-cash impairment charges of $13.6 million related to the home accessory division.',
                 ' He highlighted the companys position of strength, with a resilient portfolio of businesses, a talented team, and the Danaher Business System (DBS) as its driving force.',
                 ' first quarter results, with sales growing 3% to $4.3 billion, driven by 4.5% core revenue growth. The impact of foreign currency translation decreased revenues by 1.5%. Gross profit margin was 56.2%, and operating profit margin was 16.1%. Adjusted diluted net earnings per common share were $1.05.\n\nJoyce also discussed the companys outlook for the second quarter, with core revenue growth expected to be in the range of flat to down 10%.'

                 ]  # Expected answers

# for query, ground_truth in zip(queries, ground_truths):
#     result = qa({"query": query})
#     predictions.append(result["result"])
#     retrieved_docs = [doc.page_content for doc in result["source_documents"]]
#     retrieved_docs_list.append(retrieved_docs)



In [None]:
# Calculate evaluation metrics
accuracy = calculate_accuracy(predictions, ground_truths)

TypeError: 'in <string>' requires string as left operand, not dict

In [None]:
print(f"Accuracy: {accuracy}")

Accuracy: 0.0


In [None]:
precision_k = [precision_at_k(retrieved_docs, relevant_docs, k=5) for retrieved_docs, relevant_docs in zip(retrieved_docs_list, ground_truths)]
# recall = [recall(retrieved_docs, relevant_docs) for retrieved_docs, relevant_docs in zip(retrieved_docs_list, ground_truths)]
# mrr = mean_reciprocal_rank(retrieved_docs_list, ground_truths)


print(f"Precision@K: {sum(precision_k) / len(precision_k)}")
# print(f"Recall: {sum(recall) / len(recall)}")
# print(f"Mean Reciprocal Rank (MRR): {mrr}")

Precision@K: 0.03333333333333333


In [None]:
recall = [recall(retrieved_docs, relevant_docs) for retrieved_docs, relevant_docs in zip(retrieved_docs_list, ground_truths)]
# mrr = mean_reciprocal_rank(retrieved_docs_list, ground_truths)
print(f"Recall: {sum(recall) / len(recall)}")
# print(f"Mean Reciprocal Rank (MRR): {mrr}")

Recall: 0.0005020080321285141


In [None]:
mrr = mean_reciprocal_rank(retrieved_docs_list, ground_truths)
print(f"Mean Reciprocal Rank (MRR): {mrr}")

Mean Reciprocal Rank (MRR): 0.08333333333333333


## There is lot of scope for improving the performance like use best llm models like gemini or gpto 4o  for summery and increase summery length or dont define summery length and mention to get better context in crisp form