In [None]:
# Core libraries
!pip install langchain
!pip install langchain-core
!pip install sentence-transformers
!pip install tiktoken
! pip install -U google-generativeai

# Large language models and vector stores
!pip install openai  # Upgrade later if needed
!pip install groq  # If using Groq for acceleration
!pip install pinecone-client
!pip install chromadb

# Langchain extensions and integrations
!pip install langchain-community
!pip install langchain-pinecone
!pip install langchain-groq  # If using Groq

# Additional dependencies
!pip install huggingface_hub

# Upgrades
!pip install -U langchain-community
!pip install --upgrade openai
!pip install --upgrade langchain-pinecone
!pip install -qU langchain-groq  # Quiet upgrade



Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.33-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.100-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.32->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp31

In [None]:
# Read API keys through Colab's `userdata` helper instead of hard-coding them.
# Each `userdata.get(name)` looks up the secret stored under `name`.
from google.colab import userdata
KEY_groq = userdata.get('GROQ_API_KEY')      # used for the Groq client / ChatGroq below
# NOTE(review): KEY_openai and KEY_GOOGLE are not referenced in the visible cells.
KEY_openai = userdata.get('OPENAI_API_KEY')
KEY_GOOGLE  = userdata.get('GOOGLE_API_KEY')

In [None]:
import os
import json
import datetime
import pandas as pd
import warnings

# NLP Libraries
import transformers
import torch

import os
import google.generativeai as genai

# Langchain Core
from langchain.schema import HumanMessage, AIMessage, ChatMessage
from langchain_core.tools import tool
from langchain_core.pydantic_v1 import BaseModel, Field

# Langchain LLMs and Agents
from langchain.llms import OpenAI, HuggingFacePipeline, CTransformers
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, load_tools, initialize_agent

# Langchain Chains and Prompts
from langchain.chains import LLMChain, SequentialChain, RetrievalQA, ConversationChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_core.messages import SystemMessage

# Langchain Memory
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

# Langchain Document Processing
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Langchain Embeddings and Vector Stores
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Groq Integration (if used)
from groq import Groq
from langchain_groq import ChatGroq

# External Libraries
import pinecone

In [None]:
transcript_df = pd.read_csv('/content/drive/MyDrive/DataSets/Earnings_21.csv')

In [None]:
transcript_df.head(11)

Unnamed: 0,file_id,audio_length,sample_rate,company_name,financial_quarter,sector,speaker_switches,unique_speakers,curator_id,transcription
0,4320211,3285.848,24000,Monro Inc,3,Consumer Goods,82,10,1,"Good morning ladies and gentlemen, and welcome..."
1,4330115,2458.904,24000,Culp Inc,3,Industrial Goods,43,8,1,Good day and welcome to Culp's third quarter 2...
2,4341191,5740.64,24000,General Electric,1,Conglomerate,147,14,1,Good morning and welcome to the first quarter ...
3,4344338,2721.169,44100,Danaher Corp,1,Conglomerate,51,7,1,My name is Christelle and I will be your confe...
4,4344866,3275.456,24000,Spire Inc,2,Utilities,82,10,8,"Good morning, and welcome to the Spire Second ..."
5,4346818,3972.022,11025,Ingersoll Rand,1,Industrial Goods,99,14,0,Ladies and gentlemen thank you for standing by...
6,4346923,4709.418,16000,Cementos Argos,1,Industrial Goods,120,20,1,"Hello gentlemen, gent- ladies and gentlemen, a..."
7,4359732,4887.498,44100,Kuehne Nagel International,2,Services,114,13,9,"Ladies and gentlemen, welcome to Kuehne + Nage..."
8,4359971,3759.944,24000,Constellium,2,Industrial Goods,116,10,8,"Ladies and gentlemen, thank you for standing b..."
9,4360366,3906.752,24000,Travelers Companies Inc,2,Financial,104,15,8,"Good morning, ladies and gentlemen. Welcome to..."


In [None]:
len(transcript_df['transcription'][0])

50968

In [None]:
# Name of the Groq-hosted model used through LangChain's ChatGroq wrapper.
modelgroq = "llama-3.1-70b-versatile"

# NOTE: `groq_chat` is rebound in a later cell with temperature=0.25, so this
# instance (temperature=0.3) is only in effect until that cell runs.
groq_chat = ChatGroq(groq_api_key=KEY_groq, model_name=modelgroq,temperature=0.3)

In [None]:
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

### Summarizing The Transcription

In [None]:
# def summarize_text(text):
#     # Use Hugging Face model
#     summary = summarizer(text, max_length=500, min_length=400, do_sample=False)
#     return summary[0]['summary_text']


In [None]:
# # If using OpenAI's API for summarization:
# def summarize_text_openai(text):
#     response = openai.ChatCompletion.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "Summarize the following text to 400-500 words."},
#             {"role": "user", "content": text}
#         ]
#     )
#     return response['choices'][0]['message']['content']


In [None]:
# from groq import Groq

# # Set up Groq API key
# client = Groq(
#     api_key=KEY_groq,  # Replace with your actual API key
#     # api_key=os.environ.get("GROQ_API_KEY"),
# )

# # Function to summarize using Groq
# def summarize_text_groq(text):
#     chat_completion = client.chat.completions.create(
#         messages=[
#             {
#                 "role": "user",
#                 "content": f"Summarize the following text into 400-500 words:\n\n{text}",
#             }
#         ],
#         model = "llama-3.1-70b-versatile",
#     )
#     return chat_completion.choices[0].message.content

In [None]:
import os
import time  # For adding delay
import pandas as pd
from groq import Groq

# Create the Groq SDK client used by summarize_text_groq below.
client = Groq(
    api_key=KEY_groq,  # read from Colab secrets in an earlier cell; never paste a key here
)

# Function to summarize using Groq
def summarize_text_groq(text, model="llama-3.1-70b-versatile"):
    """Summarize ``text`` into roughly 400-500 words with a Groq chat model.

    Args:
        text: The transcript to summarize.
        model: Groq model identifier passed to the chat-completions call.
            Defaults to the model the notebook originally hard-coded.

    Returns:
        The summary text, or ``None`` when ``text`` is missing/blank or when
        the API call fails (the error is printed, not raised).
    """
    # A missing DataFrame cell arrives as NaN (a float); formatting it into the
    # prompt would spend an API call on summarizing the literal text "nan".
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Summarize the following text into 400-500 words:\n\n{text}",
                }
            ],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: callers loop over many rows and treat a
        # None result as "failed", so one bad request must not stop the run.
        print(f"An error occurred: {e}")
        return None

In [None]:
# Pause between requests, in seconds (120 s = 2 minutes), to avoid
# overwhelming the API.
SLEEP_SECONDS = 120

# Initialize an empty list to store summaries
summaries = []

# Loop through each row, process one by one
for index, row in transcript_df.head(11).iterrows():  # Process first 11 rows
    print(f"Processing row {index + 1} with file_id {row['file_id']}...")  # Print statement to show progress

    summary = summarize_text_groq(row['transcription'])

    # summarize_text_groq returns None on failure, so only keep real summaries
    if summary:
        summaries.append({'file_id': row['file_id'], 'summary': summary})
        print(f"Successfully summarized file_id {row['file_id']}.")
    else:
        print(f"Failed to summarize file_id {row['file_id']}.")

    # Announce the pause before it starts, so the log explains why the cell
    # appears idle instead of reporting the sleep after it is already over.
    print("sleep for 2 minutes")
    time.sleep(SLEEP_SECONDS)


Processing row 1 with file_id 4320211...
Successfully summarized file_id 4320211.
sleep for 2 minutes
Processing row 2 with file_id 4330115...
Successfully summarized file_id 4330115.
sleep for 2 minutes
Processing row 3 with file_id 4341191...
Successfully summarized file_id 4341191.
sleep for 2 minutes
Processing row 4 with file_id 4344338...
Successfully summarized file_id 4344338.
sleep for 2 minutes
Processing row 5 with file_id 4344866...
Successfully summarized file_id 4344866.
sleep for 2 minutes
Processing row 6 with file_id 4346818...
Successfully summarized file_id 4346818.
sleep for 2 minutes
Processing row 7 with file_id 4346923...
Successfully summarized file_id 4346923.
sleep for 2 minutes
Processing row 8 with file_id 4359732...
Successfully summarized file_id 4359732.
sleep for 2 minutes
Processing row 9 with file_id 4359971...
Successfully summarized file_id 4359971.
sleep for 2 minutes
Processing row 10 with file_id 4360366...
Successfully summarized file_id 4360366.

In [None]:
summaries

[{'file_id': 4320211,
  'summary': "Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.\n\nThe company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales. The company has also made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model.\n\nMonro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The comp

In [None]:
# Build a two-column frame (file_id, summary) from the collected records
summary_df = pd.DataFrame(data=summaries)

# Persist it to Drive without the positional index column
summary_df.to_csv(
    '/content/drive/MyDrive/DataSets/Earnings_21_summarized_transcriptions.csv',
    index=False,
)


In [None]:
summary_df = pd.read_csv('/content/drive/MyDrive/DataSets/Earnings_21_summarized_transcriptions.csv')

In [None]:
# # Apply summarization function
# transcript_df['summary'] = transcript_df['transcription'].apply(summarize_text_groq)  # or summarize_text_openai

In [None]:
summary_df

Unnamed: 0,file_id,summary
0,4320211,Monro Inc. held its third-quarter earnings con...
1,4330115,"Culp, Inc. hosted a conference call to discuss..."
2,4341191,Given the extensive nature of the text provide...
3,4344338,Danaher Corporation's first quarter 2020 earni...
4,4344866,The Spire Second Quarter Earnings call began w...
5,4346818,The conference call transcript is from Ingerso...
6,4346923,The conference call discussed the first quarte...
7,4359732,Here's a 400-500 word summary of Kuehne + Nage...
8,4359971,Constellium reported its second-quarter earnin...
9,4360366,"The Travelers Companies, Inc. held a conferenc..."


In [None]:
len(summary_df['summary'][0])

3424

In [None]:
summary_df['summary'][8]

"Constellium reported its second-quarter earnings, highlighting the company's ability to navigate through the COVID-19 pandemic. The company's shipment volumes decreased by 25% compared to the same quarter last year, and revenue decreased by 33%. However, the company's cost-control measures and flexible business model helped to mitigate the impact of the pandemic.\n\nConstellium's CEO, Jean-Marc Germain, stated that the company's priority is the health and safety of its employees, and they have implemented various initiatives to protect them. The company also reduced its costs and capital expenditures, with a 40% of its workforce on some type of partial unemployment or temporary layoff scheme during the quarter.\n\nThe company's adjusted EBITDA decreased by 51% compared to the same quarter last year, but its free cash flow was negative €33 million in the second quarter. However, the company expects to generate positive free cash flow in 2020 based on its current view of market conditio

### Chunking And Converting To Embeddings

In [None]:
#Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split a text string into overlapping chunks.

    Args:
        extracted_data: The text to split (a single string, e.g. one summary).
        chunk_size: Maximum chunk length passed to the splitter (default 500).
        chunk_overlap: Overlap between consecutive chunks (default 20).

    Returns:
        A list of chunk strings. Missing or blank input (``None``, NaN, ``""``,
        whitespace only) yields an empty list instead of raising.
    """
    # Missing DataFrame cells are NaN floats; there is nothing to split.
    if not isinstance(extracted_data, str) or not extracted_data.strip():
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(extracted_data)

In [None]:
# # Assuming you already have `summary_df`
# # Initialize an empty list to store chunked data
# chunked_data_1 = []

# # Loop through each row in `summary_df`
# for index, row in summary_df.iterrows():
#     file_id = row['file_id']
#     summary = row['summary']

#     # Create text chunks
#     chunks = text_split(summary)

#     # Create a dictionary for each row and append to `chunked_data`
#     chunked_row = {'file_id': file_id}
#     for i, chunk in enumerate(chunks):
#         chunked_row[f'chunk_{i+1}'] = chunk

#     chunked_data.append(chunked_row)

# Apply text splitting to each summary and store chunks in a single column.
# This adds a 'chunks' column to summary_df in place; each cell is the value
# returned by text_split for that row's summary.
summary_df['chunks'] = summary_df['summary'].apply(text_split)

# Save the DataFrame with the chunks column.
# NOTE(review): CSV stores each list as its string repr; the outputs of the
# exploratory cells further down show the value read back as type `str`, so it
# would need parsing before being used as a list again.
summary_df.to_csv('/content/drive/MyDrive/DataSets/Earnings_21_summary_with_chunks.csv', index=False)

In [None]:
# Worked example on the first summary (row 0). The id and the text must come
# from the same row; the id was previously read from row 1 by mistake.
file_id = summary_df['file_id'][0]
summarystr = summary_df['summary'][0]
chunks_1 = text_split(summarystr)

In [None]:
len(chunks_1)

10

In [None]:
for chunk in chunks_1:
  print(len(chunk))
  print(chunk)

416
Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.
372
The company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales. The company has also made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model.
252
Monro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisitions

In [None]:
# # Convert chunked data to a DataFrame
# chunked_df = pd.DataFrame(chunked_data)

# # Save the chunked DataFrame to a CSV file
# chunked_df.to_csv('/content/drive/MyDrive/DataSets/Earnings_21_chunked_summaries.csv', index=False)


In [None]:
# summary_df_chunks = pd.read_csv('/content/drive/MyDrive/DataSets/Earnings_21_summary_with_chunks.csv')

In [None]:
# summary_df_chunks.head()

Unnamed: 0,file_id,summary,chunks
0,4320211,Monro Inc. held its third-quarter earnings con...,['Monro Inc. held its third-quarter earnings c...
1,4330115,"Culp, Inc. hosted a conference call to discuss...","['Culp, Inc. hosted a conference call to discu..."
2,4341191,Given the extensive nature of the text provide...,"[""Given the extensive nature of the text provi..."
3,4344338,Danaher Corporation's first quarter 2020 earni...,"[""Danaher Corporation's first quarter 2020 ear..."
4,4344866,The Spire Second Quarter Earnings call began w...,['The Spire Second Quarter Earnings call began...


In [None]:
# summary_df_chunks['chunks'][0]

'[\'Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.\', "The company\'s store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales. The company has also made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model.", \'Monro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisition

In [None]:
# len(summary_df_chunks['chunks'][0])

3452

In [None]:
# type(summary_df_chunks['chunks'][0])

str

In [None]:
# len(summary_df_chunks['chunks'][0][0])

1

In [None]:
# print(summary_df_chunks['chunks'][0])

['Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.', "The company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales. The company has also made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model.", 'Monro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisitions to 

### TEXT EMBEDDINGS

In [None]:
from sentence_transformers import SentenceTransformer
# Two throwaway sentences used only to smoke-test the embedding model.
sentences = ["This is an example sentence", "Each sentence is converted"]

# `model_embedding` is the notebook-wide embedding model: later cells reuse it
# for both chunk embeddings and query embeddings.
model_embedding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encoding a list returns one vector per sentence; a later cell prints the
# shape as (2, 384).
embeddings = model_embedding.encode(sentences)
print(embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 6.76569194e-02  6.34959713e-02  4.87131476e-02  7.93049783e-02
   3.74480933e-02  2.65277131e-03  3.93749513e-02 -7.09846430e-03
   5.93614057e-02  3.15369777e-02  6.00981042e-02 -5.29052615e-02
   4.06067781e-02 -2.59308387e-02  2.98428647e-02  1.12688600e-03
   7.35148042e-02 -5.03817983e-02 -1.22386612e-01  2.37028562e-02
   2.97265332e-02  4.24768738e-02  2.56337579e-02  1.99516211e-03
  -5.69190793e-02 -2.71598026e-02 -3.29035036e-02  6.60248846e-02
   1.19007230e-01 -4.58791181e-02 -7.26214051e-02 -3.25840116e-02
   5.23413308e-02  4.50553037e-02  8.25298298e-03  3.67024504e-02
  -1.39415674e-02  6.53918684e-02 -2.64271945e-02  2.06370896e-04
  -1.36643145e-02 -3.62810642e-02 -1.95043720e-02 -2.89738011e-02
   3.94270122e-02 -8.84091184e-02  2.62427772e-03  1.36713730e-02
   4.83062901e-02 -3.11566498e-02 -1.17329173e-01 -5.11690713e-02
  -8.85288343e-02 -2.18963325e-02  1.42986095e-02  4.44167666e-02
  -1.34815648e-02  7.43392259e-02  2.66382862e-02 -1.98763069e-02
   1.79191

In [None]:
# Sanity check: embed each chunk of the example summary one at a time and
# print its length, its text and the resulting vector.
for chunk in chunks_1:
  embedding1 = model_embedding.encode(chunk)  # overwritten every iteration; only printed
  print(len(chunk))
  print(chunk)
  print(embedding1)

416
Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.
[ 3.09962370e-02  7.43177114e-03 -4.17550618e-04  1.23695284e-02
  4.89719100e-02 -7.71085098e-02 -5.29017709e-02  4.98609655e-02
  2.56649740e-02  4.41023596e-02  2.48296615e-02  3.92573476e-02
 -3.87312435e-02 -2.98920628e-02  2.36047842e-02 -3.87120247e-02
 -2.26181429e-02 -1.08925151e-02  2.53143292e-02  2.29171682e-02
 -5.36330417e-02 -4.80098240e-02  6.15224382e-03  2.53079422e-02
  8.93436605e-04  8.35770890e-02 -8.38970765e-02  2.03637313e-02
 -2.45606117e-02 -4.98260036e-02 -2.23936476e-02  1.11420505e-01
  9.01936740e-02 -5.27747609e-02 -6.34396523e-02  4.9785641

In [None]:
print(embeddings.shape)

(2, 384)


In [None]:
# Split every summary into chunks, embed each chunk, and collect one record
# per chunk (file_id, chunk text, embedding vector).
chunk_data = []

for _, summary_row in summary_df.iterrows():
    chunk_data.extend(
        {
            'file_id': summary_row['file_id'],
            'chunk': piece,
            'embedding': model_embedding.encode(piece),  # one vector per chunk
        }
        for piece in text_split(summary_row['summary'])
    )

# One row per chunk; several rows share the same file_id
chunks_df = pd.DataFrame(chunk_data)


In [None]:
chunks_df

Unnamed: 0,file_id,chunk,embedding
0,4320211,Monro Inc. held its third-quarter earnings con...,"[0.030996237, 0.007431771, -0.00041755062, 0.0..."
1,4320211,The company's store rebranding and re-imaging ...,"[-0.031461723, -0.025675885, 0.028074343, -0.0..."
2,4320211,Monro Inc. has also made significant progress ...,"[0.050692648, -0.07287536, -0.02692151, -0.042..."
3,4320211,The company's guidance for fiscal 2020 has bee...,"[-0.009417127, 0.008365667, 0.057727903, 0.065..."
4,4320211,"The company's CEO, Brett Ponton, stated that t...","[-0.062053658, -0.005355529, 0.004473756, 0.01..."
...,...,...,...
180,4360674,4. **M&A Opportunities**: NextEra Energy's CEO...,"[-0.054499786, 0.019563619, 0.013526839, 0.029..."
181,4360674,5. **Rate Cases**: NextEra Energy plans to fil...,"[-0.059139673, 0.059974324, -0.020970162, 0.00..."
182,4360674,6. **Cost Management**: NextEra Energy has bee...,"[-0.07092738, 0.088025756, -0.036153235, 0.041..."
183,4360674,7. **ESG Report**: NextEra Energy will publish...,"[-0.067683294, 0.05618643, -0.030089203, 0.050..."


In [None]:

# Save the new DataFrame to a CSV file
# Since embeddings are arrays, you may prefer to save the DataFrame as a pickle file instead of CSV
chunks_df.to_pickle('/content/drive/MyDrive/DataSets/Earnings_21_chunks_with_embeddings.pkl')


In [None]:
# NOTE: this import rebinds the name `Pinecone`, which the imports cell bound
# to the class from `langchain.vectorstores`; from here on `Pinecone` is the
# Pinecone SDK client class.
from pinecone import Pinecone

# Client authenticated with the key read from Colab secrets earlier.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Handle to the index named "earningscall"; later cells upsert to and query it.
# NOTE(review): presumably the index already exists with dimension 384 to match
# the embedding model; confirm in the Pinecone console.
index = pc.Index("earningscall")

In [None]:
# !pip install --upgrade langchain-pinecone



In [None]:
import pinecone
import uuid
from sentence_transformers import SentenceTransformer
import json

# # Initialize Pinecone
# # Initialize SentenceTransformer model

# Flatten metadata function
def flatten_metadata(metadata):
    """Flatten metadata to simple key-value pairs.

    Values that are already flat (``str``, ``int``, ``float``, ``bool``, or a
    list made up only of strings) are kept unchanged. Every other value is
    converted to a JSON string.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys; the input mapping is not modified.
    """
    def _is_flat(value):
        # Scalars pass through, and so do lists made up only of strings.
        if isinstance(value, (str, int, float, bool)):
            return True
        return isinstance(value, list) and all(isinstance(i, str) for i in value)

    return {
        # default=str keeps values JSON cannot encode (sets, bytes, dates, ...)
        # from raising TypeError: they are stored as their string form instead.
        key: value if _is_flat(value) else json.dumps(value, default=str)
        for key, value in metadata.items()
    }


In [None]:
def embeddings_of_chunks(chunks):
    """Encode ``chunks`` with the notebook-level ``model_embedding`` and return the result."""
    return model_embedding.encode(chunks)

In [None]:
# # Generate embeddings
# embeddings = model_embedding.encode(summary_df_chunks['chunks'][0])

# # Prepare the batch of vectors to upsert
# vectors = [(str(uuid.uuid4()), embeddings.tolist(),{
#     "text": summary_df_chunks['chunks'][0],
# })]

# # Upsert to Pinecone
# index.upsert(vectors=vectors)

In [None]:
chunks_df['embedding'][0]

array([ 3.09962370e-02,  7.43177114e-03, -4.17550618e-04,  1.23695284e-02,
        4.89719100e-02, -7.71085098e-02, -5.29017709e-02,  4.98609655e-02,
        2.56649740e-02,  4.41023596e-02,  2.48296615e-02,  3.92573476e-02,
       -3.87312435e-02, -2.98920628e-02,  2.36047842e-02, -3.87120247e-02,
       -2.26181429e-02, -1.08925151e-02,  2.53143292e-02,  2.29171682e-02,
       -5.36330417e-02, -4.80098240e-02,  6.15224382e-03,  2.53079422e-02,
        8.93436605e-04,  8.35770890e-02, -8.38970765e-02,  2.03637313e-02,
       -2.45606117e-02, -4.98260036e-02, -2.23936476e-02,  1.11420505e-01,
        9.01936740e-02, -5.27747609e-02, -6.34396523e-02,  4.97856410e-03,
        1.64581463e-02, -3.29438597e-02, -1.15038166e-02, -7.15292096e-02,
       -4.81927842e-02, -5.81656359e-02, -4.67137061e-02, -2.08035763e-02,
        7.23076090e-02, -2.31149606e-02,  3.29730511e-02,  9.81948823e-02,
        6.81476220e-02, -1.70236472e-02,  8.76737852e-03, -6.29271567e-02,
        1.03547364e-01, -

In [None]:
len(chunks_df['embedding'][0])

384

In [None]:

# # Function to process each row in DataFrame and upload to Pinecone
# def process_and_upload_to_pinecone(df, index):
#     # batch_size = 100  # Define your batch size

#     # Iterate through each row in the DataFrame
#     for _, row in df.iterrows():
#         file_id = row['file_id']
#         text_chunks = row['chunks']  # Chunks are stored as lists
#         # Convert chunks to embeddings and upload to Pinecone

#         # batch = text_chunks[i:i+batch_size]

#         # Generate embeddings
#         embeddings = model_embedding.encode(text_chunks)

#         # Prepare the batch of vectors to upsert
#         vectors = [(str(uuid.uuid4()), embeddings.tolist(), {
#             "text": text_chunks
#         })]

#         # Upsert to Pinecone
#         index.upsert(vectors=vectors)
#         print(f"Uploaded embeddings of file {file_id} to Pinecone.")


#     print("All chunks uploaded to Pinecone.")



# Example usage (assuming Pinecone index is initialized and chunks_df is loaded)
# process_and_upload_to_pinecone(chunks_df, pinecone_index)




# # Convert chunks to embeddings and upload to Pinecone
# batch_size = 100
# for i in range(0, len(text_chunks), batch_size):
#     batch = text_chunks[i:i+batch_size]

#     # Generate embeddings
#     embeddings = model.encode([chunk.page_content for chunk in batch])

#     # Prepare the batch of vectors to upsert
#     vectors = [(str(uuid.uuid4()), embedding.tolist(), {
#         "text": chunk.page_content,
#         **flatten_metadata(chunk.metadata)
#     }) for embedding, chunk in zip(embeddings, batch)]

#     # Upsert to Pinecone
#     index.upsert(vectors=vectors)


In [None]:
import uuid
import pinecone  # Make sure you have initialized the Pinecone environment and index

# Function to process each row in the DataFrame and upload to Pinecone
def process_and_upload_to_pinecone(df, index, batch_size=100):
    """Upsert every chunk embedding in ``df`` into a Pinecone index.

    Rows are sent in batches rather than one request per row, which cuts the
    number of network round trips. Each vector gets a fresh random UUID as its
    id, so running this twice on the same data stores the chunks twice.

    Args:
        df: Frame whose rows provide 'file_id', 'chunk' (text) and 'embedding'
            (a list of floats or an array exposing ``tolist()``).
        index: Object with an ``upsert(vectors=...)`` method (a Pinecone index).
        batch_size: Maximum number of vectors per upsert call (default 100).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def _flush(batch_vectors, batch_file_ids):
        # Send one batch, then report each chunk exactly as before.
        index.upsert(vectors=batch_vectors)
        for uploaded_file_id in batch_file_ids:
            print(f"Uploaded chunk of file {uploaded_file_id} to Pinecone.")

    vectors, file_ids = [], []
    for _, row in df.iterrows():
        embedding = row['embedding']
        # Send plain Python lists; arrays are converted via tolist()
        if hasattr(embedding, 'tolist'):
            embedding = embedding.tolist()

        vectors.append((str(uuid.uuid4()), embedding, {"text": row['chunk']}))
        file_ids.append(row['file_id'])

        if len(vectors) >= batch_size:
            _flush(vectors, file_ids)
            vectors, file_ids = [], []

    # Upload the final, partially filled batch (if any)
    if vectors:
        _flush(vectors, file_ids)

    print("All chunks uploaded to Pinecone.")



In [None]:
# Example usage with your DataFrame
process_and_upload_to_pinecone(chunks_df, index)

Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4320211 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4330115 to Pinecone.
Uploaded chunk of file 4341191 to Pinecone.
Uploaded chunk of file 4341191 t

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer


def query_pinecone(query_text, top_k=5):
    """Return the ``top_k`` Pinecone matches, with metadata, for ``query_text``."""
    # Embed the query with the same model that embedded the stored chunks
    vector = model_embedding.encode(query_text).tolist()
    return index.query(vector=vector, top_k=top_k, include_metadata=True)

# Example usage
query = "for Constellium by how much % shipment volumes decreased ?"
results = query_pinecone(query)

# Print results
for match in results['matches']:
    metadata = match['metadata']
    print(f"Score: {match['score']:.2f}")
    # Print the chunk text itself; previously the whole metadata dict was
    # printed under the "Text:" label.
    print(f"Text: {metadata.get('text', '')}")
    print("Other Metadata:", {k:v for k,v in metadata.items() if k != 'text'})
    print("---")

Score: 0.67
Text: {'text': "Constellium's packaging business was impacted by the pandemic, with shipment volumes down 11% compared to the same quarter last year. However, the company is seeing strong demand in North America and stable demand in Europe. The company's aerospace business was also impacted, with shipment volumes down 40% compared to the same quarter last year."}
Other Metadata: {}
---
Score: 0.58
Text: {'text': 'no volume large year were time were logistics back sea profit air volumes only will Air up head was two period half single quarter re half them next can lower for can new also over know low even said for with same full   last back head negative see these has by first air Nagel we was said not large out per out is cargo on are part before it high last much on have single last growth two growth single much sea rate growth, now drop overall some be last were if three same when more   through more'}
Other Metadata: {}
---
Score: 0.57
Text: {'text': "Constellium reporte

In [None]:
prompt_template="""
Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

Context: {context}
Question: {question}

Provide a clear and helpful answer below:
Answer:
"""

In [None]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [None]:
# Model used for question answering (same model name as the earlier ChatGroq cell).
model_Llama = "llama-3.1-70b-versatile"

# Rebinds `groq_chat`: an earlier cell created it with temperature=0.3, and from
# here on it uses temperature=0.25.
groq_chat = ChatGroq(groq_api_key=KEY_groq, model_name=model_Llama,temperature=0.25)
# `llm` is the name passed to the RetrievalQA chain in the next cell.
llm = groq_chat

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import BaseRetriever
from langchain.docstore.document import Document
from typing import List

# NOTE(review): this re-declares query_pinecone with the same behavior as the
# definition in the earlier query cell; whichever cell ran last wins. Consider
# keeping a single definition.
def query_pinecone(query_text, top_k=5):
    """Return the ``top_k`` matches (with metadata) from the Pinecone index for ``query_text``."""
    # Generate the embedding for the query text
    query_embedding = model_embedding.encode(query_text).tolist()

    # Query the index
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    return results

class CustomPineconeRetriever(BaseRetriever):
    """Retriever that converts ``query_pinecone`` matches into ``Document`` objects.

    Only the ``_get_relevant_documents`` hook is implemented. The public entry
    points (``invoke`` / ``get_relevant_documents`` and their async variants)
    are inherited from ``BaseRetriever``, so the class no longer overrides the
    deprecated public methods, and the async path no longer calls blocking
    code directly from a coroutine.
    """

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return one ``Document`` per match returned for ``query``.

        Args:
            query: Text to search for.
            run_manager: Callback manager supplied by ``BaseRetriever``; unused here.

        Returns:
            Documents whose ``page_content`` is the match's ``'text'`` metadata
            entry (empty string when absent) and whose ``metadata`` is the
            remaining metadata.
        """
        results = query_pinecone(query)
        docs = []
        for match in results['matches']:
            # Copy first so popping 'text' does not mutate the match object itself.
            metadata = dict(match['metadata'] or {})
            text = metadata.pop('text', '')
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

# Language model that will answer the questions.
llm = groq_chat

# Prompt wrapped around the retrieved context and the user's question.
prompt_template = """
Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

Context: {context}
Question: {question}

Provide a clear and helpful answer below:
Answer:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# Retriever instance that the chain uses to fetch documents.
custom_retriever = CustomPineconeRetriever()

# RetrievalQA chain wired to the custom retriever; the source documents are
# returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=custom_retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

# Interactive QA loop: read a question, run the chain, print the answer and a
# short preview of each source document. Typing 'exit' ends the loop.
while True:
    user_input = input("Ask a question (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was typed; prompt again instead of sending an empty query.
        continue

    try:
        # Calling the chain object directly is deprecated; invoke() is the
        # supported entry point and returns the same result dict.
        result = qa.invoke({"query": user_input})
    except Exception as exc:
        # A failed request (e.g. a transient provider error such as HTTP 503)
        # should not end the session: report it and ask for the next question.
        print(f"\nRequest failed ({type(exc).__name__}): {exc}")
        print("\n" + "-"*50 + "\n")
        continue

    print("\nResponse:", result["result"])

    # Optionally, print source documents
    print("\nSource Documents:")
    for doc in result["source_documents"]:
        print(f"- {doc.page_content[:100]}...")  # Print first 100 chars of each source

    print("\n" + "-"*50 + "\n")

Ask a question (or type 'exit' to quit): what did Monro Inc. reported  due to mild winter weather conditions

Response: Monro Inc. reported a 0.9% decline in comparable store sales due to mild winter weather conditions.

Source Documents:
- Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0...
- * Monro Inc. reported a 0.9% decline in comparable store sales due to mild winter weather conditions...
- Monro Inc. has also made significant progress in its acquisitions, with the company announcing the a...
- Steve Rasche presented the company's financial results, including a decline in net economic earnings...
- The company's guidance for fiscal 2020 has been updated to reflect the impact of the mild winter wea...

--------------------------------------------------

Ask a question (or type 'exit' to quit): what did monro incs ceo and cfo stated

Response: Monro Inc.'s CEO, Brett Ponton, stated that the company is committed to driving the nec

In [None]:
# Display the 'summary' column entry stored under key 0 of summary_df
# (presumably to compare against the QA responses — confirm).
summary_df['summary'][0]

"Monro Inc. held its third-quarter earnings conference call for fiscal 2020. The company reported a 0.9% decline in comparable store sales, primarily due to mild winter weather conditions in the Northeast and Midwest. Despite this, the company remains confident in its Monro Forward strategy, which includes store rebranding and re-imaging, technological investments, and optimization of its tire category management.\n\nThe company's store rebranding and re-imaging initiative has shown promising results, with rebranded stores experiencing an 18% increase in sales. The company has also made significant progress in its technological investments, including the implementation of a new digital phone and texting system and the development of a cloud-based store staffing and scheduling model.\n\nMonro Inc. has also made significant progress in its acquisitions, with the company announcing the acquisition of three companies with a total of 23 locations. The company expects these acquisitions to a

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import BaseRetriever
from langchain.docstore.document import Document
from typing import List

def query_pinecone(query_text, top_k=5):
    """Embed ``query_text`` and look up its nearest matches in the index.

    Relies on two module-level objects defined elsewhere in the notebook:
    ``model_embedding`` (used to encode the text) and ``index`` (queried with
    the resulting vector).

    Args:
        query_text: Text to embed and search for.
        top_k: Number of matches requested from the index (default 5).

    Returns:
        Whatever ``index.query`` returns, with metadata included.
    """
    vector = model_embedding.encode(query_text).tolist()
    return index.query(vector=vector, top_k=top_k, include_metadata=True)

class CustomPineconeRetriever(BaseRetriever):
    """Retriever that converts ``query_pinecone`` matches into ``Document`` objects.

    Only the ``_get_relevant_documents`` hook is implemented. The public entry
    points (``invoke`` / ``get_relevant_documents`` and their async variants)
    are inherited from ``BaseRetriever``, so the class no longer overrides the
    deprecated public methods, and the async path no longer calls blocking
    code directly from a coroutine.
    """

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return one ``Document`` per match returned for ``query``.

        Args:
            query: Text to search for.
            run_manager: Callback manager supplied by ``BaseRetriever``; unused here.

        Returns:
            Documents whose ``page_content`` is the match's ``'text'`` metadata
            entry (empty string when absent) and whose ``metadata`` is the
            remaining metadata.
        """
        results = query_pinecone(query)
        docs = []
        for match in results['matches']:
            # Copy first so popping 'text' does not mutate the match object itself.
            metadata = dict(match['metadata'] or {})
            text = metadata.pop('text', '')
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

# Language model that will answer the questions.
llm = groq_chat

# Prompt wrapped around the retrieved context and the user's question.
prompt_template = """
Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

Context: {context}
Question: {question}

Provide a clear and helpful answer below:
Answer:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# Retriever instance that the chain uses to fetch documents.
custom_retriever = CustomPineconeRetriever()

# RetrievalQA chain wired to the custom retriever; the source documents are
# returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=custom_retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

# Interactive QA loop: read a question, run the chain, print the answer and a
# short preview of each source document. Typing 'exit' ends the loop.
while True:
    user_input = input("Ask a question (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was typed; prompt again instead of sending an empty query.
        continue

    try:
        # Calling the chain object directly is deprecated; invoke() is the
        # supported entry point and returns the same result dict.
        result = qa.invoke({"query": user_input})
    except Exception as exc:
        # A failed request (e.g. a transient provider error such as HTTP 503)
        # should not end the session: report it and ask for the next question.
        print(f"\nRequest failed ({type(exc).__name__}): {exc}")
        print("\n" + "-"*50 + "\n")
        continue

    print("\nResponse:", result["result"])

    # Optionally, print source documents
    print("\nSource Documents:")
    for doc in result["source_documents"]:
        print(f"- {doc.page_content[:100]}...")  # Print first 100 chars of each source

    print("\n" + "-"*50 + "\n")

Ask a question (or type 'exit' to quit): which is second company from start

Response: I don't know.

Source Documents:
- in after same less has will second there have   first high can less much there significant business ...
- only business more second out quarter CEO year as just next profit   however even low business less ...
- freight units it despite are before know growth years next did next came CEO (period market were Sea...
- The Spire Second Quarter Earnings call began with a welcome from Scott Dudley, Managing Director of ...
- Those well volumes new it , . - per two they financial still when months Nagel no unit   Sea due can...

--------------------------------------------------

Ask a question (or type 'exit' to quit): who is ceo of culp inc

Response: Iv Culp is the Chief Executive Officer of Culp, Inc.

Source Documents:
- Culp, Inc. hosted a conference call to discuss its third quarter 2020 earnings results. The call fea...
- Culp's global platform is seen as a distin

InternalServerError: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}

In [None]:
# Display the 'summary' column entry stored under key 1 of summary_df
# (presumably to compare against the QA responses — confirm).
summary_df['summary'][1]

"Culp, Inc. hosted a conference call to discuss its third quarter 2020 earnings results. The call featured Iv Culp, Chief Executive Officer; Ken Bowling, Chief Financial Officer; and Boyd Chumbley, President of the upholstery fabrics business.\n\nThe company reported a net loss of $5.1 million for the third quarter, compared to a pre-tax income of $4.3 million for the same period last year. The results were affected by a reversal of a $6.1 million recorded contingent earn-out liability and non-cash impairment charges of $13.6 million related to the home accessory division.\n\nCulp's global platform is seen as a distinct competitive advantage, allowing the company to quickly respond to changing market dynamics. The company is also benefiting from significant synergies across its business segments, fostering collaboration and product development.\n\nDespite the challenges in the mattress fabrics business, the company remains focused on creative designs, innovative products, and exception

In [None]:
import pinecone
from sentence_transformers import SentenceTransformer
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import BaseRetriever
from langchain.docstore.document import Document
from typing import List

def query_pinecone(query_text, top_k=5):
    """Embed ``query_text`` and look up its nearest matches in the index.

    Relies on two module-level objects defined elsewhere in the notebook:
    ``model_embedding`` (used to encode the text) and ``index`` (queried with
    the resulting vector).

    Args:
        query_text: Text to embed and search for.
        top_k: Number of matches requested from the index (default 5).

    Returns:
        Whatever ``index.query`` returns, with metadata included.
    """
    vector = model_embedding.encode(query_text).tolist()
    return index.query(vector=vector, top_k=top_k, include_metadata=True)

class CustomPineconeRetriever(BaseRetriever):
    """Retriever that converts ``query_pinecone`` matches into ``Document`` objects.

    Only the ``_get_relevant_documents`` hook is implemented. The public entry
    points (``invoke`` / ``get_relevant_documents`` and their async variants)
    are inherited from ``BaseRetriever``, so the class no longer overrides the
    deprecated public methods, and the async path no longer calls blocking
    code directly from a coroutine.
    """

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return one ``Document`` per match returned for ``query``.

        Args:
            query: Text to search for.
            run_manager: Callback manager supplied by ``BaseRetriever``; unused here.

        Returns:
            Documents whose ``page_content`` is the match's ``'text'`` metadata
            entry (empty string when absent) and whose ``metadata`` is the
            remaining metadata.
        """
        results = query_pinecone(query)
        docs = []
        for match in results['matches']:
            # Copy first so popping 'text' does not mutate the match object itself.
            metadata = dict(match['metadata'] or {})
            text = metadata.pop('text', '')
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

# Language model that will answer the questions.
llm = groq_chat

# Prompt wrapped around the retrieved context and the user's question.
prompt_template = """
Based on the provided information, please answer the user's question accurately. If the information is insufficient or the answer is unknown, simply respond with "I don't know."

Context: {context}
Question: {question}

Provide a clear and helpful answer below:
Answer:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# Retriever instance that the chain uses to fetch documents.
custom_retriever = CustomPineconeRetriever()

# RetrievalQA chain wired to the custom retriever; the source documents are
# returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=custom_retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

# Interactive QA loop: read a question, run the chain, print the answer and a
# short preview of each source document. Typing 'exit' ends the loop.
while True:
    user_input = input("Ask a question (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was typed; prompt again instead of sending an empty query.
        continue

    try:
        # Calling the chain object directly is deprecated; invoke() is the
        # supported entry point and returns the same result dict.
        result = qa.invoke({"query": user_input})
    except Exception as exc:
        # A failed request (e.g. a transient provider error such as HTTP 503)
        # should not end the session: report it and ask for the next question.
        print(f"\nRequest failed ({type(exc).__name__}): {exc}")
        print("\n" + "-"*50 + "\n")
        continue

    print("\nResponse:", result["result"])

    # Optionally, print source documents
    print("\nSource Documents:")
    for doc in result["source_documents"]:
        print(f"- {doc.page_content[:100]}...")  # Print first 100 chars of each source

    print("\n" + "-"*50 + "\n")

Ask a question (or type 'exit' to quit): what is driving force of danaher corp

Response: The driving force of Danaher Corporation is the Danaher Business System (DBS).

Source Documents:
- Danaher Corporation's first quarter 2020 earnings conference call took place, led by conference faci...
- Tom Joyce started by acknowledging the challenges posed by the COVID-19 pandemic and expressing grat...
- In his final remarks, Joyce reflected on his decision to retire as CEO, effective September 1, 2020,...
- The company's CEO, Brett Ponton, stated that the company is committed to driving the necessary chang...
- During the Q&A session, analysts asked questions about the company's decremental margins, cost savin...

--------------------------------------------------

Ask a question (or type 'exit' to quit): give review of dahher corps first quarter results

Response: I don't know.

Source Documents:
- best than said high even units big , some out had so half same years during are which last s

In [None]:
# len(transcript_df['transcription'][2])

In [None]:
# Display the 'summary' column entry stored under key 3 of summary_df
# (presumably to compare against the QA responses — confirm).
summary_df['summary'][3]

"Danaher Corporation's first quarter 2020 earnings conference call took place, led by conference facilitator Christelle and featuring Tom Joyce, President and CEO, and Matt McGrew, Executive Vice President and CFO. The call began with a welcome and introduction, followed by a review of the company's earnings release, slide presentation, and other relevant materials available on the investor section of the Danaher website.\n\nTom Joyce started by acknowledging the challenges posed by the COVID-19 pandemic and expressing gratitude to Danaher's associates, suppliers, and business partners for their efforts in responding to the crisis. He highlighted the company's position of strength, with a resilient portfolio of businesses, a talented team, and the Danaher Business System (DBS) as its driving force.\n\nJoyce then discussed the company's innovative solutions that are part of the direct response to COVID-19, including IDT's primer and probe kits, Cepheid's rapid molecular test, and Beckma