In [3]:
import os
import json
import datetime
import pandas as pd
import warnings

# NLP Libraries
import transformers
import torch

import os
import google.generativeai as genai

# Langchain Core
from langchain.schema import HumanMessage, AIMessage, ChatMessage
from langchain_core.tools import tool
from langchain_core.pydantic_v1 import BaseModel, Field

# Langchain LLMs and Agents
from langchain.llms import OpenAI, HuggingFacePipeline, CTransformers
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType, load_tools, initialize_agent

# Langchain Chains and Prompts
from langchain.chains import LLMChain, SequentialChain, RetrievalQA, ConversationChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_core.messages import SystemMessage

# Langchain Memory
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

# Langchain Document Processing
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Langchain Embeddings and Vector Stores
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Groq Integration (if used)
from groq import Groq
from langchain_groq import ChatGroq

# PDF Libraries
import pypdf
from PyPDF2 import PdfReader

# External Libraries
import pinecone
from dotenv import load_dotenv

In [5]:
load_dotenv()

True

In [7]:
KEY_groq = os.getenv('GROQ_API_KEY')
KEY_openai = os.getenv('OPENAI_API_KEY')
KEY_GOOGLE  = os.getenv('GOOGLE_API_KEY')

In [8]:
from datasets import load_dataset

ds = load_dataset("distil-whisper/earnings21")

Downloading readme: 100%|██████████| 869/869 [00:00<00:00, 2.85kB/s]
Downloading data: 100%|██████████| 414M/414M [00:32<00:00, 12.6MB/s] 
Downloading data: 100%|██████████| 358M/358M [00:19<00:00, 18.0MB/s] 
Generating test split: 100%|██████████| 44/44 [00:04<00:00, 10.81 examples/s]


In [9]:
ds

DatasetDict({
    test: Dataset({
        features: ['audio', 'file_id', 'audio_length', 'sample_rate', 'company_name', 'financial_quarter', 'sector', 'speaker_switches', 'unique_speakers', 'curator_id', 'transcription'],
        num_rows: 44
    })
})

In [11]:
test_df = pd.read_csv('DATA\Earnings_21.csv')

In [12]:
test_df

Unnamed: 0,file_id,audio_length,sample_rate,company_name,financial_quarter,sector,speaker_switches,unique_speakers,curator_id,transcription
0,4320211,3285.848,24000,Monro Inc,3,Consumer Goods,82,10,1,"Good morning ladies and gentlemen, and welcome..."
1,4330115,2458.904,24000,Culp Inc,3,Industrial Goods,43,8,1,Good day and welcome to Culp's third quarter 2...
2,4341191,5740.64,24000,General Electric,1,Conglomerate,147,14,1,Good morning and welcome to the first quarter ...
3,4344338,2721.169,44100,Danaher Corp,1,Conglomerate,51,7,1,My name is Christelle and I will be your confe...
4,4344866,3275.456,24000,Spire Inc,2,Utilities,82,10,8,"Good morning, and welcome to the Spire Second ..."
5,4346818,3972.022,11025,Ingersoll Rand,1,Industrial Goods,99,14,0,Ladies and gentlemen thank you for standing by...
6,4346923,4709.418,16000,Cementos Argos,1,Industrial Goods,120,20,1,"Hello gentlemen, gent- ladies and gentlemen, a..."
7,4359732,4887.498,44100,Kuehne Nagel International,2,Services,114,13,9,"Ladies and gentlemen, welcome to Kuehne + Nage..."
8,4359971,3759.944,24000,Constellium,2,Industrial Goods,116,10,8,"Ladies and gentlemen, thank you for standing b..."
9,4360366,3906.752,24000,Travelers Companies Inc,2,Financial,104,15,8,"Good morning, ladies and gentlemen. Welcome to..."


#### Saving audiofiles to .mp3 format

In [None]:
import soundfile as sf
import os

for i in range(10):
  audio_data = ds['test'][i]['audio']
  # Extract the audio data
  audio_samples = audio_data['array']
  sample_rate = audio_data['sampling_rate']

  # Specify the output file path in your working directory
  output_filename = os.path.join('/content/', f'output_audio_{i}.mp3')

  # Save the audio file
  sf.write(output_filename, audio_samples, sample_rate)

  print(f"Audio saved as {output_filename}")


In [14]:
import soundfile as sf
import os

audio_data = ds['test'][41]['audio']
# Extract the audio data
audio_samples = audio_data['array']
sample_rate = audio_data['sampling_rate']

# Specify the output file path in your working directory
output_filename = 'output_audio_{41}.mp3'

# Save the audio file
sf.write(output_filename, audio_samples, sample_rate)

print(f"Audio saved as {output_filename}")


Audio saved as output_audio_{41}.mp3


In [15]:
import soundfile as sf
import os

audio_data = ds['test'][10]['audio']
# Extract the audio data
audio_samples = audio_data['array']
sample_rate = audio_data['sampling_rate']

# Specify the output file path in your working directory
output_filename = 'output_audio_10.mp3'

# Save the audio file
sf.write(output_filename, audio_samples, sample_rate)

print(f"Audio saved as {output_filename}")


Audio saved as output_audio_10.mp3


In [21]:
import soundfile as sf
import os

audio_data = ds['test'][11]['audio']
# Extract the audio data
audio_samples = audio_data['array']
sample_rate = audio_data['sampling_rate']

# Specify the output file path in your working directory
output_filename = 'output_audio_11.mp3'

# Save the audio file
sf.write(output_filename, audio_samples, sample_rate)

print(f"Audio saved as {output_filename}")


Audio saved as output_audio_11.mp3


In [22]:
import soundfile as sf
import os

audio_data = ds['test'][12]['audio']
# Extract the audio data
audio_samples = audio_data['array']
sample_rate = audio_data['sampling_rate']

# Specify the output file path in your working directory
output_filename = 'output_audio_12.mp3'

# Save the audio file
sf.write(output_filename, audio_samples, sample_rate)

print(f"Audio saved as {output_filename}")


Audio saved as output_audio_12.mp3


In [23]:
import soundfile as sf
import os

audio_data = ds['test'][13]['audio']
# Extract the audio data
audio_samples = audio_data['array']
sample_rate = audio_data['sampling_rate']

# Specify the output file path in your working directory
output_filename = 'output_audio_13.mp3'

# Save the audio file
sf.write(output_filename, audio_samples, sample_rate)

print(f"Audio saved as {output_filename}")


Audio saved as output_audio_13.mp3


In [16]:
import os
from groq import Groq

# Set up the Groq API client
KEY_groq = os.getenv('GROQ_API_KEY')  # Ensure your API key is set in the environment
client = Groq(api_key=KEY_groq)


In [17]:
def transcribe_audio(filename):
    with open(filename, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(filename, file.read()),
            model="whisper-large-v3",
            temperature=0.31,
            language="en",
            response_format="verbose_json",
        )
    return transcription  

# Assuming the transcription result contains a 'text' key

In [None]:
# Full directory path to the audio files
base_path = r"C:\Users\somva\Documents\Work\Earnings_21_RAG\RAG_With_Audio_To_Text\DATA"

# List of audio files with the full path
audio_files = [os.path.join(base_path, f"output_audio_{i}.mp3") for i in range(11)]


# Dictionary to store transcriptions
transcripts_dict = {}

for filename in audio_files:
    transcript = transcribe_audio(filename)
    transcripts_dict[filename] = transcript


In [20]:
transcripts_dict

{}

In [None]:
for filename, transcript in transcripts_dict.items():
    print(f"Transcription for {filename}:")
    print(transcript)
    print("\n" + "="*50 + "\n")
