In [1]:
import os
from dotenv import load_dotenv
# Run python-dotenv's loader before reading any configuration from the environment.
load_dotenv()

# OpenAI credential taken from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Video whose audio track is downloaded and transcribed further down.
# The commented-out URL is an alternative that was used previously.
#YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=iKI_SiWy1DM"
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=Cj9DKRWp-ek"

In [2]:
from langchain_openai.chat_models import ChatOpenAI
# Chat model handle reused by every chain built in this notebook.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [3]:
# Smoke test: call the bare model directly; the cell output is the raw message object.
model.invoke(
    "who won the icc cricket world cup 2011?"
)

AIMessage(content='India won the ICC Cricket World Cup in 2011.', response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 19, 'total_tokens': 31}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_b28b39ffa8', 'finish_reason': 'stop', 'logprobs': None}, id='run-1c78504e-199b-40e7-ae17-965f2394f306-0')

In [4]:
from langchain_core.output_parsers import StrOutputParser
# Pipe the model into a string parser so the chain's result is plain text
# rather than the message object returned by the bare model.
parser = StrOutputParser()
chain = model | parser

chain.invoke(
    "who won the icc cricket worls cup 2011?"
)

'India won the ICC Cricket World Cup in 2011.'

In [5]:
from langchain_core.prompts import ChatPromptTemplate  
# Prompt text with two placeholders, {context} and {question}.
# It is spelled out line by line so the leading " \n" (a space followed by a
# newline) that the template starts with is visible instead of hiding as
# trailing whitespace.
template = (
    " \n"
    "Answer the question based on the context below. If you can't\n"
    'answer the question, reply "I don\'t know".\n'
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Render the prompt with sample values to inspect the exact text sent to the model.
prompt.format(
    context="Sendentary lifestyle is one of the major causes of increasing heart issues.",
    question="What can be one of the possible causes of heart disease today?",
)

'Human:  \nAnswer the question based on the context below. If you can\'t\nanswer the question, reply "I don\'t know".\n\nContext: Sendentary lifestyle is one of the major causes of increasing heart issues.\n\nQuestion: What can be one of the possible causes of heart disease today?\n'

In [6]:
# Full pipeline: fill the prompt, query the model, parse the reply to a string.
chain = prompt | model | parser

# The input mapping must supply both placeholders used by the prompt template.
chain.invoke(
    dict(
        context="Sendentary lifestyle is one of the major causes of increasing heart issues.",
        question="What can be one of the possible causes of heart disease today?",
    )
)

'One possible cause of heart disease today is a sedentary lifestyle.'

In [7]:
import tempfile
import whisper
from pytube import YouTube

# Transcribe the video's audio with Whisper, but only once: the transcript is
# cached in video_transcript.txt, and the download and transcription are
# skipped whenever that file already exists.
if not os.path.exists("video_transcript.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    # first() gives None when no stream matches the filter; fail here with a
    # clear message instead of an AttributeError on audio.download() below.
    if audio is None:
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    whisper_model = whisper.load_model("base")

    # The downloaded audio is only needed while it is being transcribed, so it
    # lives in a temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    # A dedicated name for the handle: the audio path and the output file no
    # longer share one variable.
    with open("video_transcript.txt", "w") as transcript_file:
        transcript_file.write(transcription)

In [8]:
# import tempfile
# import whisper

# # Assuming your downloaded audio file is named "Doc_pat_conv.mp3" in the same directory
# audio_file = "Doc_pat_conv.mp3"

# whisper_model = whisper.load_model("base")
# option = whisper.DecodingOptions(language='en', fp16=False)

# transcription = whisper_model.transcribe(audio_file)

# print(transcription['text'])

# #with open("video_transcript.txt", "w") as file:
# #   file.write(transcription)

In [8]:
# Load the cached transcript back into memory; this also covers the case where
# the transcription step was skipped because the file already existed.
with open("video_transcript.txt", "r") as file:
    transcription = file.read()

# Preview only the first 100 characters instead of dumping the whole text.
transcription[0:100]

"A The Don't Is Conversation. Good morning, Mr. Johnson. How are you feeling today? Good morning, doc"

In [9]:
from langchain_community.document_loaders import TextLoader

# Read the transcript file through LangChain's text loader so later steps
# receive document objects instead of a raw string.
loader = TextLoader(file_path="video_transcript.txt")
text_documents = loader.load()

text_documents



In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured for chunks of at most 100 characters, with 20 characters
# of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)

# Preview the first five chunks; the result is displayed only, not stored.
text_splitter.split_documents(text_documents)[0:5]

[Document(page_content="A The Don't Is Conversation. Good morning, Mr. Johnson. How are you feeling today? Good morning,", metadata={'source': 'video_transcript.txt'}),
 Document(page_content="Good morning, doctor. I've been experiencing some pain in my chest and shortness of breath. It's", metadata={'source': 'video_transcript.txt'}),
 Document(page_content="of breath. It's been bothering me for a few weeks now. I'm glad you came in. It's important not to", metadata={'source': 'video_transcript.txt'}),
 Document(page_content='important not to ignore any chest-related symptoms. Let me ask you a few questions to better', metadata={'source': 'video_transcript.txt'}),
 Document(page_content='questions to better understand your condition. Have you noticed if these symptoms occur during any', metadata={'source': 'video_transcript.txt'})]

In [11]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding client built with library defaults. No API key is passed here, so
# it presumably picks up OPENAI_API_KEY from the environment — TODO confirm.
embeddings = OpenAIEmbeddings()

In [12]:
import os
import openai
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone as PineconeClient
from langchain_pinecone import PineconeVectorStore

# Credentials must not be stored in the notebook. The Pinecone key is read
# from the environment; load_dotenv() was already called in the first cell, so
# an entry in the .env file works as well.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to your .env file."
    )
INDEX_NAME = "voice-text"

# Direct client and index handle. Neither is used by the upload below; they
# are kept so any later cell that refers to `pc` or `index` keeps working.
pc = PineconeClient(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# Embedding client built with the explicit OpenAI key. This instance is now
# the one handed to the vector store (previously it was created but unused).
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Index the transcript chunk by chunk rather than as one whole document, so the
# retriever has several passages to choose from instead of a single vector.
# from_documents() receives no key argument; the original cell exported the key
# to os.environ for it, which is unnecessary now that it comes from there.
pinecone = PineconeVectorStore.from_documents(
    text_splitter.split_documents(text_documents),
    embedding,
    index_name=INDEX_NAME,
)

  from tqdm.autonotebook import tqdm


In [14]:
from langchain_core.runnables import RunnablePassthrough


# Retrieval chain: the incoming question is passed through unchanged under
# "question", while "context" is filled by the Pinecone retriever; both feed
# the prompt, then the model, then the string parser.
chain = {
    "question": RunnablePassthrough(),
    "context": pinecone.as_retriever(),
} | prompt | model | parser

chain.invoke(
    "since how long has this been bothering the patient?"
)


'The patient has been experiencing chest pain and shortness of breath for a few weeks.'