# Speaker Diarization

In [100]:
from pyannote.audio import Pipeline
import torch
import os
from dotenv import main

# Load key/value pairs from a .env file into the process environment
# (presumably where HUGGING_FACE_KEY, read later via os.getenv, is kept — verify).
main.load_dotenv()

True

In [101]:
# Input recording; the same path is fed to both the diarization and the transcription steps.
audio_file_path = "./audio/1.mp3"

In [102]:
# Pick the compute target once: first CUDA GPU with half precision when available,
# otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

In [103]:
# Access token for downloading the pyannote pipeline, read from the environment.
HF_TOKEN = os.getenv("HUGGING_FACE_KEY")
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN)
# Move the pipeline to the device selected above instead of a hard-coded "cuda",
# so the notebook also runs on machines without a GPU.
# NOTE(review): the name `pipeline` is rebound by the later
# `from transformers import ... pipeline` import, so the diarization cells must be
# executed before the Voice Transcription section.
pipeline.to(torch.device(device))

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x2ce986a8210>

In [104]:
# Run the diarization pipeline on the recording; the result is written out as RTTM in the next cell.
diarization = pipeline(audio_file_path)

In [105]:
# Persist the diarization result in RTTM form. The encoding is pinned so the file
# content does not depend on the platform's default text encoding.
with open("audio.rttm", "w", encoding="utf-8") as rttm:
    diarization.write_rttm(rttm)

# Voice Transcription

In [106]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [107]:
# Hugging Face model id of the speech-to-text checkpoint loaded in the next cell.
model_id = "distil-whisper/distil-small.en"

In [108]:
# Processor bundle (tokenizer + feature extractor) belonging to the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# Load the speech seq2seq model in the configured precision and place it on the
# selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

In [109]:
# Build the speech-recognition pipeline around the already-loaded model and processor.
# chunk_length_s enables chunked long-form transcription: the recording is split into
# short windows that are transcribed separately and stitched together, instead of the
# model being handed more audio than a single window can hold.
# NOTE(review): 15 s is the chunk length suggested by the distil-whisper model card —
# confirm it is the right value for this checkpoint.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=15,
    torch_dtype=torch_dtype,
    device=device,
)

In [110]:
# Transcribe the recording; later cells read the transcript from result["text"].
result = pipe(audio_file_path)


In [111]:
# Save the raw transcript. UTF-8 is set explicitly so non-ASCII characters in the
# transcript cannot fail under a narrower platform default encoding.
with open("transcription.txt", "w", encoding="utf-8") as txt:
    txt.write(result["text"])

# Sentiment Analysis

In [112]:
from nltk.tokenize import sent_tokenize

In [113]:
# Sentence-level sentiment classifier. It is placed on the device chosen at the top of
# the notebook rather than a hard-coded "cuda", which fails on CPU-only machines.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)


In [114]:
def analyze_sentiment(text):
    """Classify the sentiment of every sentence in ``text``.

    Args:
        text: Text to analyse. It is split into sentences with ``sent_tokenize``.

    Returns:
        A list with one dict per sentence, in original order, each holding
        ``'sentence'`` (the sentence text), ``'sentiment'`` (the classifier's
        label) and ``'score'`` (the classifier's score for that label).
        An empty list is returned when no sentences are found.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return []

    # Classify all sentences in one pipeline call instead of one call per sentence.
    # truncation=True keeps an over-long sentence from exceeding the model's maximum
    # input length and raising.
    predictions = sentiment_analyzer(sentences, truncation=True)

    return [
        {
            'sentence': sentence,
            'sentiment': prediction['label'],
            'score': prediction['score'],
        }
        for sentence, prediction in zip(sentences, predictions)
    ]

In [115]:
# Write one "<sentence> - <label> - <score>" line per sentence. UTF-8 is set explicitly
# so non-ASCII characters in the transcript cannot fail under a narrower platform
# default encoding.
with open("sentiment_analysis.txt", "w", encoding="utf-8") as txt:
    for entry in analyze_sentiment(result["text"]):
        txt.write(f"{entry['sentence']} - {entry['sentiment']} - {entry['score']}\n")

# Summarization

In [116]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import PromptTemplate

In [117]:
# LLM used for summarisation, addressed through Ollama by model name.
llm = OllamaLLM(model="llama3.1")

# Instruction prompt; the {transcription} placeholder is filled in when the chain runs.
template = """
    You are an expert in the field of audio transcription. Your task is to summarize the audio transcription
    and provide a well structured summary of the audio transcription. The conversation will be usually among 
    2 or more speakers. Along with the summary, you are also tasked with extracting any important keywords 
    from the transcription. Do not hallucinate or provide any false information. Do not provide any other information or
    notes. Only provide the summary and the keywords in markdown format.
    
    This is the audio transcription:
    {transcription}
"""

# Compose prompt -> LLM into a single runnable.
prompt = PromptTemplate.from_template(template)
chain = prompt | llm

In [118]:
# Feed the transcript into the prompt and run the summarisation chain.
summary_result = chain.invoke(
    {"transcription": result["text"]},
)

In [119]:
# Save the generated summary. UTF-8 is set explicitly because LLM output frequently
# contains non-ASCII characters (bullets, typographic quotes, emoji) that a narrower
# platform default encoding cannot represent.
with open("summary.txt", "w", encoding="utf-8") as txt:
    txt.write(summary_result)