# Speaker Diarization

In [1]:
from pyannote.audio import Pipeline
import torch
import os
from dotenv import main

# Load variables from a local .env file into the process environment so the
# os.getenv lookup for the Hugging Face token can find them.
main.load_dotenv()

True

In [2]:
# Input recording shared by the diarization and transcription steps.
audio_file_path = "./audio/1.mp3"

In [3]:
# Choose the compute device and its matching precision in one place:
# half precision on the first CUDA GPU when available, full precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

In [4]:
# Read the Hugging Face access token from the environment.
HF_TOKEN = os.getenv("HUGGING_FACE_KEY")
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN)

# NOTE(review): pyannote.audio 3.x appears to return None (instead of raising)
# when the gated model cannot be fetched — confirm for the installed version.
# Fail here with a clear message rather than an AttributeError on `.to`.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that "
        "HUGGING_FACE_KEY is set and that the model's user conditions were accepted."
    )

# Use the device selected earlier instead of a hard-coded "cuda", so the
# notebook also runs on machines without a GPU.
pipeline.to(torch.device(device))

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x1bf6b8d5cd0>

In [5]:
# Run speaker diarization on the recording.
# NOTE: `pipeline` here is the pyannote pipeline object. The later
# `from transformers import ... pipeline` rebinds that name, so this cell
# cannot be re-run after that import without re-creating the pyannote pipeline.
diarization = pipeline(audio_file_path)

In [6]:
# Persist the diarization result to "audio.rttm" via its RTTM writer.
with open("audio.rttm", "w") as rttm_file:
    diarization.write_rttm(rttm_file)

# Voice Transcription

In [7]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [8]:
# Hugging Face model id of the speech-to-text checkpoint loaded below.
model_id = "distil-whisper/distil-small.en"

In [9]:
# Load the speech-to-text weights in the precision chosen for the device and
# move them onto that device in the same expression.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies the tokenizer and feature extractor for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

In [10]:
# Assemble the speech-recognition pipeline from the already-loaded model and
# the processor's tokenizer / feature extractor.
# NOTE(review): no chunk_length_s / return_timestamps option is set here; long
# recordings may need one — verify against the transformers ASR pipeline docs.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

In [11]:
# Transcribe the recording; later cells read the transcript from result["text"].
result = pipe(audio_file_path)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [12]:
# Save the raw transcript. The explicit UTF-8 encoding keeps the write from
# failing on non-ASCII characters when the platform default is a legacy code page.
with open("transcription.txt", "w", encoding="utf-8") as txt:
    txt.write(result["text"])

# Sentiment Analysis

In [13]:
from nltk.tokenize import sent_tokenize

In [14]:
# Build the sentence-level sentiment classifier on the device selected earlier
# (instead of a hard-coded "cuda") so the notebook also runs without a GPU.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)


In [15]:
def analyze_sentiment(text):
    """Classify the sentiment of every sentence in ``text``.

    Args:
        text: Free-form text; it is split into sentences with ``sent_tokenize``.

    Returns:
        A list with one dict per sentence, in order, each holding
        ``'sentence'`` (the sentence text), ``'sentiment'`` (the predicted
        label) and ``'score'`` (the classifier's score for that label).
        An empty list is returned when no sentences are found.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to classify; skip the model call entirely.
        return []

    # Hand the whole list to the pipeline in one call instead of invoking it
    # once per sentence. truncation=True keeps an over-long sentence (e.g. an
    # unpunctuated transcript) from exceeding the model's maximum input length.
    predictions = sentiment_analyzer(sentences, truncation=True)

    return [
        {
            'sentence': sentence,
            'sentiment': prediction['label'],
            'score': prediction['score'],
        }
        for sentence, prediction in zip(sentences, predictions)
    ]

In [16]:
# Write one "<sentence> - <label> - <score>" line per sentence. The explicit
# UTF-8 encoding keeps the write from failing on non-ASCII characters when the
# platform default is a legacy code page.
with open("sentiment_analysis.txt", "w", encoding="utf-8") as txt:
    for entry in analyze_sentiment(result["text"]):
        txt.write(f"{entry['sentence']} - {entry['sentiment']} - {entry['score']}\n")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


# Summarization

In [17]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import PromptTemplate

In [18]:
# Instruction prompt for the summarizer; {transcription} is filled in at call time.
template = """
    You are an expert in the field of audio transcription. Your task is to summarize the audio transcription
    and provide a well structured summary of the audio transcription. The conversation will be usually among 
    2 or more speakers. Along with the summary, you are also tasked with extracting any important keywords 
    from the transcription. Do not hallucinate or provide any false information. Do not provide any other information or
    notes. Only provide the summary and the keywords in markdown format.
    
    This is the audio transcription:
    {transcription}
"""
prompt = PromptTemplate.from_template(template)

# Model served through Ollama that receives the rendered prompt.
llm = OllamaLLM(model="llama3.1")

# Pipe the rendered prompt into the model.
chain = prompt | llm

In [19]:
# Fill the prompt's {transcription} slot with the transcript and run the chain.
summary_result = chain.invoke(input={"transcription":result["text"]})

In [20]:
# Save the generated summary. The explicit UTF-8 encoding keeps the write from
# failing on non-ASCII characters (common in LLM markdown output) when the
# platform default is a legacy code page.
with open("summary.txt", "w", encoding="utf-8") as txt:
    txt.write(summary_result)