In [None]:
!pip install --quiet git+https://github.com/m-bain/whisperx.git
!pip install --quiet speechbrain fastapi uvicorn fastapi[all] werkzeug fastapi-cors
# Install required libraries
!pip install --quiet pyngrok fastapi uvicorn


In [None]:

import whisperx
import gc
from IPython.display import display
import torch
import os
import subprocess
import datetime
import warnings
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from speechbrain.inference.speaker import SpeakerRecognition

In [None]:
# Working folders: raw uploads go in one, generated transcripts in the other.
for folder in ("uploads", "results"):
    os.makedirs(folder, exist_ok=True)

# API application. CORS accepts any origin, method and header.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Files written to "results" are served statically under /results.
app.mount("/results", StaticFiles(directory="results"), name="results")

# Suppress every Python warning raised from this point on.
warnings.filterwarnings("ignore")

# Pretrained SpeechBrain speaker-recognition model
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")



In [None]:
# Configurations
WHISPER_MODEL = "large-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 4
# Full precision on GPU, 8-bit quantised weights on CPU.
compute_type = "float32" if DEVICE == "cuda" else "int8"

# The Hugging Face access token is a secret and must never be committed with
# the notebook. Read it from the environment and fail early, with a clear
# message, when it is missing.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )

# Speech-to-text model and speaker-diarization pipeline, loaded once per kernel.
model = whisperx.load_model(WHISPER_MODEL, device=DEVICE, compute_type=compute_type, language="en")
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=DEVICE)

# Known speaker embeddings: speaker name -> embedding array
known_speakers = {}

In [None]:

# Utility functions
def ensure_wav_format(file_path):
    """Convert an audio file to WAV with ffmpeg and return the WAV path.

    The output is written next to the input, with the extension replaced by
    ``.wav``. An existing output file is overwritten.

    Args:
        file_path: Path of the audio file to convert.

    Returns:
        Path of the WAV file.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    wav_path = os.path.splitext(file_path)[0] + ".wav"

    # When the input already *is* the target file there is nothing to convert,
    # and ffmpeg cannot write its output onto its own input.
    if (
        os.path.exists(file_path)
        and os.path.exists(wav_path)
        and os.path.samefile(file_path, wav_path)
    ):
        return wav_path

    # check=True surfaces conversion failures here instead of handing back a
    # path to a file that was never created.
    subprocess.run(["ffmpeg", "-y", "-i", file_path, wav_path], check=True)
    return wav_path

def get_file_embedding(file_path):
    """Return the speaker embedding of an audio file as a NumPy array.

    The audio is loaded and encoded by the module-level ``verification``
    model; the result is squeezed, moved to the CPU and converted to NumPy.
    """
    waveform = verification.load_audio(file_path)
    return verification.encode_batch(waveform).squeeze().cpu().numpy()

def time_format(secs):
    """Format a duration in seconds as ``H:MM:SS``, rounded to whole seconds."""
    whole_seconds = round(secs)
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)

In [None]:
from pydantic import BaseModel
from typing import List
from fastapi import UploadFile, Form, File

class EnrollRequest(BaseModel):
    """Pydantic schema for a speaker-enrollment request.

    NOTE(review): nothing visible in this notebook references this model; the
    /enroll endpoint declares ``num_speakers`` and ``names`` as form fields
    instead. Confirm whether it is still needed.
    """
    # Count of speakers in the request.
    num_speakers: int
    # Speaker names; presumably one per speaker — TODO confirm.
    names: List[str]



In [None]:
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Depends

@app.post("/enroll")
async def enroll_speakers(
    num_speakers: int = Form(...),
    names: List[str] = Form(...),
    files: List[UploadFile] = File(...),
):
    """Enroll reference voices so later transcriptions can name the speakers.

    Each uploaded file is paired with the name at the same position, saved
    under ``uploads/``, converted to WAV and turned into a speaker embedding
    that is stored in ``known_speakers`` under that name.

    Args:
        num_speakers: Expected number of speakers in this request.
        names: Speaker names, one per speaker.
        files: Audio samples, one per speaker, in the same order as ``names``.

    Returns:
        A dict with a status message and the names of all enrolled speakers.

    Raises:
        HTTPException: 400 when the number of names or files differs from
            ``num_speakers``, or when an uploaded filename is unusable.
    """
    if len(names) != num_speakers:
        raise HTTPException(status_code=400, detail="Number of names doesn't match number of speakers")

    if len(files) != num_speakers:
        raise HTTPException(status_code=400, detail="Number of audio files doesn't match number of speakers")

    # The filename is chosen by the client. Keep only its final component so
    # that a name such as "../../etc/passwd" cannot escape the upload folder.
    # Every filename is validated before any speaker is enrolled, so a bad
    # upload cannot leave the request half-applied.
    safe_names = []
    for name, audio_file in zip(names, files):
        safe_name = os.path.basename((audio_file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail=f"Invalid audio filename for speaker '{name}'")
        safe_names.append(safe_name)

    for name, audio_file, safe_name in zip(names, files, safe_names):
        filepath = os.path.join('uploads', safe_name)
        with open(filepath, "wb") as buffer:
            buffer.write(await audio_file.read())

        wav_path = ensure_wav_format(filepath)
        known_speakers[name] = get_file_embedding(wav_path)

    return {
        "message": "Speakers enrolled successfully",
        "enrolled_speakers": list(known_speakers.keys())
    }
    
@app.post("/transcribe")
async def transcribe_conversation(audio: UploadFile = File(...)):
    """Transcribe an uploaded conversation and label each segment with a speaker.

    Steps: save the upload and convert it to WAV, transcribe and align it with
    WhisperX, diarize it, match each diarized speaker against the enrolled
    embeddings in ``known_speakers``, then write a text transcript to
    ``results/``.

    Args:
        audio: The uploaded audio file.

    Returns:
        A dict holding a status message, the URL path of the transcript file,
        the transcript text, the detected language and the per-speaker
        similarity details (keyed by diarized speaker label).

    Raises:
        HTTPException: 400 when the uploaded filename is unusable.
    """
    # The filename is chosen by the client. Keep only its final component so
    # it cannot point outside the "uploads" and "results" folders.
    safe_name = os.path.basename((audio.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Invalid audio filename")

    filepath = os.path.join("uploads", safe_name)
    with open(filepath, "wb") as f:
        f.write(await audio.read())
    wav_path = ensure_wav_format(filepath)

    # Transcribe audio
    audio_data = whisperx.load_audio(wav_path)
    result1 = model.transcribe(audio_data, batch_size=batch_size)
    model_a, metadata = whisperx.load_align_model(language_code=result1["language"], device=DEVICE)
    result2 = whisperx.align(result1["segments"], model_a, metadata, audio_data, DEVICE, return_char_alignments=False)

    # Diarization. With nobody enrolled, impose no lower bound on the number
    # of speakers rather than passing 0.
    diarize_segments = diarize_model(audio_data, min_speakers=len(known_speakers) or None)
    result = whisperx.assign_word_speakers(diarize_segments, result2)

    # Speaker matching with detailed accuracy
    sample_rate = 16000  # samples per second used when slicing audio_data
    speaker_mapping = {}
    speaker_accuracies = {}
    available_known_speakers = list(known_speakers.keys())
    diarized_speakers = list(diarize_segments.speaker.unique())

    for diarized_speaker in diarized_speakers:
        # Each enrolled name is used at most once; stop when none are left.
        # This also covers the case where nobody has been enrolled.
        if not available_known_speakers:
            break

        # Build the sample from this speaker's own turns only. Slicing from
        # the first start to the last end would also include everything the
        # other speakers said in between.
        speaker_segments = diarize_segments[diarize_segments.speaker == diarized_speaker]
        chunks = [
            audio_data[int(start * sample_rate):int(end * sample_rate)]
            for start, end in zip(speaker_segments.start, speaker_segments.end)
        ]
        chunks = [torch.tensor(chunk) for chunk in chunks if len(chunk) > 0]
        if not chunks:
            continue  # no usable audio; handled as an unnamed speaker below
        segment_audio = torch.cat(chunks)

        diarized_embedding = verification.encode_batch(segment_audio.unsqueeze(0)).squeeze().cpu()

        # Compute similarity scores with detailed tracking
        scores = {}
        for name in available_known_speakers:
            similarity = verification.similarity(
                diarized_embedding.unsqueeze(0),
                torch.tensor(known_speakers[name]).unsqueeze(0)
            ).item()
            scores[name] = {
                'similarity_score': similarity,
                'confidence_percentage': round(similarity * 100, 2)
            }

        # Find best match
        best_match = max(scores, key=lambda k: scores[k]['similarity_score'])
        speaker_mapping[diarized_speaker] = best_match
        speaker_accuracies[diarized_speaker] = scores[best_match]
        available_known_speakers.remove(best_match)

    # Assign unnamed speakers
    for diarized_speaker in diarized_speakers:
        if diarized_speaker not in speaker_mapping:
            unnamed_speaker = f"Unknown_Speaker_{len(speaker_mapping) + 1}"
            speaker_mapping[diarized_speaker] = unnamed_speaker
            speaker_accuracies[diarized_speaker] = {
                'similarity_score': 0,
                'confidence_percentage': 0
            }

    # Enhance segments with accuracy information
    for segment in result["segments"]:
        # Keep the diarized label: speaker_accuracies is keyed by it, so it
        # must be read before "speaker" is replaced with the display name.
        diarized_label = segment.get("speaker", "")
        segment["speaker"] = speaker_mapping.get(diarized_label, "Unknown")

        # Transcription confidence: use the segment's own value when present,
        # otherwise average the per-word scores.
        # NOTE(review): assumes aligned words carry a "score" entry — confirm
        # against the installed whisperx version.
        confidence = segment.get("confidence")
        if confidence is None:
            word_scores = [word["score"] for word in segment.get("words", []) if "score" in word]
            confidence = sum(word_scores) / len(word_scores) if word_scores else 0
        segment["transcription_confidence"] = round(float(confidence) * 100, 2)

        # Speaker verification accuracy
        segment["speaker_confidence"] = speaker_accuracies.get(
            diarized_label,
            {'confidence_percentage': 0}
        )['confidence_percentage']

    # Build the transcript once, then save it; no need to read the file back.
    transcript_data = "".join(
        f"[{time_format(segment['start'])} - {time_format(segment['end'])}] "
        f"{segment['speaker']} (Speaker Conf: {segment['speaker_confidence']}%, "
        f"Transcription Conf: {segment['transcription_confidence']}%): "
        f"{segment['text']}\n"
        for segment in result["segments"]
    )
    transcript_name = f"transcription_{safe_name}.txt"
    transcription_path = os.path.join("results", transcript_name)
    with open(transcription_path, "w", encoding="utf-8") as f:
        f.write(transcript_data)

    return {
        "message": "Transcription completed",
        "transcription_file": f"/results/{transcript_name}",
        "transcription": transcript_data,
        "overall_accuracy": {
            "language_detected": result1["language"],
            "speaker_verification_details": speaker_accuracies
        }
    }

In [None]:
from pyngrok import ngrok
import uvicorn
import threading

# Start ngrok tunnel
# The ngrok auth token is a secret and must never be committed with the
# notebook. Read it from the environment and fail early when it is missing.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
if not NGROK_AUTH_TOKEN:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable to your ngrok auth token "
        "before running this cell."
    )
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
public_url = ngrok.connect(8000).public_url
print(f"Public URL: {public_url}")


def run_server():
    """Serve the FastAPI app with uvicorn on all interfaces, port 8000."""
    uvicorn.run(app, host="0.0.0.0", port=8000, reload=False)


# Start FastAPI server in a separate daemon thread so the cell returns
# immediately.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()