In [None]:
!pip install google-generativeai SpeechRecognition pydub nltk pyannote.audio torch torchvision torchaudio


In [None]:
import os
import json
import speech_recognition as sr
from pydub import AudioSegment
import google.generativeai as genai
from pyannote.audio import Pipeline
import nltk

In [None]:
# Download required NLTK resources
# Fetches the "punkt" resource through nltk.download.
# NOTE(review): no cell visible in this notebook calls an NLTK tokenizer —
# confirm this download (and the nltk dependency) is still needed.
nltk.download("punkt")

In [None]:
# Paths
# CALLS_FOLDER is the directory listed for input .wav recordings; OUTPUT_FOLDER
# is where the per-recording .json results are written.
# Both can be overridden with an environment variable of the same name so the
# notebook is not tied to one machine's layout. When nothing is set, both fall
# back to the original hard-coded location, and OUTPUT_FOLDER follows
# CALLS_FOLDER unless it is overridden separately.
CALLS_FOLDER = os.environ.get("CALLS_FOLDER", "/content/A")
OUTPUT_FOLDER = os.environ.get("OUTPUT_FOLDER", CALLS_FOLDER)


In [None]:
# Hugging Face access token used when downloading the diarization pipeline.
# It is read from the HF_TOKEN environment variable so the secret never has to
# be pasted into (and saved with) the notebook. When the variable is unset the
# value is an empty string, which is what this cell used before.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Load Pyannote Diarization Pipeline with authentication
DIARIZATION_MODEL = "pyannote/speaker-diarization"
diarization_pipeline = Pipeline.from_pretrained(DIARIZATION_MODEL, use_auth_token=HF_TOKEN)

# NOTE(review): Pipeline.from_pretrained may hand back None instead of raising
# when the model cannot be downloaded (e.g. missing or unauthorised token) —
# TODO confirm for the installed pyannote.audio version. Failing here gives a
# clear message instead of a "'NoneType' object is not callable" error later.
if diarization_pipeline is None:
    raise RuntimeError(
        f"Could not load diarization pipeline '{DIARIZATION_MODEL}'. "
        "Set the HF_TOKEN environment variable to a Hugging Face token that "
        "has access to this model."
    )


In [None]:
# --- Functions ---

def transcribe_audio(file_path):
    """
    Turn the audio file at ``file_path`` into text with SpeechRecognition.

    The file is opened with ``sr.AudioFile``, read in full, and sent to the
    recognizer's ``recognize_google`` method.

    Returns:
        The recognized text, or one of two placeholder strings:
        "Audio unclear or unintelligible." when ``sr.UnknownValueError`` is
        raised, and "SpeechRecognition error: <details>" when
        ``sr.RequestError`` is raised. Other exceptions propagate.
    """
    engine = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as wav:
            captured = engine.record(wav)
            return engine.recognize_google(captured)
    except sr.UnknownValueError:
        return "Audio unclear or unintelligible."
    except sr.RequestError as err:
        return "SpeechRecognition error: " + str(err)

def diarize_audio(file_path):
    """
    Run the module-level ``diarization_pipeline`` on one audio file.

    The pipeline is called with a dict holding a fixed ``"uri"`` of "call" and
    the file path under ``"audio"``.

    Returns:
        A list with one dict per track yielded by ``itertracks``, each holding
        ``"start"`` and ``"end"`` (taken from the turn) and ``"speaker"`` (the
        yielded label), in the order the pipeline result yields them.
    """
    annotation = diarization_pipeline({"uri": "call", "audio": file_path})
    return [
        {"start": span.start, "end": span.end, "speaker": label}
        for span, _track, label in annotation.itertracks(yield_label=True)
    ]

def query_gemini(prompt, model_name="gemini-1.5-pro"):
    """
    Send a prompt to a Gemini model via Google Generative AI and return its reply.

    Args:
        prompt: Text passed unchanged to ``generate_content``.
        model_name: Identifier handed to ``genai.GenerativeModel``. Defaults to
            "gemini-1.5-pro", the model this function previously hard-coded,
            so existing one-argument calls behave as before.

    Returns:
        The response's ``text`` with leading and trailing whitespace removed.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    # NOTE(review): reading ``response.text`` presumably raises when the reply
    # carries no text part (e.g. a blocked response) — confirm against the SDK;
    # callers in this notebook do not handle that case.
    return response.text.strip()

def extract_keywords(transcript, segments):
    """
    Ask Gemini (through ``query_gemini``) for the conversation's main keywords.

    The transcript and the speaker segments are interpolated into the prompt
    as-is, and the model is asked for a comma-separated list.

    Returns:
        The reply split on commas, with each piece stripped of surrounding
        whitespace and empty pieces dropped.
    """
    request = "".join([
        "Extract the main keywords from the following customer service conversation details.\n\n",
        f"Transcript: {transcript}\n",
        f"Speaker Segments: {segments}\n\n",
        "Return the keywords as a comma-separated list.",
    ])
    reply = query_gemini(request)

    cleaned = []
    for piece in reply.split(","):
        piece = piece.strip()
        if piece:  # skip blanks produced by stray or trailing commas
            cleaned.append(piece)
    return cleaned

def generate_insights(transcript, segments):
    """
    Ask Gemini (through ``query_gemini``) to analyse the conversation.

    The prompt requests insights about customer issues, trends, and pain
    points, and embeds the transcript and speaker segments as-is.

    Returns:
        The text returned by ``query_gemini`` for that prompt, unmodified.
    """
    task = (
        "Analyze the following customer service conversation and provide detailed insights "
        "about customer issues, trends, and pain points."
    )
    closing = "Provide a structured summary of the insights."
    return query_gemini(
        f"{task}\n\nTranscript: {transcript}\nSpeaker Segments: {segments}\n\n{closing}"
    )

def generate_suggestions(transcript, segments):
    """
    Ask Gemini (through ``query_gemini``) how the agent's responses could improve.

    The prompt embeds the transcript and speaker segments as-is and asks for
    one suggestion per line.

    Returns:
        The reply split on newline characters, with each line stripped of
        surrounding whitespace and blank lines dropped.
    """
    header = (
        "Based on the following customer service conversation, suggest improvements for the agent's responses. "
        "Provide actionable suggestions that address customer concerns.\n\n"
    )
    context = "Transcript: {}\nSpeaker Segments: {}\n\n".format(transcript, segments)
    footer = "List each suggestion on a new line."

    raw = query_gemini(header + context + footer)
    stripped_lines = (line.strip() for line in raw.split("\n"))
    return [line for line in stripped_lines if line]

def process_file(file_path):
    """
    Run the full analysis for one recording and collect the results.

    Prints the file's base name, then calls, in order: ``transcribe_audio``,
    ``diarize_audio``, ``extract_keywords``, ``generate_insights`` and
    ``generate_suggestions``. The last three each receive the transcript and
    the speaker segments.

    Returns:
        A dict with the keys "transcript", "speaker_segments", "keywords",
        "insights" and "suggestions", in that order.
    """
    print(f"Processing: {os.path.basename(file_path)}")

    report = {"transcript": transcribe_audio(file_path)}
    report["speaker_segments"] = diarize_audio(file_path)

    text, turns = report["transcript"], report["speaker_segments"]
    analyses = (
        ("keywords", extract_keywords),
        ("insights", generate_insights),
        ("suggestions", generate_suggestions),
    )
    for field, analyse in analyses:
        report[field] = analyse(text, turns)
    return report

In [None]:
import google.generativeai as genai

# Configure the Google Generative AI client.
# The API key is read from the GOOGLE_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. When the
# variable is unset an empty string is passed, which is what this cell did before.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

In [None]:
# --- Main Processing Loop ---
# Processes every .wav file in CALLS_FOLDER and writes one JSON result per
# recording into OUTPUT_FOLDER, named "<original filename>.json".

# Make sure the destination exists before any expensive processing starts.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

failed_files = []

# Sorted so every run visits the recordings in the same order, independent of
# the order in which the filesystem happens to list them.
for filename in sorted(os.listdir(CALLS_FOLDER)):
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(CALLS_FOLDER, filename)
    output_file = os.path.join(OUTPUT_FOLDER, f"{filename}.json")
    try:
        result = process_file(file_path)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)
    except Exception as exc:
        # One bad recording must not discard the rest of the batch: remember it,
        # keep going, and fail loudly once everything else has been attempted.
        failed_files.append(filename)
        print(f"❌ Failed {filename}: {exc!r}")
        continue
    print(f"✅ Processed {filename} → Saved to {output_file}")

if failed_files:
    raise RuntimeError(
        f"{len(failed_files)} file(s) could not be processed: {', '.join(failed_files)}"
    )