In [None]:
# Mount Google Drive into the Colab runtime; the enrollment audio used further
# down is read from a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch torchaudio speechbrain librosa numpy streamlit streamlit_chat torch transformers faiss-cpu google-generativeai flashrank langchain pydantic
!pip install python-docx tiktoken torch torchaudio librosa ffmpeg-python speechbrain scipy tqdm  json5 pyngrok
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-i_pw1q0d
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-i_pw1q0d
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import os
import torch
import torchaudio
import numpy as np
import librosa
from speechbrain.inference import SpeakerRecognition
import tempfile
import random

# Make sure an output directory exists before anything is written into it
def create_directory(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    ``exist_ok=True`` makes the call idempotent and removes the window between
    a separate existence check and the creation, during which another process
    could create the directory and make ``os.makedirs`` raise.
    """
    os.makedirs(directory, exist_ok=True)

# Load one recording and prepare the waveform that is fed to the speaker encoder
def load_audio_and_extract_embeddings(audio_file_path, pre_emphasis=None, top_db=20):
    """Load an audio file and return a mono, 16 kHz, silence-trimmed waveform.

    Despite its name, this function does not compute embeddings; it only
    prepares the signal.

    Args:
        audio_file_path: Path passed to ``torchaudio.load``.
        pre_emphasis: Optional coefficient ``a`` of the filter
            ``y[t] = x[t] - a * x[t-1]`` applied before silence removal.
            ``None`` (default) applies no filter. The previous implementation
            computed this filter with ``a = 0.97`` but then replaced its result
            with slices of the *unfiltered* waveform whenever silence detection
            returned at least one interval, so the filter never reached the
            caller in that case. The default keeps that effective behaviour,
            which also matches app.py, where audio is passed to the encoder
            without pre-emphasis. Pass ``0.97`` to actually enable the filter.
        top_db: Threshold (dB below peak) forwarded to
            ``librosa.effects.split`` to decide what counts as silence.

    Returns:
        ``(signal, 16000)`` where ``signal`` is a float tensor of shape
        ``(1, n_samples)``.
    """
    target_sample_rate = 16000

    signal, sample_rate = torchaudio.load(audio_file_path)

    # Down-mix multi-channel audio to a single channel.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Bring everything to the 16 kHz rate returned to the caller.
    if sample_rate != target_sample_rate:
        signal = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(signal)

    waveform = signal.squeeze(0).numpy()

    # Silence boundaries are detected on the unfiltered waveform so that the
    # kept sample ranges do not depend on whether pre-emphasis is enabled.
    intervals = librosa.effects.split(waveform, top_db=top_db)

    if pre_emphasis is not None and waveform.size > 0:
        waveform = np.append(waveform[0], waveform[1:] - pre_emphasis * waveform[:-1])

    # Keep only the non-silent ranges; with no ranges the waveform is returned whole.
    if len(intervals) > 0:
        waveform = np.concatenate([waveform[start:end] for start, end in intervals])

    return torch.FloatTensor(waveform).unsqueeze(0), target_sample_rate

# Data augmentation: derive perturbed copies of one waveform
def augment_audio(signal, sample_rate, seed=None):
    """Return the original waveform plus eight augmented variants.

    Variants, in order: the original, time-stretch at 0.9x and 1.1x, pitch
    shift by -2 and +2 semitones, additive Gaussian noise at 20 dB and 15 dB
    SNR, and gain of 0.8 and 1.2.

    Args:
        signal: Tensor holding a single waveform (it is squeezed to 1-D for
            the librosa/numpy operations).
        sample_rate: Sampling rate, forwarded to ``librosa.effects.pitch_shift``.
        seed: Optional seed for the noise variants. ``None`` (default) draws
            from NumPy's global random state, as before; an integer makes the
            noisy variants reproducible from run to run.

    Returns:
        List of nine tensors; the first element is ``signal`` itself.
    """
    def as_row_tensor(samples):
        # All derived variants are returned as (1, n_samples) float tensors.
        return torch.FloatTensor(samples).unsqueeze(0)

    augmented_signals = [signal]  # Start with original signal
    waveform = signal.squeeze().numpy()

    # Speed perturbation: 10% slower and 10% faster.
    for rate in (0.9, 1.1):
        augmented_signals.append(as_row_tensor(librosa.effects.time_stretch(waveform, rate=rate)))

    # Pitch shifting by two semitones down and up.
    for n_steps in (-2, 2):
        augmented_signals.append(
            as_row_tensor(librosa.effects.pitch_shift(waveform, sr=sample_rate, n_steps=n_steps))
        )

    # Additive noise scaled to a target signal-to-noise ratio.
    if seed is None:
        draw_noise = np.random.randn
    else:
        draw_noise = np.random.default_rng(seed).standard_normal
    signal_power = np.mean(waveform ** 2)  # does not depend on the SNR, so computed once
    for snr_db in (20, 15):
        noise = draw_noise(len(waveform))
        noise_power = np.mean(noise ** 2)
        snr = 10 ** (snr_db / 10)  # dB -> linear power ratio
        scale = np.sqrt(signal_power / (noise_power * snr))
        augmented_signals.append(as_row_tensor(waveform + scale * noise))

    # Volume change: 20% quieter and 20% louder.
    for volume in (0.8, 1.2):
        augmented_signals.append(signal * volume)

    return augmented_signals

# Enrollment: build one reference embedding per speaker, with data augmentation
def _embed_variant(classifier, waveform, sample_rate):
    """Return one embedding (1-D numpy array) for a single waveform, or None.

    Waveforms longer than 5 seconds are cut into 3-second chunks with 50%
    overlap and the chunk embeddings are averaged; shorter waveforms are
    encoded in one pass. Every tensor handed to ``encode_batch`` is 2-D,
    ``(1, n_samples)``.
    """
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    n_samples = waveform.shape[-1]

    if n_samples > 5 * sample_rate:
        chunk_length = int(3 * sample_rate)  # 3-second chunks
        hop_length = int(1.5 * sample_rate)  # 50% overlap
        pieces = [
            waveform[:, start:start + chunk_length]
            for start in range(0, n_samples - chunk_length + 1, hop_length)
        ]
    else:
        pieces = [waveform]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embeddings = [
            classifier.encode_batch(piece).squeeze().detach().cpu().numpy()
            for piece in pieces
        ]

    if not embeddings:
        return None
    return np.mean(np.array(embeddings), axis=0)


def train_model(audio_files, labels, output_dir="embeddings4"):
    """Compute and save one L2-normalised speaker embedding per label.

    For each ``(audio_file, label)`` pair the recording is loaded with
    ``load_audio_and_extract_embeddings``, expanded with ``augment_audio``,
    every variant is embedded with the pretrained ECAPA speaker encoder, and
    the mean of the variant embeddings is saved as ``<output_dir>/<label>.npy``.
    No model weights are trained.

    Compared with the previous version, short recordings (<= 5 s) are now
    passed to ``encode_batch`` as ``(1, n_samples)`` like the chunked path;
    previously an extra ``unsqueeze(0)`` made them 3-D and only that path was
    wrapped in a try/except. ``output_dir`` is also created if missing.

    Args:
        audio_files: Iterable of audio file paths.
        labels: Iterable of speaker labels, paired positionally with
            ``audio_files``.
        output_dir: Directory that receives the ``.npy`` files. The default
            keeps the previous relative location ``embeddings4``.
    """
    classifier = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir_ecapa")
    os.makedirs(output_dir, exist_ok=True)

    for audio_file, label in zip(audio_files, labels):
        print(f"Processing {label} from {audio_file}")
        signal, sample_rate = load_audio_and_extract_embeddings(audio_file)

        # Length of the prepared signal (after whatever trimming the loader did).
        segments_duration = signal.shape[-1] / sample_rate
        print(f"Duration of {audio_file}: {segments_duration:.2f} seconds")

        augmented_signals = augment_audio(signal, sample_rate)
        print(f"Created {len(augmented_signals)} augmented versions (including original)")

        all_embeddings = []
        for idx, aug_signal in enumerate(augmented_signals):
            # Best effort per variant: one failing augmentation must not
            # discard the embeddings obtained from the others.
            try:
                emb = _embed_variant(classifier, aug_signal, sample_rate)
            except Exception as e:
                print(f"Error processing augmented signal {idx}: {str(e)}")
                continue
            if emb is not None:
                all_embeddings.append(emb)

        if not all_embeddings:
            print(f"Warning: No successful embeddings generated for {label}")
            continue

        # Average over the original and augmented variants.
        segment_embedding = np.mean(np.array(all_embeddings), axis=0)

        # Unit-normalise (needed for cosine similarity); skip a zero vector
        # rather than dividing by zero.
        norm = np.linalg.norm(segment_embedding)
        if norm > 0:
            segment_embedding = segment_embedding / norm

        np.save(os.path.join(output_dir, f"{label}.npy"), segment_embedding)
        print(f"Saved enhanced embedding for {label} combining {len(all_embeddings)} variants")

if __name__ == "__main__":
    # Folder that the Streamlit app later reads speaker embeddings from.
    create_directory("/content/embeddings4")

    # (label, recording) pairs to enroll: one reference recording per speaker.
    enrollment = [
        ("Nisha", "/content/drive/MyDrive/PropelloAI_assign-1/nisha_audio.mp3"),
    ]
    labels = [label for label, _ in enrollment]
    audio_files = [recording for _, recording in enrollment]

    # Compute and save one embedding per enrolled speaker.
    train_model(audio_files, labels)


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmpdir_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmpdir_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmpdir_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmpdir_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmpdir_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmpdir_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmpdir_ecapa/classif

Processing Nisha from /content/drive/MyDrive/PropelloAI_assign-1/nisha_audio.mp3
Duration of /content/drive/MyDrive/PropelloAI_assign-1/nisha_audio.mp3: 10.43 seconds
Created 9 augmented versions (including original)
Saved enhanced embedding for Nisha combining 9 variants


In [None]:
%%writefile app.py
from docx import Document
import os
import torch
import json
import numpy as np
import torchaudio
import pathlib
import tempfile
import librosa
import ffmpeg
import streamlit as st
from streamlit_chat import message
from tqdm import tqdm
from functools import lru_cache
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import cosine
from speechbrain.inference import SpeakerRecognition
import whisper
from transformers import AutoTokenizer, AutoModel
import faiss
import google.generativeai as genai
from flashrank.Ranker import Ranker, RerankRequest
from io import BytesIO
from langchain.memory import ConversationBufferMemory
import time
import requests
from speechbrain.inference.VAD import VAD
start_time = time.time()
# Page setup stays the first Streamlit call of the script; the credential
# check below may need to render an error message after it.
st.set_page_config(page_title="AI Meeting Assistant", layout="wide")
st.title("📄 AI-Powered Meeting Assistant")
# The Gemini API key is a credential and must not live in source control.
# It is read from the GOOGLE_API_KEY environment variable; set it in the
# environment that launches `streamlit run app.py`. Any key that was
# previously committed in this file should be revoked.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    st.error("GOOGLE_API_KEY is not set. Export it before launching the app.")
    st.stop()
genai.configure(api_key=_gemini_api_key)
# Model loaders. Streamlit re-executes this script on every interaction, which
# re-creates the functions below; a functools.lru_cache attached to them would
# be rebuilt empty each time. st.cache_resource keeps the loaded model alive
# across reruns and sessions.
@st.cache_resource
def load_whisper_model():
    """Load the Whisper "large" model once, on GPU when one is available."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return whisper.load_model("large", device=device)

# st.cache_resource (instead of functools.lru_cache) so the encoder survives
# Streamlit's script reruns, which redefine this function each time.
@st.cache_resource
def load_classifier_model():
    """Load the pretrained ECAPA speaker encoder once and reuse it."""
    return SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir_ecapa")

def load_embeddings(embedding_dir):
    """Load every ``<label>.npy`` file in ``embedding_dir``.

    The speaker label is the file name with only its final ``.npy`` extension
    removed. (Splitting on the first ".npy" occurrence, as was done before,
    truncated labels that contain ".npy" themselves and could make two files
    collide on the same label.) Files are visited in sorted order so the
    resulting dict has a deterministic key order.

    Args:
        embedding_dir: Directory to scan (not recursive).

    Returns:
        Dict mapping speaker label to the array stored in its file.

    Raises:
        FileNotFoundError: If ``embedding_dir`` does not exist.
    """
    embeddings = {}
    for file_name in sorted(os.listdir(embedding_dir)):
        speaker_label, extension = os.path.splitext(file_name)
        if extension != ".npy":
            continue
        embeddings[speaker_label] = np.load(os.path.join(embedding_dir, file_name))
    return embeddings

def segment_audio(signal, sample_rate, segment_duration=0.3):
    """Cut ``signal`` into back-to-back windows of ``segment_duration`` seconds.

    Returns a list of ``(start_seconds, end_seconds, window)`` tuples in
    order. Only full-length windows are returned; a trailing remainder
    shorter than one window is dropped.
    """
    window = int(segment_duration * sample_rate)
    last_full_start = len(signal) - window
    return [
        (offset / sample_rate, (offset + window) / sample_rate, signal[offset:offset + window])
        for offset in range(0, last_full_start + 1, window)
    ]
def diarize_and_transcribe(test_audio_path, embeddings, chunk_duration=3.0, transcribe_duration=75.0, similarity_threshold=0.35):
    """Transcribe an audio file and attribute every segment to a speaker.

    Steps:
      1. ffmpeg cuts the file into windows of ``transcribe_duration`` seconds,
         written as mono 16 kHz WAV.
      2. Each window is split by ``segment_audio`` into pieces of
         ``chunk_duration`` seconds.
      3. Each piece is passed to Whisper (``task='translate'``) and to the
         speaker encoder; the embedding is unit-normalised.
      4. The embeddings are clustered (agglomerative, average linkage,
         distance threshold 0.5).
      5. Each cluster is named after the enrolled speaker whose embedding has
         the highest cosine similarity to the cluster centroid, provided that
         similarity is at least ``similarity_threshold``; otherwise it is
         named ``unknown_speaker_<n>``.

    Args:
        test_audio_path: Path of the audio file to process.
        embeddings: Dict mapping enrolled speaker label to embedding vector.
        chunk_duration: Length in seconds of the pieces that are transcribed
            and embedded.
        transcribe_duration: Length in seconds of the ffmpeg windows.
        similarity_threshold: Minimum cosine similarity for a cluster to take
            an enrolled speaker's name.

    Returns:
        List of dicts sorted by ``start_time`` with keys ``start_time``,
        ``end_time``, ``speaker``, ``text``, ``similarity_score`` (best cosine
        similarity between the segment's cluster centroid and any enrolled
        speaker, or ``None`` when no speakers are enrolled) and
        ``all_similarity_scores`` (that similarity for every enrolled speaker).
        An empty list is returned when no full-length piece could be cut.
    """
    import math  # local import: math is not among this file's module-level imports

    sample_rate = 16000  # ffmpeg resamples every window to this rate below

    # The file is decoded here only to learn its duration; frames / rate gives
    # it directly, so no down-mix or resample of the whole recording is needed.
    full_signal, original_rate = torchaudio.load(test_audio_path)
    total_duration = full_signal.shape[-1] / original_rate
    del full_signal

    # ceil (rather than floor + 1) avoids asking ffmpeg for an extra, empty
    # window when the duration is an exact multiple of transcribe_duration.
    num_chunks = math.ceil(total_duration / transcribe_duration)

    whisper_model = whisper.load_model("medium")
    classifier = load_classifier_model()
    all_segments = []

    # All intermediate WAV files live in one temporary directory that is
    # removed even when processing fails part-way.
    with tempfile.TemporaryDirectory() as tmp_name:
        tmpdir = pathlib.Path(tmp_name)
        segment_path = tmpdir / "segment.wav"  # reused for every piece

        for i in tqdm(range(num_chunks), desc="Chunking"):
            chunk_start = i * transcribe_duration
            chunk_path = tmpdir / f"chunk_{i}.wav"

            stream = ffmpeg.input(test_audio_path, ss=chunk_start, t=transcribe_duration)
            stream = stream.output(str(chunk_path), format='wav', acodec='pcm_s16le', ac=1, ar='16k').overwrite_output()
            ffmpeg.run(stream, capture_stdout=True, capture_stderr=True)

            chunk_signal, _ = torchaudio.load(str(chunk_path))
            segments = segment_audio(chunk_signal.squeeze(0), sample_rate, segment_duration=chunk_duration)

            for seg_start, seg_end, segment in segments:
                torchaudio.save(str(segment_path), segment.unsqueeze(0), sample_rate=sample_rate)
                text = whisper_model.transcribe(str(segment_path), task='translate')['text']

                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    emb = classifier.encode_batch(segment.unsqueeze(0)).squeeze().detach().cpu().numpy()
                norm = np.linalg.norm(emb)
                if norm > 0:
                    emb = emb / norm

                all_segments.append({
                    "start_time": chunk_start + seg_start,
                    "end_time": chunk_start + seg_end,
                    "embedding": emb,
                    "text": text
                })

    if not all_segments:
        return []

    # === CLUSTERING STAGE ===
    all_embeddings = np.array([s["embedding"] for s in all_segments])
    if len(all_segments) == 1:
        # Agglomerative clustering needs at least two samples; a single
        # segment trivially forms one cluster.
        cluster_labels = np.zeros(1, dtype=int)
    else:
        clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5, linkage='average')
        cluster_labels = clustering.fit_predict(all_embeddings)

    # === LABELLING STAGE ===
    known_embeddings = {k: v / np.linalg.norm(v) for k, v in embeddings.items()}
    cluster_to_label = {}
    cluster_scores = {}
    unknown_count = 1

    for cluster_id in sorted(set(cluster_labels.tolist())):
        cluster_centroid = all_embeddings[cluster_labels == cluster_id].mean(axis=0)
        cluster_centroid = cluster_centroid / np.linalg.norm(cluster_centroid)

        # Cosine similarity of this cluster to every enrolled speaker.
        scores = {
            speaker: float(1 - cosine(cluster_centroid, known_emb))
            for speaker, known_emb in known_embeddings.items()
        }
        cluster_scores[cluster_id] = scores

        best_match = max(scores, key=scores.get) if scores else None
        if best_match is not None and scores[best_match] >= similarity_threshold:
            cluster_to_label[cluster_id] = best_match
        else:
            cluster_to_label[cluster_id] = f"unknown_speaker_{unknown_count}"
            unknown_count += 1

    # Attach the speaker label (and the scores behind it) to every segment.
    diarization_result = []
    for segment, cluster_id in zip(all_segments, cluster_labels.tolist()):
        scores = cluster_scores[cluster_id]
        diarization_result.append({
            "start_time": segment["start_time"],
            "end_time": segment["end_time"],
            "speaker": cluster_to_label[cluster_id],
            "text": segment["text"],
            "similarity_score": max(scores.values()) if scores else None,
            "all_similarity_scores": dict(scores)
        })

    return sorted(diarization_result, key=lambda x: x["start_time"])

def merge_consecutive_segments(diarization_result, max_gap=0.5):
    """Merge adjacent segments spoken by the same speaker.

    A segment is folded into the previous one when both have the same
    ``speaker`` and the gap between the previous ``end_time`` and this
    ``start_time`` is at most ``max_gap`` seconds; the merged entry takes the
    later ``end_time`` and the two texts joined by a space. All other keys
    keep the values of the first segment of the run.

    The input is not modified: each run starts from a copy of its first
    segment. (Previously the caller's dicts were extended in place.)

    Args:
        diarization_result: Segments (dicts with ``start_time``, ``end_time``,
            ``speaker``, ``text``) in chronological order.
        max_gap: Largest silence, in seconds, that still counts as the same turn.

    Returns:
        New list of merged segment dicts.
    """
    merged_result = []

    for segment in diarization_result:
        previous = merged_result[-1] if merged_result else None
        continues_turn = (
            previous is not None
            and previous["speaker"] == segment["speaker"]
            and segment["start_time"] - previous["end_time"] <= max_gap
        )
        if continues_turn:
            previous["end_time"] = segment["end_time"]
            previous["text"] += " " + segment["text"]
        else:
            # Copy so that later merges never write into the caller's dict.
            merged_result.append(dict(segment))

    return merged_result
def extract_text_from_docx(docx_file):
    """Return the text of a .docx file, one paragraph per line.

    ``docx_file`` is handed to ``Document``; the ``text`` of each paragraph is
    joined with newline characters.
    """
    paragraphs = Document(docx_file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def convert_numpy_types(obj):
    """Recursively replace NumPy values with plain Python equivalents.

    Intended for preparing nested results for ``json.dump``:
      * NumPy scalars become Python scalars (``.item()``).
      * NumPy arrays become (nested) lists (``.tolist()``).
      * Dicts are rebuilt with converted values; NumPy scalar keys are
        converted too, since ``json`` rejects them as keys.
      * Lists, and plain tuples, are rebuilt with converted elements
        (tuples stay tuples).
    Anything else, including tuple subclasses, is returned unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_numpy_types(element) for element in obj]
    if type(obj) is tuple:
        return tuple(convert_numpy_types(element) for element in obj)
    return obj

def generate_mom_with_gemini(diarization_json_path, reference_mom, present_speakers, all_speakers):
    """Have Gemini format the diarized conversation and return it as a .docx.

    Reads the diarization JSON (a list of entries with ``speaker`` and
    ``text``), builds a prompt containing the reference document text, the
    attendee and absentee lists and the conversation, sends it to the
    ``gemini-1.5-flash`` model, and writes the reply into a new Word document
    under the heading "Conversation Transcript".

    Args:
        diarization_json_path: Path of the diarization JSON file.
        reference_mom: Text of the reference document whose style is requested.
        present_speakers: Names listed as attendees.
        all_speakers: All known names; those not in ``present_speakers`` are
            listed as absentees.

    Returns:
        ``BytesIO`` holding the saved document, positioned at offset 0.
    """
    with open(diarization_json_path, "r") as f:
        segments = json.load(f)

    # One "speaker: text" line per diarized entry, in file order.
    full_text = "\n".join(f"{entry['speaker']}: {entry['text']}" for entry in segments)

    # Anyone known but not marked present is reported as absent.
    absent_speakers = [name for name in all_speakers if name not in present_speakers]

    # Bullet lists that are spliced into the prompt.
    attendees_list = "\n".join(f"- {name}" for name in present_speakers)
    absentees_list = "\n".join(f"- {name}" for name in absent_speakers)

    # Prompt text is sent to the model verbatim.
    prompt = f"""
    You are given a list of conversation segments with speaker labels and their spoken text. Your task is to generate a professionally structured *Transcript Document* that matches the exact formatting style of the provided reference transcript.

    ---

    ### Reference Document Style:
    {reference_mom}

    ---

    ### Formatting Instructions:
    1. **Heading and Date** – Include a clear heading for the transcript and today’s date at the top.
    2. **NOTICE Section** – Include a short formal notice introducing the nature of the transcript (e.g., "This transcript captures a conversation between the listed participants.").
    3. **AGENDA FOR THE CONVERSATION** – Present in the following order:
        - **List of Attendees:**
    {attendees_list}

        - **List of Absentees:**
    {absentees_list}

    4. **Conversation Format** – Follow these strictly:
        - Start each speaker turn with the speaker’s name followed by a colon (`Speaker:`)
        - The spoken dialogue must appear on the *same line* after the colon
        - Insert a blank line between each speaker turn
        - Keep speaker names exactly as provided (e.g., "Bot", "User", "Rashmi", etc.)
        - Do **not** merge, summarize, or omit any utterances
        - Do **not** include timestamps or any technical metadata
        - Do **not** correct grammar or punctuation — preserve the speaker’s original style

    ---

    ### Now generate the final transcript for the following conversation:
    {full_text}
    """

    reply = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
    transcript_text = reply.text

    # Wrap the model's reply in a Word document held in memory.
    transcript_doc = Document()
    transcript_doc.add_heading('Conversation Transcript', level=1)
    transcript_doc.add_paragraph(transcript_text)

    mom_buffer = BytesIO()
    transcript_doc.save(mom_buffer)
    mom_buffer.seek(0)  # rewind so the caller can read from the start
    return mom_buffer



# --- Page body: uploads, speaker selection, processing and download ---------
uploaded_mom = st.file_uploader("Upload a reference Transcript (Optional)", type=["docx"])
reference_mom = ""
if uploaded_mom:
    reference_mom = extract_text_from_docx(uploaded_mom)
uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

# Enrolled speaker embeddings: loaded once per run and used both for the
# sidebar selection and for the full speaker roster.
embeddings = load_embeddings("/content/embeddings4")
all_embeddings = embeddings
st.sidebar.header("🔊 Select Speakers for Matching")
selected_speakers = [
    speaker
    for speaker in sorted(embeddings.keys())
    if st.sidebar.checkbox(speaker, value=True)
]
selected_embeddings = {k: v for k, v in embeddings.items() if k in selected_speakers}

if uploaded_file:
    # Streamlit reruns this script on every interaction, including a click on
    # the download button. Results are therefore cached in session_state,
    # keyed by the upload, and the result widgets are rendered on every run;
    # only the expensive diarization is skipped when the same file was
    # already processed. A different upload triggers a fresh run.
    upload_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_upload") != upload_key:
        with st.spinner("Processing audio..."):
            # The client-supplied file name is untrusted: keep only its
            # extension and let tempfile choose the path.
            suffix = pathlib.Path(uploaded_file.name).suffix
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_audio:
                tmp_audio.write(uploaded_file.getvalue())
                temp_audio_path = tmp_audio.name
            try:
                diarization_result = diarize_and_transcribe(temp_audio_path, selected_embeddings, chunk_duration=5.0, transcribe_duration=75.0)
            finally:
                os.remove(temp_audio_path)
            merged_result = merge_consecutive_segments(diarization_result)

            output_json_path = "diarization_result4.json"
            with open(output_json_path, "w") as json_file:
                json.dump(merged_result, json_file, indent=4)

        st.session_state["processed_upload"] = upload_key
        st.session_state["processed_file"] = output_json_path
        # A new diarization invalidates any transcript generated earlier.
        st.session_state.pop("transcript_key", None)

    output_json_path = st.session_state["processed_file"]
    st.success("Diarization completed!")

    if reference_mom:
        # Regenerate the document only when one of its inputs changed, so a
        # rerun (e.g. after clicking download) does not call Gemini again.
        transcript_key = (upload_key, reference_mom, tuple(selected_speakers))
        if st.session_state.get("transcript_key") != transcript_key:
            all_speakers = sorted(all_embeddings.keys())
            mom_doc = generate_mom_with_gemini(output_json_path, reference_mom, selected_speakers, all_speakers)
            st.session_state["transcript_bytes"] = mom_doc.getvalue()
            st.session_state["transcript_key"] = transcript_key
        st.download_button("📥 Download the transcript", st.session_state["transcript_bytes"], file_name="conversation.docx")
    else:
        st.warning("⚠️ Please upload a reference MoM for better formatting.")



    # Load data and generate embeddings only once
# if __name__ == "__main__":
#     audio_path = "/content/drive/MyDrive/FYP_MoM-20250424T113508Z-001/FYP_MoM/Fyp_meeting.mp3"
#     output_json = "diarization_result.json"
#     # Load embeddings
#     embeddings = load_embeddings("/content/embeddings4")
#     diarization_result = diarize_and_transcribe(audio_path, embeddings, chunk_duration=5.0, transcribe_duration=75.0)
#     merged_result = merge_consecutive_segments(diarization_result)
#     save_diarization_result(merged_result, output_json)


Overwriting app.py


In [None]:
# Kill any running process whose command line matches "ngrok"
# (presumably a tunnel left over from an earlier run of this notebook).
!pkill -9 -f ngrok  # Use -9 for forceful termination if necessary

In [None]:
# Start app.py with Streamlit in the background (headless, port 8502);
# stdout and stderr are redirected to logs.txt.
!nohup streamlit run app.py --server.port 8502 --server.headless true > logs.txt 2>&1 &

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# The ngrok auth token is a credential and must not be stored in the notebook.
# It is taken from the NGROK_AUTH_TOKEN environment variable, or prompted for
# without echo when that is not set. Any token that was previously committed
# in this cell should be revoked.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

# Expose the Streamlit server (port 8502) through a public tunnel.
public_url = ngrok.connect(8502)
print(f"Streamlit app is live at: {public_url}")

Streamlit app is live at: NgrokTunnel: "https://1d174116550f.ngrok-free.app" -> "http://localhost:8502"


In [None]:
# Set the committer identity used by the commit at the end of this cell.
!git config --global user.email "priyank.naik2003@gmail.com"
!git config --global user.name "PBN272003"

# Clone the target repository and make the clone the working directory.
# NOTE: %cd changes the kernel's working directory, so it also affects any
# cell run after this one.
!git clone https://github.com/PBN272003/Propello_Transcription_Task.git
%cd Propello_Transcription_Task

# Copy the notebook file into the clone.
# NOTE(review): the recorded output of this cell shows the cp failing because
# /content/Propello_assign1.ipynb did not exist; the add, commit and push
# below then had nothing to work with. Confirm the notebook's path first.
!cp /content/Propello_assign1.ipynb .

# Stage, commit and push the notebook.
!git add Propello_assign1.ipynb
!git commit -m "Add notebook"
!git push

Cloning into 'Propello_Transcription_Task'...
/content/Propello_Transcription_Task
cp: cannot stat '/content/Propello_assign1.ipynb': No such file or directory
fatal: pathspec 'Propello_assign1.ipynb' did not match any files
On branch main

Initial commit

nothing to commit (create/copy files and use "git add" to track)
error: src refspec refs/heads/main does not match any
[31merror: failed to push some refs to 'https://github.com/PBN272003/Propello_Transcription_Task.git'
[m