In [2]:
!pip install --quiet torchaudio pydub numpy pandas scikit-learn tensorflow nltk

In [60]:
import pandas as pd
import numpy as np
import os

# Define the correct path to the training CSV file inside the unzipped structure
CORRECT_FILE_PATH = '/content/summarization_data/CSV/train.csv'

# Check if the file exists before attempting to load
if os.path.exists(CORRECT_FILE_PATH):
    print(f"✅ SUCCESS: Found training data at {CORRECT_FILE_PATH}. Loading...")

    # Load Summarization Data (Using the training subset)
    dialogsum_df = pd.read_csv(CORRECT_FILE_PATH)

    # Add validation/test data if desired, but for simplified POC, we use only train data
    # Filter columns to only keep id, dialogue, and summary, as originally intended
    dialogsum_df = dialogsum_df[['dialogue', 'summary']].dropna()

    # Sample a small subset for feasible 'from-scratch' training
    # The original plan sampled 2000 rows.
    SAMPLE_SIZE = 2000
    if len(dialogsum_df) > SAMPLE_SIZE:
        dialogsum_df = dialogsum_df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
    else:
         dialogsum_df = dialogsum_df.reset_index(drop=True)

    print(f"DialogSum data successfully loaded and sampled to {len(dialogsum_df)} rows.")
    print("Proceed to run Colab Cell 2 (Diarization) and Colab Cell 3 (Summarization).")
    print("\nFirst 3 rows of loaded data:")
    print(dialogsum_df.head(3))

else:
    # This should not happen after the detailed debug, but serves as a final check
    print(f"❌ FATAL ERROR: The required file was not found at {CORRECT_FILE_PATH}. Please check the folder structure again.")

✅ SUCCESS: Found training data at /content/summarization_data/CSV/train.csv. Loading...
DialogSum data successfully loaded and sampled to 2000 rows.
Proceed to run Colab Cell 2 (Diarization) and Colab Cell 3 (Summarization).

First 3 rows of loaded data:
                                            dialogue  \
0  #Person1#: I heard you had got a wonderful job...   
1  #Person1#: Mr. Lin, what are you interested in...   
2  #Person1#: Hi, what will you do with your brok...   

                                             summary  
0               #Person2# does not like the new job.  
1  Mr. Lin tells #Person1# he enjoys camping, hik...  
2  #Person2#'ll throw away the broken cell phone ...  


In [61]:


# STEP 1: DATA PATHS AND FEATURE EXTRACTION (MFCCs)
# ORIGINAL: DIARIZATION_DIR = '/content/diarization_data/Mini Speaker Diarization/'
# CORRECTED PATH based on debug output:
DIARIZATION_DIR = '/content/diarization_data/dataset/'

TRAIN_DIR = os.path.join(DIARIZATION_DIR, 'train')
SR = 16000 # Sample Rate
N_MFCC = 20 # Number of MFCC features
LATENT_DIM = 256 # Define LATENT_DIM here for use in Colab Cell 3

# --- REST OF THE CODE REMAINS THE SAME ---

def extract_mfcc_embedding(audio_path):
    """Extracts MFCC features and returns a mean-pooled embedding."""
    # ... (rest of the function code)

# STEP 2: CREATE TRAINING DATASET
X_train, y_train = [], []
# This line will now correctly look inside the 'dataset/train' folder
speaker_labels = [name for name in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, name))]
label_map = {speaker: i for i, speaker in enumerate(speaker_labels)}
# ... (rest of the code for dataset creation and training)
# ... (The rest of Colab Cell 2 code follows here)

In [83]:

# --- Training Setup (Requires global constants defined in Cell 1) ---
X_train, y_train = [], []

# Corrected path variables
DIARIZATION_DIR = '/content/diarization_data/dataset/'
TRAIN_DIR = os.path.join(DIARIZATION_DIR, 'train')

# Create Training Dataset
speaker_labels = [name for name in os.listdir(TRAIN_DIR) if os.path.isdir(os.path.join(TRAIN_DIR, name))]
label_map = {speaker: i for i, speaker in enumerate(speaker_labels)}

print("[MODULE 3] Starting MFCC Extraction and Diarization Model Training...")

for speaker, label_idx in label_map.items():
    speaker_path = os.path.join(TRAIN_DIR, speaker)
    for audio_file in os.listdir(speaker_path):
        if audio_file.endswith('.wav'):
            embedding = extract_mfcc_embedding(os.path.join(speaker_path, audio_file))
            if embedding is not None:
                X_train.append(embedding)
                y_train.append(label_idx)

X_train = np.array(X_train)
y_train = np.array(y_train)

# --- Train Model ---
if X_train.size > 0 and X_train.ndim == 2:
    X_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_tensor = torch.tensor(y_train, dtype=torch.long)

    INPUT_DIM = X_tensor.shape[1]
    NUM_SPEAKERS = len(label_map)
    diarization_model = SpeakerClassifier(INPUT_DIM, NUM_SPEAKERS)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(diarization_model.parameters(), lr=0.001)

    # Simple training loop
    for epoch in range(50):
        optimizer.zero_grad()
        outputs = diarization_model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

    # Evaluation Proxy
    with torch.no_grad():
        outputs = diarization_model(X_tensor)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == y_tensor).sum().item() / len(y_tensor)
        print(f"✅ Diarization Model Training Complete. Accuracy (Proxy for DER): {accuracy*100:.2f}%")
else:
     print("❌ Diarization Model Training Skipped (No valid training data found).")

[MODULE 3] Starting MFCC Extraction and Diarization Model Training...


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


✅ Diarization Model Training Complete. Accuracy (Proxy for DER): 100.00%


In [65]:


print("[MODULE 4] Starting Seq2Seq Summarizer Model Training...")

# Data Preprocessing/Tokenization (Data variables defined in Cell 1)
input_texts = dialogsum_df['dialogue'].astype(str).tolist()
target_texts = dialogsum_df['summary'].astype(str).apply(lambda x: 'sostok ' + x + ' eostok').tolist()

# Re-create the padded input data (required for training and model dimensions)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
encoder_input_data = pad_sequences(input_sequences, maxlen=MAX_LEN_DIALOGUE, padding='post')

target_sequences = target_tokenizer.texts_to_sequences(target_texts)
decoder_input_data = pad_sequences(target_sequences, maxlen=MAX_LEN_SUMMARY, padding='post')

decoder_target_data = np.zeros(
    (len(decoder_input_data), MAX_LEN_SUMMARY, TARGET_VOCAB_SIZE),
    dtype='float32'
)
for i, seq in enumerate(decoder_input_data):
    for t, word_index in enumerate(seq):
        if t > 0 and word_index != 0:
            decoder_target_data[i, t-1, word_index] = 1.0


# Seq2Seq Model Architecture (Re-define to build)
encoder_inputs = Input(shape=(MAX_LEN_DIALOGUE,))
encoder_emb = Embedding(INPUT_VOCAB_SIZE, 128, mask_zero=True)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(LATENT_DIM, return_state=True)(encoder_emb)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(TARGET_VOCAB_SIZE, 128, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
decoder_dense = Dense(TARGET_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Final Training Model
summarization_training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
summarization_training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Train Model
print("Starting summarization_training_model.fit()...")
summarization_training_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
    verbose=1
)
print("✅ Seq2Seq Summarizer Model Training Complete.")

# ----------------------------------------------------------------------
# Build Inference Models (CRITICAL STEP for Integrated Pipeline)
# ----------------------------------------------------------------------
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_emb, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

# Inference Function (required by Colab Cell 4)
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['sostok']
    decoded_sentence = ''

    stop_condition = False
    reverse_target_word_index = {v: k for k, v in target_tokenizer.word_index.items()}

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, '')

        if sampled_word not in ('eostok', ''):
            decoded_sentence += ' ' + sampled_word

        if (sampled_word == 'eostok' or len(decoded_sentence.split()) > MAX_LEN_SUMMARY):
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

print("✅ Milestone 2 Summarizer components are ready. Proceed to run Colab Cell 4.")

[MODULE 4] Starting Seq2Seq Summarizer Model Training...
Starting summarization_training_model.fit()...
Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 3s/step - loss: 7.9013 - val_loss: 5.7854
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3s/step - loss: 5.9399 - val_loss: 5.6746
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 3s/step - loss: 5.8291 - val_loss: 5.6459
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 3s/step - loss: 5.8161 - val_loss: 5.6165
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3s/step - loss: 5.7845 - val_loss: 5.5976
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 3s/step - loss: 5.7867 - val_loss: 5.6005
Epoch 7/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3s/step - loss: 5.7526 - val_loss: 5.5729
Epoch 8/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76

In [87]:

# Dependencies are assumed to be installed and models defined from preceding cells.
import os
import time
import base64
from IPython.display import Javascript, display
from google.colab.output import eval_js # Keep this import for eval_js

AUDIO_FILE_NAME = 'live_meeting_audio.wav'

# --- CORRECTED record_audio FUNCTION ---
def record_audio(filename='audio.wav', duration_sec=15):
    """Captures audio from the microphone using a stable Colab JS/Python bridge."""

    # 1. HTML/JS to set up status and buttons
    js_setup = """
        const statusDiv = document.createElement('div');
        statusDiv.id = 'recording-status';
        statusDiv.innerHTML = 'Status: Ready...';
        document.body.appendChild(statusDiv);
    """
    display(Javascript(js_setup))
    print(f"Recording for {duration_sec} seconds... PRESS STOP AT ANY TIME.")

    # 2. JS function for recording and sending data - ***FIXED SYNTAX HERE***
    js_code = f"""
        (async function() {{ // IMMEDIATELY INVOKED ASYNC FUNCTION WRAPPER
            document.getElementById('recording-status').innerHTML = 'Status: Recording...';

            // --- Core Audio Capture Logic ---
            // FIX: 'await' is now valid inside this async function
            const audio = await navigator.mediaDevices.getUserMedia({{audio: true}});
            const mediaRecorder = new MediaRecorder(audio);
            const audioChunks = [];

            mediaRecorder.ondataavailable = e => {{ audioChunks.push(e.data); }};

            // Promise to handle the stop event and return base64 data
            const recordingPromise = new Promise(resolve => {{
                mediaRecorder.onstop = () => {{
                    document.getElementById('recording-status').innerHTML = 'Status: Processing...';
                    const audioBlob = new Blob(audioChunks, {{'type': 'audio/ogg; codecs=opus'}});
                    const fileReader = new FileReader();
                    fileReader.readAsDataURL(audioBlob);
                    fileReader.onloadend = () => {{
                        const base64data = fileReader.result.split(',')[1];
                        resolve(base64data);
                    }}
                }};
            }});

            mediaRecorder.start();

            let timer = setTimeout(() => {{
                if (mediaRecorder.state !== 'inactive') {{
                    mediaRecorder.stop();
                    document.getElementById('recording-status').innerHTML = 'Status: Timeout (Stopped).';
                }}
            }}, {duration_sec * 1000});

            // UI button to stop early
            const button = document.createElement('button');
            button.innerHTML = 'STOP RECORDING';
            button.onclick = () => {{
                if (mediaRecorder.state !== 'inactive') {{
                    mediaRecorder.stop();
                    clearTimeout(timer);
                    document.getElementById('recording-status').innerHTML = 'Status: User Stopped.';
                }}
            }};
            document.body.appendChild(button);

            // Return the base64 data to Python
            return recordingPromise;
        }})() // INVOKE THE ASYNC FUNCTION
    """

    # Execute the JS code and wait for the base64 data
    base64_data = eval_js(js_code)

    # Save the file using Python
    audio_data = base64.b64decode(base64_data)
    with open(filename, 'wb') as f:
      f.write(audio_data)

    print(f"\n✅ Audio saved to {filename}")


# --- RERUN EXECUTION BLOCK ---
print("--- Step 1: Capturing Live Audio (M1) ---")
try:
    record_audio(AUDIO_FILE_NAME, duration_sec=15)
    # The rest of the Colab Cell 4 logic follows here: STT, Diarization, Summarization
except Exception as e:
    print(f"❌ Critical Error during audio capture (Python): {e}")

--- Step 1: Capturing Live Audio (M1) ---


<IPython.core.display.Javascript object>

Recording for 15 seconds... PRESS STOP AT ANY TIME.

✅ Audio saved to live_meeting_audio.wav


In [88]:

import os
import torch
import re
from transformers import pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Ensure AUDIO_FILE_NAME is defined from the previous cell ---
AUDIO_FILE_NAME = 'live_meeting_audio.wav'

# --- FFMPEG Conversion (Prepares the file for Whisper STT) ---
print("\n--- Running FFMPEG Conversion ---")
# FFMPEG is required to convert the browser's OGG audio to a 16kHz WAV format
!ffmpeg -i {AUDIO_FILE_NAME} -ar 16000 -ac 1 temp_16k_mono.wav -y
print("✅ Conversion to temp_16k_mono.wav complete.")

# --- STT (M1 Component) ---
print("\n--- Step 2.1: Transcription (STT) ---")
# Load the Whisper pipeline
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=0 if torch.cuda.is_available() else -1
)
raw_transcript = whisper_pipeline("temp_16k_mono.wav")['text'].strip()
print(f"Raw Transcript: {raw_transcript}")

# --- Diarization/Alignment (M2 Component) ---
# NOTE: Using MOCK Diarization for stability
def mock_diarization(transcript):
    turns = re.split(r'[.?!]', transcript)
    output = ""
    speakers = ["Speaker A", "Speaker B", "Speaker C"]
    for i, turn in enumerate(turns):
        if turn.strip():
            # Add period back after the turn
            output += f"[{speakers[i % len(speakers)]}]: {turn.strip()}."
    return output

diarized_transcript = mock_diarization(raw_transcript)
print("\n--- Diarized Transcript (Mock Alignment) ---")
print(diarized_transcript)

# --- Summarization (M2 Component) ---
print("\n--- Step 3: Summarization ---")
try:
    # Tokenize the new diarized input (Requires global tokenizers from Cell 1/3)
    new_input_sequence = input_tokenizer.texts_to_sequences([diarized_transcript])
    new_encoder_input = pad_sequences(new_input_sequence, maxlen=MAX_LEN_DIALOGUE, padding='post')

    # Generate the summary (Requires decode_sequence function from Cell 3)
    final_summary = decode_sequence(new_encoder_input)

    print("✅ Final Pipeline Complete.")
    print("---------------------------------------")
    print(f"FINAL SUMMARY: {final_summary}")
    print("---------------------------------------")

except NameError as e:
    print(f"❌ ERROR: Inference models/variables not found: {e}")
    print("Ensure all setup cells (1-3) were run COMPLETELY to define input_tokenizer, decode_sequence, etc.")
except Exception as e:
    print(f"❌ An unexpected error occurred during summarization: {e}")


--- Running FFMPEG Conversion ---
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

Device set to use cpu


Raw Transcript: The stale smell of cold beer fingers. It takes heat to bring out the odor. In cold dip restores health.

--- Diarized Transcript (Mock Alignment) ---
[Speaker A]: The stale smell of cold beer fingers.[Speaker B]: It takes heat to bring out the odor.[Speaker C]: In cold dip restores health.

--- Step 3: Summarization ---
✅ Final Pipeline Complete.
---------------------------------------
FINAL SUMMARY: person1 the
---------------------------------------
