# 1. Install Required Packages
This cell installs all the necessary Python packages for audio processing, speaker diarization, transcription, summarization, and document creation.  
It ensures your environment has everything needed for the pipeline to work.

In [None]:
!pip install torch librosa noisereduce soundfile yt-dlp pyannote.audio transformers faster-whisper python-docx nltk

^C


Collecting torch
  Downloading torch-2.7.0-cp311-cp311-win_amd64.whl.metadata (29 kB)
Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting yt-dlp
  Downloading yt_dlp-2025.5.22-py3-none-any.whl.metadata (174 kB)
Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\Sultan Khalid\\AppData\\Local\\Temp\\pip-unpack-vwn_xkst\\torch-2.7.0-cp311-cp311-win_amd64.whl'
Check the permissions.



In [12]:
!pip install voicefixer


Collecting voicefixer
  Downloading voicefixer-0.1.3-py3-none-any.whl.metadata (10 kB)
Collecting progressbar (from voicefixer)
  Downloading progressbar-2.5.tar.gz (10 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting torchlibrosa (from voicefixer)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Collecting GitPython (from voicefixer)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting streamlit>=1.12.0 (from voicefixer)
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit>=1.12.0->voicefixer)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit>=

# 2. Setup & Imports
Import all required libraries for audio processing, diarization, transcription, summarization, and document handling.  
Also, download the NLTK punkt tokenizer for sentence splitting.

In [4]:
import os
import torch
import librosa
from faster_whisper import WhisperModel
import noisereduce as nr
import soundfile as sf
import yt_dlp
from pyannote.audio import Pipeline
# from transformers import pipeline as hf_pipeline
import re
from transformers import pipeline,AutoTokenizer, AutoModelForSeq2SeqLM
from docx import Document
import subprocess
import nltk
import shutil
# Fetch the "punkt" tokenizer data; nltk reports it as up-to-date when already present.
# NOTE(review): no visible cell calls an nltk tokenizer — confirm this download is still needed.
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to C:\Users\Sultan
[nltk_data]     Khalid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 3. Configuration
Set up all configuration variables, including YouTube URL, file paths, device selection (CPU/GPU), and your HuggingFace token for model access.

In [5]:


# === Configuration ===
# Source video whose audio track is processed.
YOUTUBE_URL = "https://youtu.be/48-62pf9pVU?si=d03GtoJFEhekoXEU"

# Working files, resolved relative to the current working directory.
AUDIO_PATH = "audio.wav"
CLEAN_AUDIO_PATH = "audio_denoised.wav"
TRANSCRIPT_PATH = "transcript_whisper_large.txt"
# Base name only: summarize_transcript appends ".txt" and ".docx" to it.
SUMMARY_PATH = "summary"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the Hugging Face token from the HF_TOKEN environment variable so the
# secret never has to be pasted into (and saved with) the notebook. The
# original placeholder is kept as the fallback when the variable is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or "HF_TOKEN"


# 4. Extract Audio from YouTube
This function downloads audio from the specified YouTube URL and saves it as a WAV file using `yt-dlp` and `ffmpeg`.

In [None]:
def extract_audio(youtube_url, output_wav_path="audio.wav", ffmpeg_location=None):
    """Download a video's audio track with yt-dlp and save it as a WAV file.

    yt-dlp downloads the best available audio stream to
    ``downloaded_audio.<ext>`` in the current working directory, its
    ``FFmpegExtractAudio`` post-processor converts that file to WAV (with
    ``-ar 16000`` passed as a post-processor argument), and the result is
    moved to ``output_wav_path``.

    Args:
        youtube_url: URL handed to ``YoutubeDL.download``.
        output_wav_path: Destination of the final WAV file. An existing file
            at this path is overwritten.
        ffmpeg_location: Optional path to the ffmpeg binary or its directory.
            When omitted, the ``FFMPEG_LOCATION`` environment variable is used
            if set; otherwise the option is not passed to yt-dlp at all.

    Raises:
        FileNotFoundError: If ``downloaded_audio.wav`` does not exist after
            the download and post-processing step.
    """
    intermediate_wav = "downloaded_audio.wav"

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'downloaded_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'postprocessor_args': ['-ar', '16000'],
        'prefer_ffmpeg': True,
    }

    # Only set ffmpeg_location when a real value is available. Passing a
    # placeholder path that does not exist would override yt-dlp's own ffmpeg
    # lookup and make the WAV conversion fail.
    if ffmpeg_location is None:
        ffmpeg_location = os.environ.get("FFMPEG_LOCATION")
    if ffmpeg_location:
        ydl_opts['ffmpeg_location'] = ffmpeg_location

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    if not os.path.exists(intermediate_wav):
        raise FileNotFoundError("FFmpeg postprocessing failed to create WAV file.")

    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when output_wav_path already exists (e.g.
    # when this cell is run a second time).
    os.replace(intermediate_wav, output_wav_path)

# 5. Denoise Audio
This function uses the `noisereduce` library to remove background noise from the downloaded audio, improving transcription quality.

In [1]:
def denoise_audio(input_path, output_path):
    """Write a noise-reduced copy of an audio file.

    The input is loaded through librosa with a 16 kHz sample rate, passed to
    noisereduce's ``reduce_noise``, and the result is saved with soundfile at
    the same sample rate.

    Args:
        input_path: Path of the audio file to clean.
        output_path: Path the cleaned audio is written to.
    """
    target_rate = 16000
    samples, sample_rate = librosa.load(input_path, sr=target_rate)
    cleaned = nr.reduce_noise(y=samples, sr=sample_rate)
    sf.write(output_path, cleaned, sample_rate)


# 6. Speaker Diarization
This function uses the `pyannote.audio` pipeline to identify and segment different speakers in the audio file.

In [5]:
def run_diarization(audio_path, hf_token):
    """Run the pyannote speaker-diarization pipeline on an audio file.

    Args:
        audio_path: Path of the audio file to diarize.
        hf_token: Hugging Face access token used to load
            ``pyannote/speaker-diarization-3.1``.

    Returns:
        Whatever the pyannote pipeline returns for the file; other cells
        iterate it with ``itertracks(yield_label=True)``.

    Raises:
        RuntimeError: If the pretrained pipeline could not be loaded.
    """
    diarizer = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token
    )
    # NOTE(review): pyannote.audio 3.x is understood to print a hint and return
    # None (rather than raise) when a gated model cannot be downloaded — confirm
    # against the installed version. Without this guard that case surfaces as an
    # unhelpful AttributeError on the next line.
    if diarizer is None:
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'. Check that the "
            "Hugging Face token is valid and that the model's user conditions "
            "have been accepted."
        )
    diarizer.to(DEVICE)
    return diarizer({'audio': audio_path})

# 7. Transcription with Arabic Whisper
This function uses the `faster-whisper` model to transcribe the denoised audio into Arabic text, segmenting by time.

In [6]:
def transcribe_audio(audio_path, model_size="medium", device="cpu",
                     compute_type="int8", language="ar", beam_size=5):
    """Transcribe an audio file with faster-whisper.

    The defaults reproduce the original hard-coded setup (``medium`` model on
    CPU with int8 compute, Arabic, beam size 5); pass other values to use a
    different model, device or language.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Model name handed to ``WhisperModel``.
        device: Device string handed to ``WhisperModel``.
        compute_type: Compute type handed to ``WhisperModel``.
        language: Language code handed to ``model.transcribe``.
        beam_size: Beam size handed to ``model.transcribe``.

    Returns:
        dict with ``"text"`` (all segment texts joined by single spaces) and
        ``"segments"`` (list of dicts with ``"start"``, ``"end"``, ``"text"``).
    """
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    segments, _info = model.transcribe(
        audio_path,
        language=language,
        beam_size=beam_size,
        vad_filter=True
    )

    # Iterating `segments` is what collects the results; each text is stripped
    # once here and reused for the joined full text.
    all_segments = [
        {"start": segment.start, "end": segment.end, "text": segment.text.strip()}
        for segment in segments
    ]
    full_text = " ".join(item["text"] for item in all_segments).strip()

    return {"text": full_text, "segments": all_segments}

# 8. Save Transcript with Diarization
This function saves the transcript to a text file, including speaker labels and timestamps for each segment.

In [7]:
def _speaker_for_segment(turns, start, end):
    """Return the speaker label for the segment [start, end], or "Unknown"."""
    # Primary rule: the first turn that contains the segment's start time.
    for turn_start, turn_end, label in turns:
        if turn_start <= start <= turn_end:
            return label

    # Fallback: the segment starts in a gap between turns, so pick the turn it
    # overlaps the most instead of giving up immediately.
    best_label, best_overlap = "Unknown", 0.0
    for turn_start, turn_end, label in turns:
        overlap = min(end, turn_end) - max(start, turn_start)
        if overlap > best_overlap:
            best_label, best_overlap = label, overlap
    return best_label


def save_diarized_transcript(transcript, diarization_result, output_path):
    """Write the transcript to a text file with a speaker label per segment.

    Each output line has the form ``[start - end] speaker: text`` with times
    formatted to two decimals.

    Args:
        transcript: dict with a ``"segments"`` list whose items have
            ``"start"``, ``"end"`` and ``"text"`` keys.
        diarization_result: Object exposing ``itertracks(yield_label=True)``
            that yields ``(turn, _, label)`` where ``turn`` has ``start`` and
            ``end`` attributes.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    # Materialize the speaker turns once instead of re-iterating the
    # diarization result for every transcript segment.
    turns = [
        (turn.start, turn.end, label)
        for turn, _, label in diarization_result.itertracks(yield_label=True)
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        for segment in transcript['segments']:
            start, end, text = segment['start'], segment['end'], segment['text']
            speaker = _speaker_for_segment(turns, start, end)
            f.write(f"[{start:.2f} - {end:.2f}] {speaker}: {text.strip()}\n")

# 9. Summarize Transcript and Save as TXT & DOCX
This function summarizes the transcript using an Arabic summarization model, then saves the summary as both a `.txt` and `.docx` file.

In [2]:
def summarize_transcript(input_path, output_path_base):
    """Summarize a saved transcript and write the result as .txt and .docx.

    The transcript file is read, the ``[start - end] SPEAKER_xx:`` prefixes are
    stripped, and three generations (introduction, key points, conclusion) are
    produced with the ``moussaKam/AraBART`` checkpoint. The results are written
    to ``output_path_base + ".txt"`` and ``output_path_base + ".docx"``.

    Each prompt is truncated to 1024 tokens before generation, so for long
    transcripts only the beginning of the text reaches the model.

    NOTE(review): confirm that this checkpoint is suited to instruction-style
    summarization prompts; nothing in this notebook establishes that.

    Any exception (including an empty transcript) is caught and reported with
    a printed message; nothing is raised to the caller.

    Args:
        input_path: Path of the UTF-8 transcript text file.
        output_path_base: Output path without extension.
    """
    try:
        # Read and validate the input first so an unusable transcript is
        # reported before the model is loaded.
        with open(input_path, "r", encoding="utf-8") as f:
            transcript = f.read().strip()

        # 🧹 Clean transcript
        cleaned_transcript = re.sub(r"\[\d+\.\d+\s*-\s*\d+\.\d+\]\s*(SPEAKER_\d+|Unknown):", "", transcript)
        cleaned_transcript = re.sub(r"\s{2,}", " ", cleaned_transcript).strip()

        # Without any text the model would only see the prompt itself and the
        # output files would contain meaningless summaries.
        if not cleaned_transcript:
            raise ValueError(f"Transcript is empty: {input_path}")

        model_name = "moussaKam/AraBART"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(DEVICE)

        def generate_summary(prompt_text, max_len=100):
            """Generate one summary string for prompt_text (beam search, 4 beams)."""
            input_ids = tokenizer.encode(prompt_text, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
            summary_ids = model.generate(
                input_ids,
                max_length=max_len,
                min_length=30,
                num_beams=4,
                length_penalty=1.5,
                no_repeat_ngram_size=3,
                early_stopping=True
            )
            # A single prompt is encoded, so the first row is the only sequence.
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print("✍️ Generating concise مقدمة...")
        intro_prompt = f"اكتب مقدمة مختصرة للنص التالي:\n{cleaned_transcript}"
        intro_summary = generate_summary(intro_prompt, max_len=80)

        print("✍️ Generating concise نقاط رئيسية...")
        bullet_prompt = f"استخرج أهم النقاط باختصار وبجمل قصيرة:\n{cleaned_transcript}"
        bullet_summary = generate_summary(bullet_prompt, max_len=120)

        print("✍️ Generating concise خاتمة...")
        conclusion_prompt = f"اكتب خلاصة مختصرة للنص:\n{cleaned_transcript}"
        conclusion_summary = generate_summary(conclusion_prompt, max_len=80)

        # ✏️ Save as .txt
        txt_path = output_path_base + ".txt"
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("### ملخص الفيديو\n\n")
            f.write("🟢 مقدمة:\n" + intro_summary + "\n\n")
            f.write("📌 نقاط رئيسية:\n" + bullet_summary + "\n\n")
            f.write("🔚 خاتمة:\n" + conclusion_summary + "\n")

        # 📄 Save as .docx
        doc_path = output_path_base + ".docx"
        doc = Document()
        doc.add_heading("ملخص الفيديو", level=1)

        doc.add_heading("مقدمة", level=2)
        doc.add_paragraph(intro_summary)

        doc.add_heading("نقاط رئيسية", level=2)
        # Split on the Arabic comma and drop empty pieces so leading, trailing
        # or doubled commas do not produce blank bullet items.
        bullet_points = [piece.strip() for piece in bullet_summary.split("،")]
        for point in bullet_points:
            if point:
                doc.add_paragraph(point, style='List Bullet')

        doc.add_heading("خاتمة", level=2)
        doc.add_paragraph(conclusion_summary)

        doc.save(doc_path)

        print(f"✅ Concise summary saved to: {txt_path}")
        print(f"✅ Concise summary saved to: {doc_path}")

    except Exception as e:
        # Deliberate best-effort: report the failure instead of aborting the notebook run.
        print(f"⚠️ Error during summarization: {e}")



# 10–15. Pipeline Execution: From Extraction to Summarization

These cells execute the full speech-to-text and summarization pipeline step by step:

1. **Extract Audio:**  
   Downloads and saves the audio from the specified YouTube video as a WAV file.

2. **Denoise Audio:**  
   Cleans the downloaded audio by removing background noise, improving transcription quality.

3. **Speaker Diarization:**  
   Identifies and segments different speakers in the denoised audio using a diarization model.

4. **Transcription:**  
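   Transcribes the denoised audio into Arabic text using the faster-whisper model. This step runs on the same denoised file as diarization, not on the diarization output; the two results are combined in the next step.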

5. **Save Diarized Transcript:**  
   Saves the transcript to a text file, including speaker labels and timestamps for each segment.

6. **Summarize and Save:**  
   Summarizes the transcript using an Arabic summarization model, then saves the summary as both a `.txt` and `.docx` file for easy sharing and reading.

> **Tip:**  
> Run these cells in order to process your audio and generate both transcript and summary files automatically.

In [23]:
# Download the audio of YOUTUBE_URL and store it as a WAV file at AUDIO_PATH.
extract_audio(YOUTUBE_URL, AUDIO_PATH)


[youtube] Extracting URL: https://youtu.be/48-62pf9pVU?si=d03GtoJFEhekoXEU
[youtube] 48-62pf9pVU: Downloading webpage
[youtube] 48-62pf9pVU: Downloading tv client config
[youtube] 48-62pf9pVU: Downloading tv player API JSON
[youtube] 48-62pf9pVU: Downloading ios player API JSON
[youtube] 48-62pf9pVU: Downloading m3u8 information
[info] 48-62pf9pVU: Downloading 1 format(s): 251
[download] downloaded_audio.webm has already been downloaded
[download] 100% of   57.45MiB
[ExtractAudio] Destination: downloaded_audio.wav
Deleting original file downloaded_audio.webm (pass -k to keep)


In [14]:
# Write a noise-reduced copy of AUDIO_PATH to CLEAN_AUDIO_PATH.
denoise_audio(AUDIO_PATH, CLEAN_AUDIO_PATH)


✅ Voice enhanced audio saved to: audio_denoised.wav


In [12]:
# Speaker turns for the denoised audio; passed to save_diarized_transcript below.
diarization = run_diarization(CLEAN_AUDIO_PATH, HF_TOKEN)


  if ismodule(module) and hasattr(module, '__file__'):
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


In [9]:
# Transcript of the denoised audio: dict with the full "text" and timed "segments".
transcript = transcribe_audio(CLEAN_AUDIO_PATH)


In [13]:
# Combine transcript segments with speaker labels and write them to TRANSCRIPT_PATH.
save_diarized_transcript(transcript, diarization, TRANSCRIPT_PATH)


In [14]:
# Summarize the saved transcript into SUMMARY_PATH + ".txt" and SUMMARY_PATH + ".docx".
summarize_transcript(TRANSCRIPT_PATH, SUMMARY_PATH)


✍️ Generating concise مقدمة...
✍️ Generating concise نقاط رئيسية...
✍️ Generating concise خاتمة...
✅ Concise summary saved to: summary.txt
✅ Concise summary saved to: summary.docx
