# 🧉 Transcribir videos de YouTube e identificar diferentes agentes en la conversación
- Inspirado por **machinelearnear** 📺 https://www.youtube.com/c/machinelearnear

* Es requisito montar Google Drive para usarlo como storage persistente.  
Esta es la carpeta compartida: ['preguntale-al-candidato'](https://drive.google.com/drive/folders/1HKcNUU_Ws8VJnlg5O4r8WUrbuHwu9P84?usp=sharing)

### instalar dependencias

In [None]:
!pip install git+https://github.com/m-bain/whisperx.git;
!python3 -m pip install -U yt-dlp;

Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /tmp/pip-req-build-ebevv07j
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-ebevv07j
  Resolved https://github.com/m-bain/whisperx.git to commit ef965a03edd42f4c61b41e0d46d35567555d3539
  Preparing metadata (setup.py) ... done
Collecting pyannote.audio@ git+https://github.com/pyannote/pyannote-audio@11b56a137a578db9335efc00298f6ec1932e6317 (from whisperx==3.1.1)
  Using cached pyannote.audio-2.1.1-py2.py3-none-any.whl


---
## Montar Google Drive (Compartido)

In [None]:
# Mount Google Drive inside the Colab runtime so the shared project folder can
# be used as persistent storage. force_remount=True is passed so that,
# presumably, an already-mounted drive is refreshed -- confirm against the
# google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Root of the shared project folder, used by later cells to build file paths.
# NOTE(review): this is a relative path, so it presumably relies on the
# notebook's working directory being /content -- confirm.
project_path = "drive/MyDrive/preguntale-al-candidato"

Mounted at /content/drive


---
### **Definir Token y variables**

In [None]:
# Hugging Face access token (create one at https://huggingface.co/settings/tokens).
# The file is read inside a `with` block so the handle is closed, and the value
# is stripped because a trailing newline in the credentials file would
# otherwise be passed verbatim on the whisperx command line (--hf_token).
with open(f"{project_path}/.creds/HF_TOKEN") as token_file:
    hf_token = token_file.read().strip()


In [None]:
%load_ext sql
%reload_ext sql
%sql sqlite:///drive/MyDrive/preguntale-al-candidato/sqlite/pal-db

In [None]:
%%sql
SELECT * FROM candidate_videos

 * sqlite:///drive/MyDrive/preguntale-al-candidato/sqlite/pal-db
Done.


url,candidate,added_to_list_ts,status,status_ts
https://www.youtube.com/watch?v=MZEnIUQGuDg&ab_channel=ElPelucaMilei,Milei,2023-08-25 22:22:33,PROCESSED,2023-08-26 14:25:48
https://www.youtube.com/watch?v=0q2-jGjpSk4,Milei,2023-08-26 07:00:03,PROCESSED,2023-08-26 14:25:48
https://www.youtube.com/watch?v=R1Y_5xH7-ks,Milei,2023-08-26 07:00:03,Not processed,2023-08-26 14:25:48


---
### Add links to DB (Only add new ones)

In [None]:
# Rows to upsert into `candidate_videos`, one tuple per video in column order
# (url, candidate, added_to_list_ts, status, status_ts).
from datetime import datetime

# Take the timestamp once so every row of this batch -- and both timestamp
# columns within a row -- carry exactly the same value.
now_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

new_video_urls = [
    'https://www.youtube.com/watch?v=0q2-jGjpSk4',
    'https://www.youtube.com/watch?v=R1Y_5xH7-ks',
    'https://www.youtube.com/watch?v=MZEnIUQGuDg&ab_channel=ElPelucaMilei',
]

new_data = [
    (video_url, 'Milei', now_ts, 'Not processed', now_ts)
    for video_url in new_video_urls
]

In [None]:
import sqlite3
from contextlib import closing

# Upsert the rows in `new_data` into `candidate_videos`.
# On a URL that already exists, only `status` and `status_ts` are refreshed,
# and only when the stored row is still 'Not processed'; rows in any other
# state are left untouched.
#
# `closing(...)` guarantees the connection is closed even if the statement
# fails; the inner `with connection:` commits on success and rolls back on an
# exception (it does not close the connection by itself).
with closing(sqlite3.connect('drive/MyDrive/preguntale-al-candidato/sqlite/pal-db')) as connection:  # file path
    with connection:
        cur = connection.cursor()
        cur.executemany("""
INSERT INTO candidate_videos VALUES (?,?,?,?,?)
ON CONFLICT (url) DO UPDATE SET status=excluded.status, status_ts=excluded.status_ts
WHERE status = 'Not processed';
""", new_data)

# Reported only after the transaction has actually been committed.
print('Command executed successfully!!!')

Command executed successfully!!!


In [None]:
%%sql
SELECT * FROM candidate_videos

 * sqlite:///drive/MyDrive/preguntale-al-candidato/sqlite/pal-db
Done.


url,candidate,added_to_list_ts,status,status_ts
https://www.youtube.com/watch?v=MZEnIUQGuDg&ab_channel=ElPelucaMilei,Milei,2023-08-25 22:22:33,PROCESSED,2023-08-26 14:25:48
https://www.youtube.com/watch?v=0q2-jGjpSk4,Milei,2023-08-26 07:00:03,PROCESSED,2023-08-26 14:25:48
https://www.youtube.com/watch?v=R1Y_5xH7-ks,Milei,2023-08-26 07:00:03,Not processed,2023-08-26 14:27:23


----
----

### Utils

In [None]:
###########                    UTILS                ###########################

import re
import subprocess
import json

def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*, or ``None``.

    The ID is the first run of exactly 11 characters from ``[a-zA-Z0-9_-]``
    that follows either ``v=`` (``watch?v=<id>``) or a ``/``
    (``youtu.be/<id>``, ``/shorts/<id>``, ...).

    The run must not be followed by another ID character, so longer path
    segments such as ``/c/machinelearnear`` are not truncated into a bogus ID.

    Args:
        url: The video URL. ``None`` or an empty string yields ``None``.

    Returns:
        The video ID as a string, or ``None`` when no ID is found.
    """
    if not url:
        return None
    match = re.search(r"(?:v=|/)([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])", url)
    return match.group(1) if match else None

def run_process_and_log(command: list) -> int:
    """Run *command* as a subprocess and echo its output line by line.

    Each stdout line is printed prefixed with ``stdout: `` and each stderr
    line prefixed with ``WARNING! stderr: ``.

    The two pipes are drained concurrently. Reading stdout to EOF before
    touching stderr blocks forever as soon as the child fills the stderr pipe
    buffer, because the child then never reaches the point of closing stdout.

    Args:
        command: Program and arguments, passed to ``subprocess.Popen`` as a
            list (no shell is involved).

    Returns:
        The exit status of the child process. A non-zero status is also
        reported with a printed warning; no exception is raised for it.
    """
    import threading

    def _echo_lines(stream, prefix):
        # Forward every line of one pipe to our own stdout.
        for line in stream:
            print(f"{prefix}{line.strip()}")

    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) as process:
        stderr_reader = threading.Thread(
            target=_echo_lines,
            args=(process.stderr, "WARNING! stderr: "),
            daemon=True,
        )
        stderr_reader.start()
        _echo_lines(process.stdout, "stdout: ")
        stderr_reader.join()
        return_code = process.wait()

    if return_code != 0:
        print(f"WARNING! command exited with status {return_code}")
    return return_code


def identify_speaker(file_name, model, embedding_candidato):
    """Score every transcript segment against the candidate's reference voice.

    Loads ``{file_name}.json`` and, for each entry of its ``"segments"`` list,
    crops ``{file_name}.wav`` between the segment's ``start`` and ``end``,
    embeds the crop with ``model`` and stores the cosine distance to
    ``embedding_candidato`` under the segment's ``identity_distance`` key.
    A smaller distance means the segment is closer to the reference voice.
    The segment's ``words`` entry, if present, is removed.

    Relies on the notebook-level names ``audio``, ``Segment``, ``cdist`` and
    ``time`` being defined before the call.

    Args:
        file_name: Path stem shared by the transcript (``.json``) and the
            audio (``.wav``) files.
        model: Callable that turns a cropped waveform into an embedding.
        embedding_candidato: Embedding of the candidate's reference voice,
            in the 2-D layout expected by ``cdist``.

    Returns:
        The list of segment dicts, each with ``identity_distance`` added.
    """
    # JSON is UTF-8 by specification; be explicit because the transcript is
    # Spanish text and the platform default encoding may differ.
    with open(f"{file_name}.json", "r", encoding="utf-8") as file:
        transcription = json.load(file)

    stime = time.time()
    wav_path = f"{file_name}.wav"

    # Compare each segment to the sample voice and record the distance
    for segment in transcription['segments']:

        # Extract the embedding of whoever speaks during this segment
        target_speaker = Segment(segment['start'], segment['end'])
        waveform_target, _sample_rate = audio.crop(wav_path, target_speaker)
        # [None] adds a leading axis (presumably the batch axis the model expects)
        embedding_target = model(waveform_target[None])

        # Compare embeddings using "cosine" distance
        distance = cdist(embedding_candidato, embedding_target, metric="cosine")

        # Store a plain Python float so the value is always JSON-serializable
        segment['identity_distance'] = float(distance[0][0])

        # Word-level details are not used downstream; a segment may lack the
        # key entirely, so remove it without failing.
        segment.pop('words', None)

    print(f'total time: {time.time()-stime:.2f} seconds')

    return transcription['segments']

---
---
## PROCESS VIDEOS


In [None]:
#%%

###########                    MAIN                ###########################
import subprocess
import time
import torch
from datetime import datetime
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
from scipy.spatial.distance import cdist
import logging

# Main pipeline: for every candidate, fetch the videos still marked
# 'Not processed', download their audio, transcribe and diarize it with
# whisperx, score each transcript segment against the candidate's reference
# voice, save the result to the project folder and update the row status.

# Candidates to process (the constant name is spelled "CADIDATES" in this notebook).
CADIDATES = ['Milei']
# Format of the timestamps written to the `status_ts` column.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

for candidate_name in CADIDATES:

    # Get unprocessed links for the candidate from SQLite.
    # ipython-sql magic: `data <<` stores the result set in `data`, and
    # `:candidate_name` is bound from the notebook variable of that name.
    %sql data << SELECT * FROM candidate_videos WHERE status = 'Not processed' AND CANDIDATE = :candidate_name

    print(f"Retrieved {len(data)} unprocessed links for candidate {candidate_name}")

    # Initialize models.
    # The embedding model is placed on "cuda", so a GPU runtime is required.
    model = PretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=torch.device("cuda"))
    # `audio` is also read as a global by identify_speaker().
    audio = Audio(sample_rate=16000, mono="downmix")
    # Window of the reference recording used as the voice sample
    # (presumably seconds 1 to 15 -- confirm against pyannote's Segment).
    candidato = Segment(1., 15.)

    # Get candidate sample voice from the `data` directory and embed it
    waveform_candidato, sample_rate = audio.crop(f"{project_path}/data/muestras-de-voz/{candidate_name}.wav", candidato)
    embedding_candidato = model(waveform_candidato[None])

    for r in data:


        # Define URL coming from database (first column of the row)
        url = r[0]
        # url = 'https://www.youtube.com/shorts/7iztngtoe-0'

        # The video ID doubles as the base name of every file produced below.
        file_name = extract_video_id(url)
        print(f"Processing id: {file_name}  -- {url}")

        # Mark the row as being worked on.
        # NOTE(review): nothing resets this status if a later step raises, so
        # a failed video appears to stay 'In progress' -- confirm this is intended.
        ts = datetime.now().strftime(TIMESTAMP_FORMAT)
        %sql UPDATE candidate_videos SET status = 'In progress', status_ts = :ts WHERE url=:url

        ###
        # Download Video's Audio
        # yt-dlp is asked to extract the audio as WAV into `{file_name}.<ext>`.
        yt_download_command = [ "python", "-m", "yt_dlp", "--output", f"{file_name}.%(ext)s",
                               "--extract-audio", "--audio-format", "wav", url,]

        run_process_and_log(yt_download_command)
        print(f"Audio has been downloaded to {file_name}")

        # Transcribe Audio
        # whisperx CLI: large-v2 model, Spanish, diarization with at least 2 speakers.
        # NOTE(review): the align model name looks like an English wav2vec2
        # checkpoint while the language is "es" -- confirm it is the intended one.
        whisperx_command = ["whisperx", f"{file_name}.wav", "--hf_token", hf_token,
                            "--model", "large-v2", "--language", "es",
                            "--align_model", "WAV2VEC2_ASR_LARGE_LV60K_960H",
                            "--diarize", "--min_speakers", "2", ]

        print(f"Start audio transcription....")
        run_process_and_log(whisperx_command)
        print(f"Transcription finished")

        print(f"Identifying speakers. Flag target speaker...")
        # Get transcription and identify TARGET speaker.
        # identify_speaker() reads `{file_name}.json` and `{file_name}.wav`;
        # the JSON is presumably the file whisperx wrote above -- confirm.
        segments = identify_speaker(file_name, model, embedding_candidato)

        # Persist the scored segments under the candidate's folder in the project directory.
        with open(f"{project_path}/data/{candidate_name}/{file_name}.json", "w") as fh:
            json.dump(segments, fh, ensure_ascii=False, indent=4)

        print(f"Target identified")

        print("Updating DB")

        # Mark the row as done.
        ts = datetime.now().strftime(TIMESTAMP_FORMAT)
        %sql UPDATE candidate_videos SET status = 'PROCESSED', status_ts = :ts WHERE url=:url


