In [1]:
# Imports
print("Importing Libraries...")
import os
import warnings
import logging
import pytorch_lightning as pl
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import whisperx
import gc
import pytube
import subprocess
from pydub import AudioSegment, silence
import time
import pandas as pd
import shutil
print("Imported Libraries...")

# Segmentation Function
def play_wav_segment(file_path, audio, saved_dir, end_url, start_time_s, end_time_s):
    os.makedirs(saved_dir, exist_ok=True)
    start_time_ms = start_time_s * 1000
    end_time_ms = end_time_s * 1000
    segment = audio[start_time_ms:end_time_ms]
    nonsilent_parts = silence.detect_nonsilent(segment, min_silence_len=400, silence_thresh=-40)
    if nonsilent_parts:
        segment = segment[nonsilent_parts[0][0]:nonsilent_parts[-1][1]]
    else:
        segment = segment
    file_name = saved_dir + "/current" + "/" + end_url + "___" + str(start_time_s).split('.')[0] + "__" + str(end_time_s).split('.')[0] + ".wav"
    segment.export(file_name, format="wav")

# Cooling Mechanism
def get_gpu_temperature():
    sp = subprocess.Popen(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out_str = sp.communicate()[0]
    return float(out_str.strip())

def wait_until_cool_down(target_temp):
    gpu_temp = get_gpu_temperature()
    while gpu_temp >= target_temp:
        print(f"GPU Temperature: {gpu_temp} °C - Cooling down...")
        time.sleep(60)
        gpu_temp = get_gpu_temperature()

    print(f"GPU Temperature cooled down to {gpu_temp} °C")

# Loading the Model
device = "cuda"
batch_size = 16
compute_type = "float16"
print("Loading Model...")
model_path = "/tf/models/huggingface/hub/models--Systran--faster-whisper-large-v2/snapshots/f0fe81560cb8b68660e564f55dd99207059c092e"
model = whisperx.load_model(model_path, device, compute_type=compute_type, language='ur')
print("Model Loaded...")

Importing Libraries...
Imported Libraries...
Loading Model...


100%|██████████████████████████████████████| 16.9M/16.9M [01:04<00:00, 273kiB/s]


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.2+cu118. Bad things might happen unless you revert torch to 1.x.
Model Loaded...


In [4]:
# Global Variables
video_len = 7
saved_dir = ""
end_url = ""
video_urls = list(set(pd.read_csv("video_links.csv").values.reshape(-1)))

# Main Loop
for i, video_url in enumerate(video_urls):
    print(i + 1, "Processing ", video_url, "...")
    start_main = time.time()
    
    # Downloading the video
    print("Downloading the video and Converting to .wav...")
    start = time.time()
    yt = pytube.YouTube(video_url)
    videos = yt.streams.filter(only_audio=True).first()
    videos.download()
    parent_dir = './'
    new_filename = 'video.wav'
    default_filename = videos.default_filename
    out = subprocess.run([
        'ffmpeg',
        '-y',
        '-loglevel', 'quiet',
        '-i', os.path.join(parent_dir, default_filename),
        os.path.join(parent_dir, new_filename)
    ])
    os.remove(out.args[-2])
    end = time.time()
    print("Download and Conversion Completed! in ", end - start," seconds")
    
    # Assigning values to global variables
    os.makedirs("./data", exist_ok=True)
    end_url = video_url.split("=")[1]
    name = "./data/wavs" #+ end_url
    saved_dir = name
    audio_file = 'video.wav'
    data_dir = './data'
    
    # Length of .wav file
    audio = AudioSegment.from_wav(audio_file)
    audio = audio.set_frame_rate(22050)
    video_length_s = round(len(audio) // 1000)
    video_length_s -= video_length_s % video_len
    print("Video length is ", video_length_s, " seconds")
    
    # Segmenting .wav files
    print("Segmenting the .wav files...")
    start = time.time()
    os.makedirs(saved_dir + "/current", exist_ok=True)
    for i in range(0,video_length_s,video_len):
        play_wav_segment(audio_file, audio, saved_dir, end_url, i, i + video_len)
    end = time.time()
    print("Segmentation of files Completed! in ", end - start," seconds")
    
    # Creating Dataset
    dictionary = {"File":[],"Text":[]}
    print("Creating Dataset...")
    start = time.time()
    segment_count = 0
    for file_name in os.listdir(saved_dir + "/current"):
        cut_audio_file = saved_dir + "/current" + "/" + file_name
        audio = whisperx.load_audio(cut_audio_file)
        result = model.transcribe(audio, batch_size=batch_size)
        if len(result['segments'])==0:
            continue
        dictionary['File'].append(cut_audio_file)
        dictionary['Text'].append(result['segments'][0]['text'])
        if segment_count%100==0:
            print(segment_count,"Segments Complete!")
        segment_count+=1
        
    for filename in os.listdir(saved_dir + "/current"):
        source_path = os.path.join(saved_dir + "/current", filename)
        destination_path = os.path.join(saved_dir, filename)
        shutil.move(source_path, destination_path)
        
    df = pd.DataFrame(dictionary)
    read_df = pd.read_csv(data_dir + "/" + "metadata" + ".txt", sep="|", index_col=None)
    final_df = pd.concat([read_df,df])
    final_df.to_csv(data_dir + "/" + "metadata" + ".txt", sep="|", index=False)
    end = time.time()
    print("Dataset Created in ", end - start," seconds")

    end_main = time.time()
    print(i + 1, "Processed ", video_url, " in ", end_main - start_main, "seconds")

    # Cooling System
    start = time.time()
    wait_until_cool_down(60)
    end = time.time()
    print("Program Slepth for", end - start, "seconds")

1 Processing  https://www.youtube.com/watch?v=ryg4NexZvkI&list=PLTohL7c6ncxm_Tvkvi74oZUDFDzYkJ80H&index=37&pp=iAQB ...
Downloading the video and Converting to .wav...
Download and Conversion Completed! in  10.704430103302002  seconds
Video length is  3269  seconds
Segmenting the .wav files...
Segmentation of files Completed! in  40.46542549133301  seconds
Creating Dataset...
0 Segments Complete!
100 Segments Complete!
200 Segments Complete!
300 Segments Complete!
400 Segments Complete!
Dataset Created in  474.9340670108795  seconds
3263 Processed  https://www.youtube.com/watch?v=ryg4NexZvkI&list=PLTohL7c6ncxm_Tvkvi74oZUDFDzYkJ80H&index=37&pp=iAQB  in  529.0891041755676 seconds
GPU Temperature: 76.0 °C - Cooling down...
GPU Temperature cooled down to 57.0 °C
Program Slepth for 60.074453353881836 seconds
2 Processing  https://www.youtube.com/watch?v=P1cy_s-9h5U&list=PLTohL7c6ncxm_Tvkvi74oZUDFDzYkJ80H&index=24&pp=iAQB ...
Downloading the video and Converting to .wav...
Download and Conver