In [4]:
HOME_DIR = "/root/dev/vcmr" # project root; the notebook uses it as the working directory
# NOTE(review): hard-coded absolute path — anyone running this elsewhere must edit it.

import os
# Switch to the project root before the remaining imports run and before the
# relative CSV path below is resolved.
os.chdir(HOME_DIR)
from yt_dlp import YoutubeDL
import subprocess
from PIL import Image
import torch
import clip
import numpy as np
import sys
from argparse import Namespace
import pandas as pd

# Loaded relative to the current working directory (HOME_DIR after the chdir above).
df = pd.read_csv("./Spotify_Youtube.csv")

# Put the lp-music-caps captioning directory on the import path.
# NOTE(review): re-running this cell appends the same entry again.
sys.path.append(f"{HOME_DIR}/lp-music-caps/lpmc/music_captioning")

# Options handed to YoutubeDL. The format selector asks for separate video and
# audio streams capped at 360p, then a single combined stream capped at 360p,
# then the worst available stream as a last resort.
ydl_opts = dict(
    format='bestvideo[height<=360]+bestaudio/best[height<=360]/worst',
    quiet=True,
    noplaylist=True,
    skip_download=True,
)

# Every sampled clip has the same length; the value is passed to ffmpeg's `-t`.
_SEGMENT_DURATION = '10'

# (clip start, still-frame timestamp) per sampled segment. Each still frame is
# taken 5 seconds after its clip start.
_SEGMENT_TIMES = [
    ('00:00:30', '00:00:35'),
    ('00:01:20', '00:01:25'),
    ('00:02:10', '00:02:15'),
]

# One dict per segment, numbered from 1, in the shape process_url() reads.
segments = [
    {
        'start': clip_start,
        'duration': _SEGMENT_DURATION,
        'segment_num': number,
        'frame_time': still_time,
    }
    for number, (clip_start, still_time) in enumerate(_SEGMENT_TIMES, start=1)
]

# Decide the device once so the captioner and CLIP follow the same rule.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The captioning module is imported and its model built with the captioning
# directory as the working directory (presumably it resolves files relative
# to cwd — TODO confirm). The try/finally guarantees the notebook returns to
# HOME_DIR even if the import or model construction fails, so later cells do
# not silently run from the wrong directory.
os.chdir(f"{HOME_DIR}/lp-music-caps/lpmc/music_captioning")
try:
    from captioning import captioning, evaluate, evaluate_batch

    # Settings consumed by captioning()/evaluate(); `audio_path` is filled in
    # per segment by process_url().
    args = Namespace(
        gpu=0,
        framework="pretrain",
        caption_type="lp_music_caps",
        max_length=128,
        num_beams=5,
        model_type="pretrain",
        audio_path=""
    )

    captioner = captioning(args)
    # Only move to the GPU when one exists, mirroring the CLIP device fallback
    # instead of crashing on CPU-only machines.
    # NOTE(review): whether evaluate() itself works on CPU is not verified here.
    if device == "cuda":
        captioner = captioner.cuda()
finally:
    os.chdir(HOME_DIR)

# CLIP model plus its image preprocessing transform, in inference mode.
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()
# Ends the cell on print() rather than on model.eval().
print()


  pretrained_object = torch.load(f'{save_dir}/{model_types}.pth', map_location='cpu')





In [5]:

def _video_id_from_url(url):
    """Return the id used to name the temporary files for `url`."""
    from urllib.parse import parse_qs, urlparse  # stdlib; local so the import cell is untouched

    ids = parse_qs(urlparse(url).query).get('v')
    if ids:
        # Read the `v` parameter explicitly so extra parameters such as
        # `&t=30s` no longer end up as the "id".
        return ids[0]
    # No `v` parameter: keep the original heuristic.
    return url.split('=')[-1]


def _run_ffmpeg(cmd):
    """Run one ffmpeg command; raise RuntimeError if it exits non-zero."""
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        # Fail loudly instead of letting later steps pick up a missing or
        # stale output file; include the end of stderr for diagnosis.
        raise RuntimeError(f"ffmpeg failed ({result.returncode}): {result.stderr.strip()[-500:]}")
    return result


def process_url(url, return_images=False):
    """Compute CLIP embeddings for the configured segments of one video.

    For every entry in the module-level `segments`, an audio clip and a single
    video frame are extracted with ffmpeg into HOME_DIR. Each frame is encoded
    with CLIP's image encoder; each audio clip is captioned by `captioner` and
    the caption is encoded with CLIP's text encoder.

    Args:
        url: Video URL passed to yt-dlp.
        return_images: When True, return `(image_embeddings, audio_embeddings)`
            so that the success value has the same 2-tuple shape as the
            failure value. Defaults to False, which keeps the original
            behaviour of returning only the caption embeddings.

    Returns:
        On success, a list with one numpy array per segment (caption text
        embeddings), or the 2-tuple described above when `return_images` is
        True. On any exception the error is printed and `(None, None)` is
        returned.
        NOTE(review): with the default flag the success value (a list) and
        the failure value (a 2-tuple) differ in shape; both are kept for
        backward compatibility.

    Side effects:
        Writes `<id>_<n>.mp3` / `<id>_<n>.jpg` into HOME_DIR (not cleaned up),
        overwrites `args.audio_path`, and temporarily changes the working
        directory while captioning.
    """
    try:
        image_embeddings = []
        audio_embeddings = []
        video_id = _video_id_from_url(url)

        print("fetch video info")
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            if 'requested_formats' in info_dict:
                # Assumes video is listed before audio, matching the
                # `video+audio` order of the format selector — TODO confirm.
                video_url = info_dict['requested_formats'][0]['url']
                audio_url = info_dict['requested_formats'][1]['url']
            else:
                # Single combined stream: use it for both audio and frames.
                video_url = info_dict['url']
                audio_url = info_dict['url']

        print("download segments")
        outputs = []  # (audio_path, image_path) per segment, in order
        for segment in segments:
            tag = f"{video_id}_{segment['segment_num']}"
            # Absolute paths: the captioner reads the audio from HOME_DIR, so
            # write there regardless of the current working directory.
            audio_output = os.path.join(HOME_DIR, f"{tag}.mp3")
            image_output = os.path.join(HOME_DIR, f"{tag}.jpg")
            outputs.append((audio_output, image_output))
            _run_ffmpeg(['ffmpeg', '-y', '-ss', segment['start'], '-t', segment['duration'], '-i',
                         audio_url, '-vn', '-acodec', 'libmp3lame', '-ar', '44100', '-ac', '2', audio_output])
            _run_ffmpeg(['ffmpeg', '-y', '-ss', segment['frame_time'], '-i', video_url,
                         '-frames:v', '1', image_output])

        print("processing images")
        for _, image_path in outputs:
            # Preprocess while the file is open, then release the handle.
            with Image.open(image_path) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)
            # Inference only: no autograd graph, same as the text path below.
            with torch.no_grad():
                image_features = model.encode_image(image_input).cpu().numpy()
            image_embeddings.append(image_features)

        print("processing text")
        for audio_path, _ in outputs:
            # evaluate() is run from the captioning directory, as in the
            # original; always return to HOME_DIR, even if it raises.
            os.chdir(f"{HOME_DIR}/lp-music-caps/lpmc/music_captioning")
            try:
                args.audio_path = audio_path
                inference = evaluate(captioner, args)
            finally:
                os.chdir(HOME_DIR)
            text = clip.tokenize(inference[0], truncate=True).to(device)
            with torch.no_grad():
                audio_features = model.encode_text(text).cpu().numpy()
            audio_embeddings.append(audio_features)

        if return_images:
            return image_embeddings, audio_embeddings
        return audio_embeddings
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure.
        print(e)
        return None, None


In [6]:
# Run the pipeline on a single video; the cell displays the returned list of
# caption-text embeddings (one array per segment).
# NOTE(review): in the recorded output the three arrays look identical in every
# printed value — TODO confirm evaluate() really reads args.audio_path for
# each segment rather than reusing one result.
process_url("https://www.youtube.com/watch?v=ZncbtRo7RXs")

fetch video info
download segments
processing images
processing text


[array([[-0.1783 ,  0.383  , -0.3162 , ..., -0.2053 , -0.017  ,  0.066  ],
        [-0.04016,  0.0729 , -0.3345 , ..., -0.408  ,  0.1815 ,  0.1562 ]],
       dtype=float16),
 array([[-0.1783 ,  0.383  , -0.3162 , ..., -0.2053 , -0.017  ,  0.066  ],
        [-0.04016,  0.0729 , -0.3345 , ..., -0.408  ,  0.1815 ,  0.1562 ]],
       dtype=float16),
 array([[-0.1783 ,  0.383  , -0.3162 , ..., -0.2053 , -0.017  ,  0.066  ],
        [-0.04016,  0.0729 , -0.3345 , ..., -0.408  ,  0.1815 ,  0.1562 ]],
       dtype=float16)]