# Get Videos List

In [1]:
import sqlite3
import pandas as pd

connection = sqlite3.connect('../videos_database.db')
cursor = connection.cursor()

cursor.execute('''
select *
from videos_metadata_full vm
--where vm.video_id not in (
--    select distinct vmf.video_id
--    from videos_metadata_features vmf
--)
''')
column_names = [description[0] for description in cursor.description]
video_data = pd.DataFrame(cursor.fetchall(), columns=column_names)
print('Number of videos before dropping duplicates: {}'.format(video_data.shape[0]))
video_data = video_data.drop_duplicates(subset=['video_id'])
print('Number of videos after dropping duplicates: {}'.format(video_data.shape[0]))

Number of videos before dropping duplicates: 5084
Number of videos after dropping duplicates: 5084


In [2]:
video_data

Unnamed: 0,video_id,createTime,playCount,commentCount,collectCount,shareCount,diggCount,author_nickname,author_id,author_uniqueid,...,music_musicAuthorName,music_title,music_applemusic,music_meta_song_id,music_platform,music_song_id,music_platforms,suggest_words,stickers,stickers_types
0,7488037420731059499,1743444631,29300000,16700,293000,447700,5200000,Sammy,6700308861284582405,samanthakuruc,...,Abrilly,original sound,0.0,,,,,,,
1,7476682375007685931,1740800798,22100000,15800,123000,41800,1600000,Cale Brown,6807286701384680453,calebrownn,...,50 Cent,Candy Shop,1.0,6733116688348219393,1.0,48504172,2.0,,,
2,7479460137074347310,1741447543,21300000,30900,36800,96100,974100,LosNegrete,6776434903045587973,losnegretefam,...,LosNegrete,original sound,0.0,,,,,,,
3,7474948817524149546,1740397258,32200000,6159,35300,11500,1200000,Natti Natasha,6519601846326137856,nattinatasha,...,NATTI NATASHA,Desde Hoy,1.0,7449569005020530705,1.0,1786010934,2.0,,,
4,7475915526070439214,1740622255,22100000,10200,347800,136900,3400000,reddek.mccaul,7091788427524342826,reddek.mccaul,...,reddek.mccaul,original sound,0.0,,,,,stubborn love guitar tutorial+++,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5079,7488143895705144622,1743469396,1300000,2924,32800,125600,385900,Hyte,6936303135849726981,hytebrand,...,Hyte,original sound,1.0,7427821692125480976,1.0,1775029106,2.0,,,
5080,7507401898375318790,1747953221,1000000,421,17800,3301,76800,nevaaadaa,6584048047255601158,nevaaadaa,...,nevaaadaa,original sound,1.0,7429044813260277776,1.0,1770465315,2.0,,,
5081,7485286417833151786,1742804178,1000000,20,988,407,3742,FUNCAT-Global,7310141169371808814,funcat_global,...,SUSAKI BEATS,MILLION DOLLAR 4.0,1.0,7416426878259660816,1.0,1769714608,2.0,,,
5082,7496164203254828318,1745336771,1700000,231,2721,549,33200,MAGICJOHN.OFFICIAL,7310483997769860142,magicjohn.official,...,MAGICJOHN.OFFICIAL,original sound,0.0,,,,,,,


In [3]:
# video_data = video_data.sample(200)

# Creating Features

## Date Features

In [4]:
import numpy as np

def create_date_features(df_in):
    df_out = df_in.copy()
    df_out['createTime'] = pd.to_datetime(df_out['createTime'])  
    df_out['createTime_hour'] = df_out['createTime'].dt.hour
    df_out['createTime_weekday'] = df_out['createTime'].dt.dayofweek
    return df_out

## Video Features

In [5]:
import cv2
import numpy as np
import pytesseract
from skimage.measure import shannon_entropy

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Настройки для оптимизации
TESSERACT_CONFIG = '--oem 1 --psm 6'
FRAMES_FOR_DOMINANT_COLORS = 5
CUT_DETECTION_THRESHOLD = 0.8  # Порог для определения склейки
SKIP_FRAMES = 2  # Пропуск кадров для ускорения обработки

def extract_video_features(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Не удалось открыть видеофайл: {video_path}")
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps
    
    brightness_values = []
    motion_values = []
    cut_count = 0
    prev_frame = None
    prev_hist = None
    text_presence = 0
    frame_count = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_count += 1
        if frame_count % (SKIP_FRAMES + 1) != 0:
            continue  # Пропускаем кадры для ускорения
        
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        # Яркость
        brightness = cv2.mean(gray)[0]
        brightness_values.append(brightness)
        
        # Оптический поток
        if prev_frame is not None:
            # Уменьшаем разрешение для ускорения расчета оптического потока
            small_prev = cv2.resize(prev_frame, None, fx=0.5, fy=0.5)
            small_curr = cv2.resize(gray, None, fx=0.5, fy=0.5)
            
            flow = cv2.calcOpticalFlowFarneback(
                small_prev, small_curr, None, 
                pyr_scale=0.5, levels=3, winsize=10, 
                iterations=2, poly_n=5, poly_sigma=1.1, flags=0
            )
            motion = np.mean(np.abs(flow))
            motion_values.append(motion)
        
        # Детекция склеек (упрощенный метод)
        hist = cv2.calcHist([gray], [0], None, [64], [0, 256])  # Меньше бинов для скорости
        cv2.normalize(hist, hist)  # Нормализация для сравнения
        
        if prev_hist is not None:
            diff = cv2.compareHist(prev_hist, hist, cv2.HISTCMP_CORREL)
            if diff < CUT_DETECTION_THRESHOLD:
                cut_count += 1
        
        prev_hist = hist
        prev_frame = gray
    
    cap.release()
    
    result = {
        "fps": fps,
        "avg_brightness": np.mean(brightness_values) if brightness_values else 0,
        "avg_motion": np.mean(motion_values) if motion_values else 0,
        "cut_rate": cut_count / duration if duration > 0 else 0,
        "entropy": shannon_entropy(gray) if 'gray' in locals() else 0,
    }
    
    return result

## Audio Features

### Audio Classification

In [6]:
# import mediapipe as mp
# from mediapipe.tasks import python
# from mediapipe.tasks.python import audio
# import numpy as np
# from scipy.io import wavfile

# def classifiy_audio(audio_path, threshold=0.3):
#     model_path = '1.tflite'
#     AudioClassifier = mp.tasks.audio.AudioClassifier
#     AudioClassifierOptions = mp.tasks.audio.AudioClassifierOptions
#     AudioRunningMode = mp.tasks.audio.RunningMode
#     BaseOptions = mp.tasks.BaseOptions
    
#     options = AudioClassifierOptions(
#         base_options=BaseOptions(model_asset_path=model_path),
#         max_results=5,
#         running_mode=AudioRunningMode.AUDIO_CLIPS,
#         score_threshold=threshold
#     )
    
#     AudioData = mp.tasks.components.containers.AudioData
    
#     sample_rate, buffer = wavfile.read(audio_path)
#     audio_data = AudioData.create_from_array(
#         buffer.astype(float) / np.iinfo(np.int16).max, sample_rate)
    
#     with AudioClassifier.create_from_options(options) as classifier:
#         audio_classifier_result_list = classifier.classify(audio_data)
        
#     audio_classes = []

#     for res in audio_classifier_result_list:
#         for category in res.classifications[0].categories:
#             audio_classes.append(category.category_name)
            
#     return np.unique(audio_classes).tolist()

### Music or Non-Music Classifiaction

In [7]:
from transformers import pipeline
import librosa

model_name = 'MarekCech/GenreVim-Music-Detection-DistilHuBERT'
pipe = pipeline('audio-classification', model=model_name, device=0, batch_size=18)

# def classify_music_non_music(audio_path, pipe):
#     audio, rate = librosa.load(audio_path)
#     return pipe(audio)[0]['label']

# define data streamer
def data_stream(samples):
    for i in range(len(samples)):
        # try:
        audio, rate = librosa.load('../parsing/audio_from_videos/' + samples[i] + '.mp3')
        chunk_duration = 20
        chunk_samples = int(chunk_duration * rate)
        chunks = [audio[i:i + chunk_samples] for i in range(0, len(audio), chunk_samples)]
        chunks = sorted(chunks, key=lambda x: x.shape[0], reverse=True)
        yield chunks[0]
        # yield audio
        # except:
        #     print('not_found')
        #     pass

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [8]:
# video_data = video_data.reset_index(drop=True)

In [9]:
# audio, rate = librosa.load('../parsing/audio_from_videos/' + video_data.loc[5, 'video_id'] + '.wav')

In [10]:
# # chunk duration 2 seconds
# chunk_duration = 10
# chunk_samples = int(chunk_duration * rate)
# chunks = [audio[i:i + chunk_samples] for i in range(0, len(audio), chunk_samples)]

In [11]:
# import time

# ress = []
# t1 = time.perf_counter()
# for result in pipe.predict(data_stream(video_data['video_id'].tolist())):
#     ress.append(result)
# t2 = time.perf_counter()
# print('Worktime: {:.2f} seconds'.format(t2-t1))

### Other Audio Features

In [12]:
import numpy as np
import librosa
import pandas as pd
import subprocess
import os

def calculate_snr(y, sr):
    # Разделяем сигнал и шум (например, через высокочастотный фильтр)
    S = librosa.stft(y)
    magnitude = np.abs(S)
    noise = np.median(magnitude)  # Простая оценка шума
    signal = np.max(magnitude)    # Оценка сигнала
    snr = 10 * np.log10(signal / noise) if noise > 0 else 100  # в dB
    return snr

def calculate_harmonicity(y):
    harmonic, percussive = librosa.effects.hpss(y)
    harmonic_ratio = np.sum(harmonic**2) / (np.sum(y**2) + 1e-6)
    return harmonic_ratio

def calculate_dynamic_range(y):
    dyn_range = np.max(y) - np.min(y)
    return dyn_range

def calculate_spectral_centroid(y, sr):
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    return np.mean(centroid)

def detect_clipping(y, threshold=0.99):
    clipping_samples = np.sum(np.abs(y) > threshold * np.max(np.abs(y)))
    clipping_ratio = clipping_samples / len(y)
    return clipping_ratio

def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path)
    # duration = librosa.get_duration(y=y, sr=sr)
    
    # Темп и ритм
    tempo = librosa.beat.tempo(y=y, sr=sr)[0]
    
    # MFCC (мел-кепстральные коэффициенты)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_avg = np.mean(mfcc, axis=1)
    
    # Громкость
    rms = librosa.feature.rms(y=y).mean()
    
    chunk_duration = 5
    chunk_samples = int(chunk_duration * sr)
    chunks = [y[i:i + chunk_samples] for i in range(0, len(y), chunk_samples)]
    chunks = sorted(chunks, key=lambda x: x.shape[0], reverse=True)
    y = chunks[0]
    snr = calculate_snr(y, sr)
    harmonicity = calculate_harmonicity(y)
    dyn_range = calculate_dynamic_range(y)
    centroid = calculate_spectral_centroid(y, sr)
    clipping = detect_clipping(y)
    
    quality_score = (
        0.3 * snr +
        0.2 * harmonicity +
        0.2 * dyn_range +
        0.1 * centroid +
        0.2 * clipping
    )
    
    return {
        "tempo": tempo,
        "loudness": rms,
        **{f"mfcc_{i+1}": mfcc_avg[i] for i in range(13)},
        "quality_score": quality_score,
        "snr": snr,
        "harmonicity": harmonicity,
        "dyn_range": dyn_range,
        "centroid": centroid,
        "clipping": clipping
    }

# Features Extracting

In [14]:
import time
import torch
import warnings as w
w.filterwarnings('ignore')

# Date Features
t1 = time.perf_counter()
video_data = create_date_features(video_data)
t2 = time.perf_counter()
print('Date Features Worktime: {:.2f} seconds for {} videos'.format(t2-t1, len(video_data)))

video_data = video_data.reset_index(drop=True)

# Video Features
t1 = time.perf_counter()
video_features = pd.DataFrame()
video_list_in_formatted_videos = [x.replace('.mp4', '') for x in os.listdir('../parsing/formatted_videos') if '.mp4' in x]
for vid in video_data['video_id'].unique().tolist():
    try:
        if vid not in video_list_in_formatted_videos:
            continue
        to_append = pd.DataFrame.from_dict(
            extract_video_features("../parsing/formatted_videos/{}.mp4".format(vid)),
            orient='index'
        ).T
        to_append['video_id'] = vid
        video_features = pd.concat([video_features, to_append], axis=0)
    except:
        pass

t2 = time.perf_counter()
print('Video Features Worktime: {:.2f} seconds for {} videos'.format(t2-t1, len(video_data)))

# Music/Non Music Classification
music_non_music = pd.DataFrame()
t1 = time.perf_counter()
audio_list_in_audio_from_videos = [x.replace('.mp3', '') for x in os.listdir('../parsing/audio_from_videos') if '.mp3' in x]

audios_to_predict = video_data['video_id'].unique().tolist()
ser = pd.Series(audios_to_predict)
audios_to_predict = ser[ser.isin(audio_list_in_audio_from_videos)].tolist()
# audios_to_predict = list(set(audio_list_in_audio_from_videos) - set(audios_to_predict))
audio_number = 0
for result in pipe.predict(data_stream(audios_to_predict)):
    # ress.append(result[0]['label'])
    # try:
    to_append = pd.DataFrame([result[0]['label']])
    to_append.columns = ['music_non_music']
    to_append['video_id'] = audios_to_predict[audio_number]
    music_non_music = pd.concat([music_non_music, to_append])
    audio_number += 1
    # except:
    #     print("Can't predict audio music/non-music")
    #     pass
    
torch.cuda.empty_cache()
t2 = time.perf_counter()
print('Music/Non-Music Classification Worktime: {:.2f} seconds for {} videos'.format(t2-t1, len(video_data)))


# Other Audio Features
all_audio_features = pd.DataFrame()
t1 = time.perf_counter()

for aud in video_data['video_id'].unique().tolist():
    try:
        if aud not in audio_list_in_audio_from_videos:
            continue
        res = extract_audio_features('../parsing/audio_from_videos/'+aud+'.mp3')
        to_append = pd.DataFrame.from_dict(res, orient='index').T
        to_append['video_id'] = aud
        all_audio_features = pd.concat([all_audio_features, to_append])
    except:
        pass

t2 = time.perf_counter()
print('Other Audio Features Worktime: {:.2f} seconds for {} videos'.format(t2-t1, len(video_data)))

try:
    all_features = video_features.merge(
        all_audio_classes_pv, 
        on=['video_id'], 
        how='outer').merge(
        all_audio_features, 
        on=['video_id'], 
        how='outer'
    )
except:
    pass

try:
    all_features = video_features.merge(
        music_non_music, 
        on=['video_id'], 
        how='outer').merge(
        all_audio_features, 
        on=['video_id'], 
        how='outer'
    )
except:
    pass
# all_features = all_features.fillna(0)

Date Features Worktime: 0.01 seconds for 5084 videos
Video Features Worktime: 995.52 seconds for 5084 videos
Music/Non-Music Classification Worktime: 984.28 seconds for 5084 videos
Other Audio Features Worktime: 1932.65 seconds for 5084 videos


Video Features Worktime: 983.19 seconds for 5084 videos

In [19]:
video_data_ = video_data.merge(all_features, on='video_id')

ser_null = video_data_.isnull().sum()
ser_null[ser_null>0]

video_theme                    45
parsingTime                    45
video_subtitles_languages    2553
video_subtitles_avg_size     2553
claInfo_enableAutoCaption       5
claInfo_hasOriginalAudio        5
claInfo_noCaptionReason       746
music_authorName                6
music_duration                  3
music_album                  4103
music_applemusic                6
music_platform               3188
music_platforms              3188
dtype: int64

In [20]:
print('Row Numbers before dropna: {}'.format(video_data_.shape[0]))

Row Numbers before dropna: 5068


In [17]:
video_data_ = video_data_.dropna()

In [18]:
print('Row Numbers after dropna: {}'.format(video_data_.shape[0]))

Row Numbers after dropna: 166


In [21]:
print('--- Uploading Data to the Database ---')
# Write DataFrame to SQLite table
video_data_.to_sql('videos_metadata_all_features', connection, if_exists='append', index=False)
print('--- Data Successfully Uploaded ---')

--- Uploading Data to the Database ---
--- Data Successfully Uploaded ---
