# Full Data Enrichment Pipeline Demo

Downloads the TikTok video and runs it through the AudioProcessor and VideoProcessor. <br/>
Same as `cli.py`.

In [1]:
import sys
import os
import logging
import json
from dotenv import load_dotenv

# Change cwd to the DataEnrichment root folder (the parent of the kernel's starting cwd).
# The root is computed only once per kernel session: without this guard, re-running the
# cell would climb one more directory level each time and add another entry to sys.path.
if "data_enrichment_root_path" not in globals():
    data_enrichment_root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
os.chdir(data_enrichment_root_path)

# Make the project-local modules importable from the root folder.
if data_enrichment_root_path not in sys.path:
    sys.path.append(data_enrichment_root_path)

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Project-local modules; imported only after the root folder was added to sys.path above.
import db
from worker import process
from api import video_features

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the database connection that the cells below pass to process() and the query helpers.
db_conn = db.get_connection()

INFO - [EnrichmentWorker] - Successfully Connected to Database


In [3]:
# The execution will fail if video_id is not also present in the "videos" table in the database,
# because a foreign key constraint links "video_features" to "videos".
# So make sure the video_id below also exists in "videos" before running this cell.

video_id = 7526924411681656086
process(video_id, db_conn)

INFO - [EnrichmentWorker] - Starting data enrichment process for video_id: 7526924411681656086
INFO - [EnrichmentWorker] - Audio 7526924411681656086 is already downloaded in /Users/rolandteslaru/Desktop/ZeruelNet/packages/DataEnrichment/tmp/audio/7526924411681656086.wav. Skipping
INFO - [EnrichmentWorker] - Starting transcriber model ggml-large-v3.bin for audio in /Users/rolandteslaru/Desktop/ZeruelNet/packages/DataEnrichment/tmp/audio/7526924411681656086.wav
INFO - [EnrichmentWorker] - Loading model cardiffnlp/twitter-roberta-base-sentiment-latest for language en
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mo

In [None]:
# NOTE(review): VideoProcessor, video_path, transcript, sentiment_results and description are
# not defined in any of the cells reviewed, and this cell has no execution count. It presumably
# relies on names from cells that were removed — define/import them or delete this cell (TODO confirm).
analysis_result_json = VideoProcessor.process(video_path, transcript, sentiment_results, description)
analysis_result_json

In [None]:
# Batch run: enrich a hand-picked list of video ids one after another,
# reusing the same database connection for every call.
video_ids = [
    7524257236559039767,
    7504369259968171286,
    7528368800568331542,
    7520490141178285334,
    7508992497897540886,
]

for vid_id in video_ids:
    process(vid_id, db_conn)

In [3]:
def get_all_video_ids_from_db(conn):
    """Return the ``video_id`` of every row in ``public.videos``.

    Args:
        conn: Database connection whose ``cursor()`` result works as a context
            manager and exposes ``execute`` and ``fetchall``.

    Returns:
        list: The first column of each fetched row, in the order the query
        returned them. If anything raises while querying, the error is logged
        and an empty list is returned instead of propagating the exception.
    """
    query = "SELECT video_id FROM public.videos"
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            return [row[0] for row in cursor.fetchall()]
    except Exception as exc:
        # Best-effort helper: report the failure and hand back an empty result.
        logging.error(f"Failed to fetch video IDs directly from database: {exc}")
        return []
# An empty list here can also mean the query failed: the helper logs the error and returns [].
all_videos_ids = get_all_video_ids_from_db(db_conn)

In [8]:
# Video ids to leave out of the batch.
excluded_video_ids = [
    7508959243270130966,
    7528342682695896328,
    7528106266921651478,
    7527409370611617031,
    7078746205030780162,
    7489163965919055159,
    7528355978962701591,
    7528389380721904918,
    7527745729327107383,
    7528105592339090710
]

# Membership is checked against a set so each lookup is O(1) instead of a scan of the list.
# NOTE(review): the excluded ids are ints; if all_videos_ids holds strings nothing will be
# filtered out — confirm the id type returned by the database.
excluded_video_id_set = set(excluded_video_ids)
filtered_ids = [vid for vid in all_videos_ids if vid not in excluded_video_id_set]

In [4]:
# Ask the project's video_features API for the entries with status "failed"
# (presumably the videos whose enrichment did not complete — confirm against api.video_features).
failed_videos = video_features.get_by_status(db_conn, "failed")

In [5]:
# Show the ids that the next cell re-runs through process().
failed_videos

['7527601571677441302',
 '7477187105769147670',
 '7516538050994834710',
 '7525069106249354518',
 '7528042018719812886',
 '7528183959482846482',
 '7528337207082470678',
 '7528197825977470239',
 '7528363882931326230',
 '7528092762961480982',
 '7527737023310859542',
 '7528203437725601055',
 '7527722095715962167',
 '7528327128786668823',
 '7503650001327901974',
 '7508959243270130966',
 '7528342682695896328',
 '7528106266921651478',
 '7527409370611617031',
 '7078746205030780162',
 '7538880118765194510']

In [6]:
# Re-run the enrichment for every video returned by the "failed" status query.
# NOTE(review): the ids in failed_videos are strings, whereas the earlier cells pass ints to
# process() — confirm process() accepts both. This loop also rebinds the notebook-level
# `video_id` variable set in an earlier cell.
for video_id in failed_videos:
    process(video_id, db_conn)

INFO - [EnrichmentWorker] - Starting data enrichment process for video_id: 7527601571677441302
INFO - [EnrichmentWorker] - Downloading MP4 for 7527601571677441302 → /Users/rolandteslaru/Desktop/ZeruelNet/packages/DataEnrichment/tmp/video/7527601571677441302.mp4
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO - [EnrichmentWorker] - Successfully extracted audio to: /Users/rolandteslaru/Desktop/ZeruelNet/packages/DataEnrichment/tmp/audio/7527601571677441302.wav
INFO - [EnrichmentWorker] - Starting transcriber model ggml-large-v3.bin for audio in /Users/rolandteslaru/Desktop/ZeruelNet/packages/DataEnrichment/tmp/audio/7527601571677441302.wav
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa