# imports

In [38]:
import json
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# Load the raw JSON export and flatten its "collector" records into a DataFrame.
pd.set_option("display.max_columns", None)  # show every column when printing

with open("data/trending_videos.json", encoding="utf-8") as json_file:
    payload = json.load(json_file)

df = pd.json_normalize(payload["collector"])

# quick look at the first rows
print(df.head())


                    id                                               text  \
0  6907228749016714497                                  Confidence went 📈   
1  6875468410612993286  Quiet Zone... follow me on insta: joeysofo. Co...   
2  6898699405898059010  Iphone bend test🤗 #tiktok #viral #fyp #iphone ...   
3  6902819837345533186                                                      
4  6905635666588192002                           小技です👟✨#tiktok教室#tutorial   

   createTime                                        webVideoUrl  \
0  1608214517  https://www.tiktok.com/@ninakleij/video/690722...   
1  1600819763  https://www.tiktok.com/@joeysofo/video/6875468...   
2  1606228625  https://www.tiktok.com/@jackeyephone/video/689...   
3  1607187987  https://www.tiktok.com/@naomivaneeren/video/69...   
4  1607843600  https://www.tiktok.com/@io.dreamer_mk/video/69...   

                                            videoUrl videoUrlNoWaterMark  \
0  https://v77.tiktokcdn.com/ed1f811617d7b5e18b8d...

In [62]:
# fix garbage hashtag list from json
def flatten_hashtags(tag_list):
    """Join a video's hashtag records into one space-separated string.

    Parameters
    ----------
    tag_list : list of dict, or a missing value
        Hashtag records; each dict is read for its ``"name"`` key.

    Returns
    -------
    str
        ``"#name1 #name2 ..."`` in input order, or ``""`` when there is
        nothing to join (empty list, ``None``, NaN, or any non-list value).
    """
    # A missing value may arrive as NaN, which is a *truthy* float: a plain
    # `not tag_list` check lets it through and iterating it raises TypeError.
    if not isinstance(tag_list, (list, tuple)):
        return ""
    # Skip malformed entries instead of failing the whole column on a KeyError.
    return " ".join(
        f"#{d['name']}" for d in tag_list if isinstance(d, dict) and "name" in d
    )

# One space-separated hashtag string per video, stored as a new column.
df["hashtags_str"] = df["hashtags"].map(flatten_hashtags)

# preview the new column
print(df["hashtags_str"])

0                                                       
1                                                       
2                #tiktok #viral #fyp #iphone #test #bend
3                                                       
4                                    #tiktok教室 #tutorial
                             ...                        
995                                  #foryou #foryoupage
996    #gttfg #gotothegym #swolefam #nutrition #diet ...
997                              #fy #foryoupage #foryou
998                                  #fyp #foryou #curls
999    #horserider #horsegirl #equestrian #equestrian...
Name: hashtags_str, Length: 1000, dtype: object


In [63]:
# make matrix of hashtag TF-IDF scores: each row = one video, each column = one hashtag token


def hashtag_tokenizer(text):
    """Split a hashtag string on whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

# TF-IDF over the hashtag strings, tokenised purely on whitespace.
vectorizer = TfidfVectorizer(
    max_features=5000,            # upper bound on vocabulary size
    lowercase=False,              # keep hashtag casing as-is
    tokenizer=hashtag_tokenizer,  # custom whitespace split
    token_pattern=None,           # unused with a custom tokenizer; presumably set to avoid sklearn's warning
)

# learn the vocabulary and build the (videos x hashtags) sparse matrix in one pass
tfidf_matrix = vectorizer.fit_transform(df["hashtags_str"])

print(tfidf_matrix.shape)

(1000, 2220)


In [64]:
# Split each video's description on whitespace to prepare for vectorizing.
# Missing values are filled *before* the cast to str: casting first turns NaN
# into the literal string "nan", which fillna("") can no longer catch, so a
# video without text would end up with a bogus ["nan"] token list.
sentences = df["text"].fillna("").astype(str).str.split().tolist()

print(sentences[:5])

[['Confidence', 'went', '📈'], ['Quiet', 'Zone...', 'follow', 'me', 'on', 'insta:', 'joeysofo.', 'Comment', 'where', 'you', 'wanna', 'see', 'me', 'blade', 'next.', 'Reply', 'to', '@dwight_schnuute'], ['Iphone', 'bend', 'test🤗', '#tiktok', '#viral', '#fyp', '#iphone', '#test', '#bend'], [], ['小技です👟✨#tiktok教室#tutorial']]


In [65]:
# train model
from gensim.models import Word2Vec

# Train skip-gram embeddings on the tokenised descriptions.
# A fixed seed plus a single worker thread makes training repeatable from run
# to run (multi-threaded training reorders updates nondeterministically).
# NOTE: identical results across interpreter launches additionally require
# PYTHONHASHSEED to be set, per the gensim documentation.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=128,   # 128-D
    window=5,
    min_count=2,       # ignore very rare words
    sg=1,              # skip-gram
    epochs=5,
    seed=42,           # fixed RNG seed for reproducible embeddings
    workers=1,         # single thread: required for deterministic training
)

In [66]:
# description into a vector

import numpy as np

def sent_vec(words):
    """Average the Word2Vec vectors of the in-vocabulary tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one description.

    Returns
    -------
    numpy.ndarray
        1-D mean vector of length ``w2v.wv.vector_size``; all zeros when no
        token is known to the model (including empty input).
    """
    wv = w2v.wv
    # tokens the model has no vector for are skipped
    vecs = [wv[w] for w in words if w in wv]
    if vecs:
        return np.mean(vecs, axis=0)
    # Size the fallback from the model instead of a hard-coded 128, so the
    # rows still stack if the embedding dimension is changed.
    return np.zeros(wv.vector_size)

# Stack the per-video mean vectors row-wise: one row per entry in `sentences`,
# one column per embedding dimension -- (1000, 128) for the run shown below.
per_video_vecs = [sent_vec(tokens) for tokens in sentences]
desc_vecs = np.vstack(per_video_vecs)

print(desc_vecs.shape)


(1000, 128)


In [69]:
# Concatenate the two matrices
from scipy.sparse import hstack

# Put the hashtag TF-IDF columns and the description-embedding columns side by side.
tfidf_sparse = tfidf_matrix.astype(float)  # ensure compatible dtype
stacked = hstack([tfidf_sparse, desc_vecs])
combined = stacked.tocsr()  # keep the result in CSR format

# Width = TF-IDF vocabulary size + embedding size (2220 + 128 in the run shown),
# not max_features + 128: max_features is only an upper bound on the vocabulary.
print(combined.shape)


(1000, 2348)


In [71]:
# Persist the combined features as a dense float32 array.
# np.save does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs("output", exist_ok=True)
np.save("output/videos.npy", combined.astype(np.float32).toarray())