In [83]:
import json
import pandas as pd
import os
import numpy as np

filenames = ['videos_metadata_json/' + x for x in os.listdir('videos_metadata_json/') if '.json' in x]

In [84]:
data = []

for file_path in filenames:
    with open(file_path, 'r') as f:
        file_data = json.loads(f.read())
    data.extend(file_data)

In [85]:
len(data)

10682

In [86]:
data[0]['video']['claInfo']

{'enableAutoCaption': True, 'hasOriginalAudio': False, 'noCaptionReason': 3}

In [87]:
import time

t1 = time.perf_counter()

videos = {}

for video in data:
    # t1 = time.perf_counter()
    video_id = video['id']
    # if video_id in already_downloaded['video_id'].tolist():
    #     continue
    # if 'imagePost' in video.keys():
    #     time.sleep(1)
    #     continue
    videos[video_id] = {}
    
    # Video
    videos[video_id]['createTime'] = video['createTime']
    videos[video_id]['playCount'] = video['stats']['playCount']
    videos[video_id]['commentCount'] = video['stats']['commentCount']
    videos[video_id]['collectCount'] = video['stats']['collectCount']
    videos[video_id]['shareCount'] = video['stats']['shareCount']
    videos[video_id]['diggCount'] = video['stats']['diggCount']
    
    # Author
    videos[video_id]['author_nickname'] = video['author']['nickname']
    videos[video_id]['author_id'] = video['author']['id']
    videos[video_id]['author_uniqueid'] = video['author']['uniqueId']
    videos[video_id]['author_verified'] = video['author']['verified']
    
    if 'commentSetting' in video['author'].keys():
        videos[video_id]['author_commentSetting'] = video['author']['commentSetting']
    else:
        videos[video_id]['author_commentSetting'] = np.nan
    if 'downloadSetting' in video['author'].keys():
        videos[video_id]['author_downloadSetting'] = video['author']['downloadSetting']
    else:
        videos[video_id]['author_downloadSetting'] = np.nan
    if 'duetSetting' in video['author'].keys():
        videos[video_id]['author_duetSetting'] = video['author']['duetSetting']
    else:
        videos[video_id]['author_duetSetting'] = np.nan
        
    videos[video_id]['author_signature'] = video['author']['signature']
    videos[video_id]['author_followerCount'] = video['authorStats']['followerCount']
    videos[video_id]['author_followingCount'] = video['authorStats']['followingCount']
    videos[video_id]['author_videoCount'] = video['authorStats']['videoCount']
    videos[video_id]['author_heartCount'] = video['authorStats']['heartCount']
    videos[video_id]['author_diggCount'] = video['authorStats']['diggCount']
    
    # VideoTechStats
    videos[video_id]['video_duration'] = video['video']['duration']
    if video['video']['duration'] > 250:
        time.sleep(1)
        continue
    try:
        videos[video_id]['video_volumeInfo_Loudness'] = video['video']['volumeInfo']['Loudness']
    except:
        videos[video_id]['video_volumeInfo_Loudness'] = np.nan
    try:
        videos[video_id]['video_volumeInfo_Peak'] = video['video']['volumeInfo']['Peak']
    except:
        videos[video_id]['video_volumeInfo_Peak'] = np.nan
    if 'video_theme_button' in video.keys():
        videos[video_id]['video_theme'] = video['video_theme_button']
    else:
        videos[video_id]['video_theme'] = np.nan
    if 'parsingTime' in video.keys():
        videos[video_id]['parsingTime'] = video['parsingTime']
    else:
        videos[video_id]['parsingTime'] = np.nan
        
    videos[video_id]['video_url'] = 'https://www.tiktok.com/@/video/{}'.format(video_id)
    videos[video_id]['video_urlDownload'] = video['video']['playAddr']
    videos[video_id]['video_categoryType'] = video['CategoryType']
    # videos[video_id]['video_desc'] = video['contents'][0]['desc']
    videos[video_id]['video_desc2'] = video['desc']
    videos[video_id]['video_text_content'] = ''
    if 'contents' in video.keys():
        for content in video['contents']:
            videos[video_id]['video_text_content'] = videos[video_id]['video_text_content'] + ' ' + content['desc']
    videos[video_id]['video_hashtags'] = ''
    if 'challenges' in video.keys():
        for challenge in video['challenges']:
            videos[video_id]['video_hashtags'] = videos[video_id]['video_hashtags'] + ' ' + challenge['title']
    # try:
    #     videos[video_id]['video_hashtags'] = video['contents'][1]['desc']
    # else:
    #     videos[video_id]['video_hashtags'] = ''
    subtitle_languages = []
    subtitle_sizes = []
    if 'subtitleInfos' in video['video'].keys():
        for subtitle in video['video']['subtitleInfos']:
            subtitle_languages.append(subtitle['LanguageID'])
            subtitle_sizes.append(subtitle['Size'])
            
        videos[video_id]['video_subtitles_languages'] = len(np.unique(subtitle_languages))
        videos[video_id]['video_subtitles_avg_size'] = np.mean(subtitle_sizes)
    else:
        videos[video_id]['video_subtitles_languages'] = np.nan
        videos[video_id]['video_subtitles_avg_size'] = np.nan
        
    if 'claInfo' in video['video'].keys():
        videos[video_id]['claInfo_enableAutoCaption'] = video['video']['claInfo']['enableAutoCaption']
        videos[video_id]['claInfo_hasOriginalAudio'] = video['video']['claInfo']['hasOriginalAudio']
        if 'noCaptionReason' in video['video']['claInfo'].keys():
            videos[video_id]['claInfo_noCaptionReason'] = video['video']['claInfo']['noCaptionReason']
        else:
            videos[video_id]['claInfo_noCaptionReason'] = np.nan
    else:
        videos[video_id]['claInfo_enableAutoCaption'] = np.nan
        videos[video_id]['claInfo_hasOriginalAudio'] = np.nan
        videos[video_id]['claInfo_noCaptionReason'] = np.nan
        
    
    if 'duetDisplay' in video.keys():
        videos[video_id]['video_duetDisplay'] = video['duetDisplay']
    else:
        videos[video_id]['video_duetDisplay'] = np.nan
    if 'forFriend' in video.keys():
        videos[video_id]['video_forFriend'] = video['forFriend']
    else:
        videos[video_id]['video_forFriend'] = np.nan
    if 'isAd' in video.keys():
        videos[video_id]['video_isAd'] = video['isAd']
    else:
        videos[video_id]['video_isAd'] = np.nan
    if 'itemCommentStatus' in video.keys():
        videos[video_id]['video_itemCommentStatus'] = video['itemCommentStatus']
    else:
        videos[video_id]['video_itemCommentStatus'] = np.nan
    if 'officalItem' in video.keys():
        videos[video_id]['officalItem'] = video['officalItem']
    else:
        videos[video_id]['officialItem'] = np.nan
    if 'originalItem' in video.keys():
        videos[video_id]['originalItem'] = video['originalItem']
    else:
        videos[video_id]['originalItem'] = np.nan
    if 'shareEnabled' in video.keys():
        videos[video_id]['shareEnabled'] = video['shareEnabled']
    else:
        videos[video_id]['shareEnabled'] = np.nan
        
    videos[video_id]['textLanguage'] = video['textLanguage']
    videos[video_id]['VQScore'] = video['video']['VQScore']
    videos[video_id]['bitrate'] = video['video']['bitrate']
    videos[video_id]['definition'] = video['video']['definition']
    videos[video_id]['ratio'] = video['video']['ratio']
    videos[video_id]['video_quality'] = video['video']['videoQuality']
    
    # Music
    if 'authorName' in video['music'].keys():
        videos[video_id]['music_authorName'] = video['music']['authorName']
    else:
        videos[video_id]['music_authorName'] = np.nan
    if 'duration' in video['music'].keys():
        videos[video_id]['music_duration'] = video['music']['duration']
    else:
        videos[video_id]['music_duration'] = np.nan
    videos[video_id]['music_isCopyrighted'] = video['music']['isCopyrighted']
    videos[video_id]['music_original'] = video['music']['original']
    videos[video_id]['music_id'] = video['music']['id']
    if 'authorName' in video['music'].keys() and 'title' in video['music'].keys():
        if 'album' in video['music'].keys():
            videos[video_id]['music_album'] = video['music']['album']
        else:
            videos[video_id]['music_album'] = np.nan
        videos[video_id]['music_musicAuthorName'] = video['music']['authorName']
        videos[video_id]['music_title'] = video['music']['title']
        videos[video_id]['music_applemusic'] = 0
        if 'tt2dsp' in video['music'].keys():
            try:
                videos[video_id]['music_meta_song_id'] = video['music']['tt2dsp']['tt_to_dsp_song_infos'][0]['meta_song_id']
                videos[video_id]['music_platform'] = video['music']['tt2dsp']['tt_to_dsp_song_infos'][0]['platform']
                videos[video_id]['music_song_id'] = video['music']['tt2dsp']['tt_to_dsp_song_infos'][0]['song_id']
                videos[video_id]['music_platforms'] = len(video['music']['tt2dsp']['tt_to_dsp_song_infos'])
                if 'token' in video['music']['tt2dsp']['tt_to_dsp_song_infos'][0].keys():
                    videos[video_id]['music_applemusic'] = 1
            except:
                videos[video_id]['music_meta_song_id'] = ''
                videos[video_id]['music_platform'] = np.nan
                videos[video_id]['music_song_id'] = ''
                videos[video_id]['music_platforms'] = np.nan
        else:
            videos[video_id]['music_meta_song_id'] = ''
            videos[video_id]['music_platform'] = np.nan
            videos[video_id]['music_song_id'] = ''
            videos[video_id]['music_platforms'] = np.nan
    else:
        videos[video_id]['music_album'] = ''
        videos[video_id]['music_musicAuthorName'] = ''
        videos[video_id]['music_title'] = ''
        videos[video_id]['music_meta_song_id'] = ''
        videos[video_id]['music_platform'] = np.nan
        videos[video_id]['music_song_id'] = ''
        videos[video_id]['music_platforms'] = np.nan
        
    videos[video_id]['suggest_words'] = ''
    if 'videoSuggestWordsList' in video.keys():
        for words in video['videoSuggestWordsList']['video_suggest_words_struct']:
            for word in words['words']:
                videos[video_id]['suggest_words'] = videos[video_id]['suggest_words'] + word['word'] + '+++'
    
    videos[video_id]['stickers'] = ''
    videos[video_id]['stickers_types'] = ''
    if 'stickersOnItem' in video.keys():
        for sticker in video['stickersOnItem']:
            full_sticker_text = ''
            for sticker_text in sticker['stickerText']:
                full_sticker_text = full_sticker_text + sticker_text + '---'
            videos[video_id]['stickers'] = videos[video_id]['stickers'] + full_sticker_text + '+++'
            videos[video_id]['stickers_types'] = videos[video_id]['stickers_types'] + str(sticker['stickerType']) + '+++'
            
t2 = time.perf_counter()
print('Worktime: {:.2f} seconds'.format(t2-t1))

Worktime: 23.52 seconds


In [88]:
final_data = pd.DataFrame.from_dict(videos, orient='index')
final_data['parsingTime'] = pd.to_datetime(final_data['parsingTime'].str.replace('_', ''), format='%Y%m%d%H%M')
final_data.index.name = 'video_id'
final_data = final_data.reset_index()

In [89]:
final_data.shape

(5084, 65)

In [90]:
import sqlite3
import pandas as pd
import numpy as np

connection = sqlite3.connect('../videos_database.db')
cursor = connection.cursor()

cursor.execute('drop table videos_metadata_full')
connection.commit()

In [91]:
final_data.to_sql('videos_metadata_full', connection, if_exists='append', index=False)

5084