In [1]:
# HTTP client and .env loader, followed by stdlib helpers and pandas.
import requests
import dotenv
import os, time
import pandas as pd
import glob, json

# Runs before every os.getenv lookup in this notebook; presumably this is what
# supplies SCRATCH, OAK, CLIENT_KEY and CLIENT_SECRET from a .env file — confirm.
dotenv.load_dotenv()

True

In [2]:
# Storage roots are read from the environment (both may be None if unset).
scratch = os.environ.get('SCRATCH')
oak = os.environ.get('OAK')

# Locations under the SCRATCH root.
hashtag_folder = "{}/tiktok/hashtag_data".format(scratch)
comments_folder2 = "{}/samori/tiktok/comments".format(scratch)

# Locations under the OAK root.
video_folder = "{}/samori/tiktok/videos".format(oak)
comments_folder = "{}/samori/tiktok/comments".format(oak)

# Displayed as the cell output: does the OAK comments folder exist?
os.path.exists(comments_folder)

True

In [23]:
def get_access_token(client_key, client_secret):
    """Request an access token using the client-credentials grant.

    Posts ``client_key``, ``client_secret`` and
    ``grant_type=client_credentials`` as form data to the TikTok OAuth token
    endpoint.

    Args:
        client_key: Value sent as the ``client_key`` form field.
        client_secret: Value sent as the ``client_secret`` form field.

    Returns:
        The decoded JSON body when the endpoint answers with HTTP 200.
        ``None`` for any other status code; the error payload is printed.
    """
    endpoint_url = "https://open.tiktokapis.com/v2/oauth/token/"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    data = {
        'client_key': client_key,
        'client_secret': client_secret,
        'grant_type': 'client_credentials'
    }

    response = requests.post(endpoint_url, headers=headers, data=data)

    if response.status_code == 200:
        return response.json()

    # Error bodies are not guaranteed to be JSON (e.g. a gateway HTML page),
    # so fall back to the raw text instead of raising while reporting.
    try:
        error_payload = response.json()
    except ValueError:
        error_payload = response.text
    print("Error:", error_payload)
    return None

In [24]:
def query_comments_api(query_body, query_params, headers, save_folder):
    """Fetch one page of comments for a video and save it as a CSV file.

    Args:
        query_body: JSON body of the request; must contain ``"video_id"`` and
            ``"cursor"``.
        query_params: Query-string parameters passed to the request.
        headers: HTTP headers passed to the request.
        save_folder: Directory that receives
            ``comments_{video_id}_{cursor}.csv`` (created if missing). Nothing
            is written when the page holds no records.

    Returns:
        A 5-tuple ``(video_id, has_more, cursor, samples, status_code)``.
        On success ``samples`` is the number of records on the page and
        ``cursor`` is the cursor reported by the API. On failure (non-200
        status or an undecodable body) ``has_more`` is ``False``, ``cursor``
        is the cursor that was sent, and ``samples`` is the decoded error body
        or the string ``"Error reading json"``.
    """
    endpoint_name = "comments"
    endpoint = "https://open.tiktokapis.com/v2/research/video/comment/list/"

    video_id = query_body["video_id"]
    prev_cursor = query_body["cursor"]

    # make post request
    response = requests.post(endpoint, json=query_body, params=query_params, headers=headers)
    status_code = response.status_code

    if status_code != 200:
        # ValueError is the common base of json.JSONDecodeError and the
        # decode error raised by requests, whichever JSON backend it uses.
        try:
            response_json = response.json()
        except ValueError:
            response_json = "Error reading json"

        return video_id, False, prev_cursor, response_json, status_code

    try:
        payload = response.json()
    except ValueError:
        return video_id, False, prev_cursor, "Error reading json", status_code

    # extracting information for pagination; tolerate a missing/null "data"
    # object or missing pagination keys by ending pagination at this page
    data = payload.get("data") if isinstance(payload, dict) else None
    data = data or {}

    has_more = data.get("has_more", False)
    cursor = data.get("cursor", prev_cursor)

    # saving queried data
    records = data.get(endpoint_name) or []
    df = pd.DataFrame(records)
    if len(df) != 0:
        os.makedirs(save_folder, exist_ok=True)
        df.to_csv(f"{save_folder}/{endpoint_name}_{video_id}_{cursor}.csv", index=False)

    return video_id, has_more, cursor, len(df), status_code

In [56]:
def query_comments_api_paginate(
    fields,
    cursor,
    video_id,
    save_folder,
    log_name,
    credentials,
    log_dir="./logs_comments_cron_download",
    sleep_seconds=5,
):
    """Download every comment page of one video, logging each request.

    Calls ``query_comments_api`` repeatedly, feeding the returned cursor into
    the next request, until it reports ``has_more`` as false. After every
    request one tab-separated line
    ``cursor, has_more, samples, video_id, status_code`` is appended to
    ``{log_dir}/{log_name}``.

    Args:
        fields: Comma-separated field list sent as the ``fields`` query param.
        cursor: Cursor to start from.
        video_id: Id of the video whose comments are requested.
        save_folder: Directory passed to ``query_comments_api`` for CSV output.
        log_name: File name of the log inside ``log_dir``.
        credentials: Mapping with ``"access_token"`` and ``"token_type"``.
        log_dir: Directory holding the log file; created if it does not exist.
        sleep_seconds: Pause after every request, including the last one.

    Returns:
        The status code of the last request made.
    """
    access_token = credentials["access_token"]
    token_type = credentials["token_type"]

    query_params = {"fields": fields}
    query_body = {
        "video_id": video_id,
        "max_count": 100, "cursor": cursor
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"{token_type} {access_token}"
    }

    # Opening the log in append mode does not create missing directories.
    os.makedirs(log_dir, exist_ok=True)
    log_path = f"{log_dir}/{log_name}"

    has_more = True
    status_code = None

    # pagination loop
    while has_more:
        query_body.update({"cursor": cursor})

        video_id, has_more, cursor, samples, status_code = query_comments_api(
            query_body, query_params, headers, save_folder
        )

        with open(log_path, "a") as f:
            f.write(f"{cursor}\t{has_more}\t{samples}\t{video_id}\t{status_code}\n")

        # Kept after the final page too, so consecutive videos stay spaced out.
        if sleep_seconds:
            time.sleep(sleep_seconds)

    return status_code

In [55]:
def get_comments(video_ids, fields, cursor, save_folder, log_dir="./logs_comments_cron_download", max_videos=10):
    """Download comments for the next batch of videos not yet present in the logs.

    Every ``*.txt`` file in ``log_dir`` is read as a tab-separated log with the
    columns ``cursor, has_more, samples, video_id, status_code``; any video id
    appearing there is skipped. Remaining videos are passed, in order, to
    ``query_comments_api_paginate``, which logs to a new file
    ``comments_cron_download_{n+1}.txt`` in the same ``log_dir``.

    The run stops once ``max_videos`` videos have been attempted, or at the
    first video following a 429 status. After a 401 status a fresh access
    token is requested before the next video.

    Args:
        video_ids: Iterable of video ids to consider.
        fields: Comma-separated comment fields to request.
        cursor: Cursor every video starts from.
        save_folder: Directory that receives the downloaded CSV files.
        log_dir: Directory whose logs are read and where the new log is written.
        max_videos: Maximum number of videos attempted per call; ``None``
            removes the limit.

    Raises:
        RuntimeError: If no access token could be obtained when one is needed.
    """
    client_key = os.getenv("CLIENT_KEY")
    client_secret = os.getenv("CLIENT_SECRET")

    credentials = get_access_token(client_key, client_secret)

    logs = glob.glob(f'{log_dir}/*.txt')
    log_name = f"comments_cron_download_{len(logs)+1}.txt"

    col_names = ["cursor", "has_more", "samples", "video_id", "status_code"]
    # A set gives constant-time membership checks in the loop below.
    downloaded_vid_ids = set()
    for log in logs:
        # Read ids as text so they are never coerced to float/scientific
        # notation when a row is incomplete.
        log_data = pd.read_csv(log, sep='\t', names=col_names, header=None, dtype={"video_id": str})
        downloaded_vid_ids.update(log_data["video_id"].dropna())

    # NOTE(review): every logged id counts as downloaded regardless of its
    # status code, so a video that ended on 401/429 is not retried by later
    # runs — confirm this is intended.
    status_code = 200
    attempted = 0
    for video_id in video_ids:
        if max_videos is not None and attempted >= max_videos:
            break

        if str(video_id) in downloaded_vid_ids:
            continue

        # Rate limited on the previous video: stop this run.
        if status_code == 429:
            break

        # Unauthorized on the previous video: refresh the token first.
        if status_code == 401:
            credentials = get_access_token(client_key, client_secret)

        if not credentials:
            raise RuntimeError("Could not obtain a TikTok access token")

        # Forward log_dir so logs are written where they were just read from.
        status_code = query_comments_api_paginate(
            fields, cursor, video_id, save_folder, log_name, credentials, log_dir=log_dir
        )
        attempted += 1

In [3]:
# Rebuild the OAK-backed folder paths used by this cell.
oak = os.environ.get('OAK')
video_folder = "{}/samori/tiktok/videos".format(oak)
comments_folder = "{}/samori/tiktok/comments".format(oak)

# Ids of all videos in the combined monthly table.
video_data = pd.read_csv("{}/combined/all_months.csv".format(video_folder))
video_ids = list(video_data['id'])

# Arguments for get_comments: requested comment fields, starting cursor and
# the folder that receives the downloaded CSV files.
fields = ",".join([
    "id", "video_id", "text", "like_count",
    "reply_count", "parent_comment_id", "create_time",
])
cursor = 0
save_folder = "{}/downloads".format(comments_folder)

# get_comments(video_ids, fields, cursor, save_folder)

In [20]:
# Load one download log and show the type pandas infers for its video ids.
# NOTE(review): this reads "comment_cron_download_1.txt" while get_comments
# writes "comments_cron_download_{n}.txt" — confirm the file name.
df = pd.read_csv(
    "./logs_comments_cron_download/comment_cron_download_1.txt",
    sep='\t',
    names=["cursor", "has_more","samples","video_id","status_code"],
    header=None,
)
type(df['video_id'].loc[0])

numpy.int64

In [5]:
# Number of distinct ids in the combined video table (displayed as cell output).
video_data['id'].nunique()

52047

In [11]:
log_files = glob.glob("./logs_download_comments_slurm/*.txt")

# Read each tab-separated log once and concatenate in a single call; growing a
# DataFrame with pd.concat inside a loop re-copies all earlier rows every time.
log_frames = [
    pd.read_csv(file, sep='\t', names=["cursor", "has_more","samples","video_id","status_code"], header=None)
    for file in log_files
]

# pd.concat raises on an empty list, so keep the empty-frame result when no
# log files are found. Row labels of the individual logs are preserved.
all_logs = pd.concat(log_frames, axis=0) if log_frames else pd.DataFrame()

all_logs

Unnamed: 0,cursor,has_more,samples,video_id,status_code
0,2,False,2,7397530974424239391,200
1,2,False,2,7397527779362884907,200
2,0,False,0,7397527531559144746,200
3,1,False,1,7397524502109768990,200
4,0,False,"{'error': {'code': 'invalid_params', 'message'...",7397523042047380779,400
...,...,...,...,...,...
997,501,False,37,7202616852357352747,200
998,8,False,8,7202615002048498990,200
999,4,False,4,7202613846291402027,200
1000,0,False,0,7202610878707928366,200


In [12]:
# Number of distinct video ids that appear in the concatenated logs.
all_logs['video_id'].nunique()

21289