In [1]:
import requests
import dotenv
import os, time
import pandas as pd

dotenv.load_dotenv()

True

In [2]:
scratch = os.getenv('SCRATCH')
hashtag_folder = f"{scratch}/tiktok/hashtag_data"

oak = os.getenv('OAK')
video_folder = f"{oak}/samori/tiktok/test_folder2"
os.path.exists(video_folder)

True

In [3]:
def get_access_token(client_key, client_secret):
    
    endpoint_url = "https://open.tiktokapis.com/v2/oauth/token/"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    data = {
        'client_key': client_key,
        'client_secret': client_secret,
        'grant_type': 'client_credentials'
    }

    response = requests.post(endpoint_url, headers=headers, data=data)

    if response.status_code == 200:
        response_json = response.json()
        return response_json
        
    else:
        # If the request was not successful, print the error response JSON
        print("Error:", response.json())

In [4]:
def make_request_video(endpoint, query_body, query_params, headers, save_folder):
    start_date = query_body["start_date"]
    end_date = query_body["end_date"]
    prev_cursor = query_body["cursor"]
    
    if prev_cursor > 0:
        prev_search_id = query_body["search_id"]

    # make post request
    response = requests.post(endpoint, json=query_body, params=query_params, headers=headers)

    # checking if request was successful
    if response.status_code != 200:
        i = 0
        # retrying request 10 times if request was unsuccessful
        while i < 10 and response.status_code != 200:
            response = requests.post(endpoint, json=query_body, params=query_params, headers=headers)
            time.sleep(5)
            i += 1
        # increasing cursor to see if there will be some luck
        while response.status_code == 500:
            current_cursor =  query_body['cursor']
            query_body.update({"cursor": current_cursor+10})
            response = requests.post(endpoint, json=query_body, params=query_params, headers=headers)
            time.sleep(5)
            
        if response.status_code != 200:
            status_code = response.status_code
            print("Error code", response.status_code)
            print(response.json())
            return query_body["search_id"], False, query_body["cursor"], str(response.json()), status_code

    status_code = response.status_code
    
    # extracting information for pagination
    data = response.json().get("data", {})
    
    has_more = data["has_more"]
    cursor = data["cursor"]

    if not "search_id" in list(data.keys()) and prev_cursor > 0:
        search_id = prev_search_id
    else:
        search_id = data["search_id"]
        
    # if len(data['videos']) != 0 and : # note: update to be compatible with other endpoints
    #     search_id = data["search_id"]
    # else:
    #     search_id = -1

    # saving queried data
    videos = data.get("videos", []) # note: update to be compatible with other endpoints
    df = pd.DataFrame(videos)
    df.to_csv(f"{save_folder}/videos_{start_date}_{end_date}_{cursor}_{search_id}.csv", index=False)

    return search_id, has_more, cursor, len(df), status_code

In [5]:
# def query_api(endpoint_name=endpoint_name, fields=fields, 
#               query=query, cursor=cursor, start_date=start_date, 
#               end_date=end_date, save_folder=video_folder):
    
def query_api_video(endpoint_name, fields, query, cursor, 
              start_date,end_date, save_folder):
    
    # getting credentials
    client_key = os.getenv("CLIENT_KEY")
    client_secret = os.getenv("CLIENT_SECRET")
    
    credentials = get_access_token(client_key, client_secret)
    access_token = credentials["access_token"]
    token_type = credentials["token_type"]

    # note: update this to have url of other endpoints
    endpoints_dict = {"video": "https://open.tiktokapis.com/v2/research/video/query/"}
    
    query_params = {"fields": fields}
    query_body = {"query":query, "max_count":100, "cursor":cursor,
                  "start_date":str(start_date), "end_date":str(end_date)}
    headers = {"Content-Type": "application/json",
               "Authorization": f"{token_type} {access_token}"}
    
    endpoint = endpoints_dict[endpoint_name]
    has_more = True

    # pagination loop
    while has_more:
        if cursor != 0:
            query_body.update({"search_id": search_id, 
                               "cursor": cursor})
        search_id, has_more, cursor, samples, status_code = make_request_video(endpoint, query_body, query_params, headers, save_folder)

        with open(f"./download_logs_{start_date}_{end_date}_{search_id}", "a") as f:
            f.write(f"{cursor}\t{has_more}\t{samples}\t{search_id}\t{status_code}\n")

        print(f"{cursor}\t{has_more}\t{samples}\t{search_id}\t{status_code}")
        time.sleep(5)
        
    return status_code

In [6]:
formal_terms = ["fentanyl", "opioids", "opiates", "morphine", "codeine", "oxycodone", "oxymorphone", "mscontin", "percocet"]
informal_terms = ["sublimaze", "duragesic", "fentanil", "sufentanil", "fentanylum", "fentora", "thebaine", "codiene", "roxanol", 
                     "kadian", "oxycontin", "roxicodone", "roxicet", "endocet", "endocodone", "oxyir", "oxynorm", "hydrocodone", 
                     "vicodinsublimaze", "duragesic", "fentanil", "sufentanil", "fentanylum", "fentora", "thebaine", "codiene", 
                     "roxanol", "kadian", "oxycontin", "roxicodone", "roxicet", "endocet", "endocodone", "oxyir", "oxynorm", 
                     "hydrocodone", "vicodin"]

algospeak_terms = ["paink!ller", "f3nt@nol", "cod3in3", "c0d0n3", "f3nt4nol", "p@1nk!ller", "oxy80s", "0xy80", "m0rph!n3", "m3rph0n3", 
                   "m0rf3n", "c0d3in", "0xyc", "s!zzurp", "0pana", "0xym0rph0ne", "num0rph@n", "m0rf33n"]

In [7]:
video_folder = f"{oak}/samori/tiktok/test_folder2"
endpoint_name = 'video'
fields = "id,video_description,create_time,region_code,share_count,view_count,like_count,comment_count,music_id,hashtag_names,username,effect_ids,playlist_id,voice_to_text,is_stem_verified,favorites_count,video_duration"
all_opioid_keywords = formal_terms + informal_terms + algospeak_terms
normal_keywords = formal_terms + informal_terms

cursor = 0
# start_date = "20240401"
# end_date = "20240430"


query = {
    "and": [
        {
            "operation":"IN",
            "field_name":"keyword",
            "field_values":normal_keywords
        },
        {
            "operation":"EQ",
            "field_name":"region_code",
            "field_values":["US"]
        }
        
    ]
}

In [8]:
def get_str_date(month, start_day, end_day):
    month_str = '0'+str(month) if month<10 else str(month)
    start_day_str = '0'+str(start_day) if start_day<10 else str(start_day)
    end_day_str = '0'+str(end_day) if end_day<10 else str(end_day)

    return month_str, start_day_str, end_day_str

In [10]:
end_day_dict = {
    1:31, 2:28, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31
}
months = list(range(1,13))

In [11]:
start_end_pairs = {month:[(1,15),(15,end_day_dict[month])] for month in months}
start_end_pairs

{1: [(1, 15), (15, 31)],
 2: [(1, 15), (15, 28)],
 3: [(1, 15), (15, 31)],
 4: [(1, 15), (15, 30)],
 5: [(1, 15), (15, 31)],
 6: [(1, 15), (15, 30)],
 7: [(1, 15), (15, 31)],
 8: [(1, 15), (15, 31)],
 9: [(1, 15), (15, 30)],
 10: [(1, 15), (15, 31)],
 11: [(1, 15), (15, 30)],
 12: [(1, 15), (15, 31)]}

In [12]:
status_code = 200
year=2022

for month in months:
    for start, end in start_end_pairs[month]:
        if status_code != 429:
            month_str, start_day_str, end_day_str = get_str_date(month, start, end)
            
            start_date = f"{year}{month_str}{start_day_str}"
            end_date = f"{year}{month_str}{end_day_str}"
            
            print(f"---- {start_date} to {end_date}----")
            
            status_code = query_api_video(endpoint_name=endpoint_name, fields=fields, 
                  query=query, cursor=cursor, start_date=start_date, 
                  end_date=end_date, save_folder=video_folder)
        print('\n')

---- 20220101 to 20220115----
Error code 400
{'error': {'code': 'invalid_params', 'message': 'Should search from index 0 when no search id is provided', 'log_id': '2024090302433311DC9BEC0D0828AB3CEC'}}


KeyError: 'search_id'

In [49]:
status_code = 200
year=2022

for month in months:
    for start, end in start_end_pairs[month]:
        if status_code != 429:
            month_str, start_day_str, end_day_str = get_str_date(month, start, end)
            
            start_date = f"{year}{month_str}{start_day_str}"
            end_date = f"{year}{month_str}{end_day_str}"
            
            print(f"---- {start_date} to {end_date}----")
            
            status_code = query_api_video(endpoint_name=endpoint_name, fields=fields, 
                  query=query, cursor=cursor, start_date=start_date, 
                  end_date=end_date, save_folder=video_folder)
        print('\n')

---- 20231101 to 20231115----
100	True	64	7410230044255654958	200
200	True	54	7410230044255654958	200
300	True	67	7410230044255654958	200
400	True	59	7410230044255654958	200
500	True	66	7410230044255654958	200
600	True	72	7410230044255654958	200
700	True	75	7410230044255654958	200
800	True	72	7410230044255654958	200
900	True	62	7410230044255654958	200
1000	True	68	7410230044255654958	200
1100	True	59	7410230044255654958	200
1200	True	59	7410230044255654958	200
1300	True	67	7410230044255654958	200
1400	True	70	7410230044255654958	200
1500	True	60	7410230044255654958	200
1600	True	74	7410230044255654958	200
1700	True	80	7410230044255654958	200
1800	True	70	7410230044255654958	200
1900	True	70	7410230044255654958	200
2000	True	61	7410230044255654958	200
2100	True	58	7410230044255654958	200
2200	True	69	7410230044255654958	200
2300	True	53	7410230044255654958	200
2400	True	53	7410230044255654958	200
2500	True	64	7410230044255654958	200
2600	True	65	7410230044255654958	200
2700	True	66	7410

KeyError: 'search_id'

In [34]:
status_code = 200
year=2023

months = list(end_day_dict.keys())

for month in months:
    for start, end in start_end_pairs[month]:
        if status_code != 429:
            month_str, start_day_str, end_day_str = get_str_date(month, start, end)
            
            start_date = f"{year}{month_str}{start_day_str}"
            end_date = f"{year}{month_str}{end_day_str}"
            
            print(f"---- {start_date} to {end_date}----")
            
            status_code = query_api_video(endpoint_name=endpoint_name, fields=fields, 
                  query=query, cursor=cursor, start_date=start_date, 
                  end_date=end_date, save_folder=video_folder)
        print('\n')

---- 20230101 to 20230115----
100	True	72	7410204094582985771	200
200	True	65	7410204094582985771	200
300	True	73	7410204094582985771	200
400	True	73	7410204094582985771	200
500	True	70	7410204094582985771	200
600	True	82	7410204094582985771	200
700	True	69	7410204094582985771	200
800	True	72	7410204094582985771	200
900	True	74	7410204094582985771	200
1000	True	72	7410204094582985771	200
1100	True	63	7410204094582985771	200
1200	True	76	7410204094582985771	200
1300	True	73	7410204094582985771	200
1400	True	71	7410204094582985771	200
1500	True	72	7410204094582985771	200
1600	True	67	7410204094582985771	200
1648	False	41	7410204094582985771	200


---- 20230115 to 20230131----
100	True	65	7410204094583018539	200
200	True	65	7410204094583018539	200
300	True	75	7410204094583018539	200
400	True	72	7410204094583018539	200
500	True	74	7410204094583018539	200
600	True	68	7410204094583018539	200
700	True	58	7410204094583018539	200
800	True	74	7410204094583018539	200
900	True	56	74102040945830185

KeyError: 'search_id'

In [9]:
status_code = 200
year=2024; month=7; start_day=1; end_day=31

while status_code != 429:
    month_str = '0'+str(month) if month<10 else str(month)
    start_day_str = '0'+str(start_day) if start_day<10 else str(start_day)
    end_day_str = '0'+str(end_day) if end_day<10 else str(end_day)

    start_date = f"{year}{month_str}{start_day_str}"
    end_date = f"{year}{month_str}{end_day_str}"
    
    status_code = query_api_video(endpoint_name=endpoint_name, fields=fields, 
              query=query, cursor=cursor, start_date=start_date, 
              end_date=end_date, save_folder=video_folder)
    # start_day += 1
    # end_day += 1
    break
    

100	True	76	7410188393734247467	200


KeyboardInterrupt: 