In [1]:
from apify_client import ApifyClient
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import random
import os
import logging
from dotenv import load_dotenv, find_dotenv # For loading .env file

In [2]:
# Logging setup: append timestamped records to app.log.
# The file handler is built explicitly so its encoding can be pinned to UTF-8;
# the messages logged in this notebook contain emoji and Vietnamese text, which
# a platform-default (non-UTF-8) encoding may be unable to write.
logger = logging.getLogger()
# basicConfig() does nothing when the root logger already has handlers (e.g. the
# cell is re-run); checking first avoids opening an extra, unused file handle.
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.FileHandler('app.log', mode='a', encoding='utf-8')],
    )

In [None]:
# Load secrets from secret.env. A falsy result is only logged; the keys below are
# then read from whatever is already present in the process environment.
if load_dotenv(find_dotenv('secret.env')):
    logger.info("✅ Loaded environment variables from secret.env")
else:
    logger.warning("⚠️ Could not find secret.env, using current environment variables")

# Collect the Apify API keys stored as API_APIFY_1 .. API_APIFY_16.
# os.getenv() returns None for unset variables; those (and blank values) are
# dropped here so downstream code never has to slice or authenticate with None.
API_KEY_COUNT = 16
_candidate_keys = (os.getenv(f"API_APIFY_{i}") for i in range(1, API_KEY_COUNT + 1))
API_KEYS = [key for key in _candidate_keys if key and key.strip()]

if API_KEYS:
    logger.info(f"Loaded {len(API_KEYS)} of {API_KEY_COUNT} Apify API keys")
else:
    logger.warning("⚠️ No Apify API keys found (API_APIFY_1..API_APIFY_16 are all unset)")

In [21]:
# Build the list of usernames to crawl from the KOL spreadsheet
list_kol = pd.read_excel('List_KOL_final.xlsx', sheet_name='Sheet1')

# Keep only the rows whose 'error' column is filled in (not null)
has_error = list_kol['error'].notna()
kol_with_errors = list_kol.loc[has_error]

kol_usernames = kol_with_errors["name"].to_list()

In [22]:
# Inspect the loaded sheet: columns, non-null counts and dtypes
list_kol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         86 non-null     object
 1   link         86 non-null     object
 2   error        2 non-null      object
 3   deleted_kol  4 non-null      object
dtypes: object(4)
memory usage: 2.8+ KB


In [23]:
# Display the usernames that the crawl loop will iterate over
kol_usernames

['huynhanhthu_cosmetic', 'phuonglytran']

In [None]:
# Default actor input shared by every run; the crawl loop adds "profiles" per user
base_input = dict(
    # Which posts to fetch and in what order
    excludePinnedPosts=True,
    maxProfilesPerQuery=1,
    oldestPostDateUnified="2024-01-01T00:00:00.000Z",
    profileScrapeSections=["videos"],
    profileSorting="latest",
    resultsPerPage=700,
    # Profile-level extras
    scrapeFollowers=False,
    scrapeFollowing=False,
    scrapeUserStats=True,
    # All media downloads are switched off
    shouldDownloadAvatars=False,
    shouldDownloadCovers=False,
    shouldDownloadMusicCovers=False,
    shouldDownloadSlideshowImages=False,
    shouldDownloadSubtitles=False,
    shouldDownloadVideos=False,
)

# Run the actor, falling back through several API keys
def call_with_fallback(api_keys, run_input):
    """Run the TikTok profile scraper actor, trying each API key until one works.

    Args:
        api_keys: Iterable of Apify API tokens, tried in order. Entries that are
            None or empty (e.g. from an unset environment variable) are skipped.
        run_input: Dict passed to the actor as its run input.

    Returns:
        A tuple ``(dataset_items, key)``: the list of items read from the run's
        default dataset, and the API key that succeeded.

    Raises:
        Exception: If no key produced a result. The error raised by the last
            attempted key (if any) is attached as ``__cause__``.
    """
    last_error = None
    for key in api_keys:
        if not key:
            # Slicing None for the log suffix would raise and abort the whole
            # fallback, so missing keys are skipped up front.
            logger.warning("⚠️ Skipping missing/empty API key")
            continue

        key_suffix = key[-4:]  # only the tail of the key is ever logged
        try:
            client = ApifyClient(key)
            run = client.actor("clockworks/tiktok-profile-scraper").call(run_input=run_input)
            dataset_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
        except Exception as e:
            # Any failure with this key (auth, quota, network, ...) moves on to the next one
            last_error = e
            logger.error(f"❌ API key ending in {key_suffix} failed: {e}")
            continue

        logger.info(f"✅ Success with API key ending in {key_suffix}")
        return dataset_items, key  # data plus the key that worked

    raise Exception("❌ All API keys failed") from last_error

# Crawl each KOL in turn, with a progress bar
failed_usernames = []  # KOLs for which the crawl raised, kept so they can be retried
for username in tqdm(kol_usernames, desc="Crawling TikTok KOLs", unit="user"):
    logger.info(f"🚀 Bắt đầu crawl: {username}")
    # Build a fresh input per user so base_input itself is never modified
    run_input = {**base_input, "profiles": [username]}

    try:
        dataset_items, success_key = call_with_fallback(API_KEYS, run_input)

        if not dataset_items:
            # The file is still written, but an empty result deserves a second look
            logger.warning(f"⚠️ No items returned for {username}")

        # Tag every scraped item with the KOL it belongs to
        for item in dataset_items:
            item["kol_username"] = username

        output_file = f"{username}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(dataset_items, f, ensure_ascii=False, indent=2)

        logger.info(f"✅ Crawl thành công: {username} | {len(dataset_items)} item(s) | Key: ...{success_key[-4:]}")
        logger.info(f"📁 Đã lưu file: {output_file}")

    except Exception as final_error:
        # Best effort: record the failure (with traceback) and move on to the next KOL
        logger.exception(f"❌ Lỗi hoàn toàn với {username}: {final_error}")
        failed_usernames.append(username)

print("🏁 Hoàn tất crawling tất cả KOL.")
if failed_usernames:
    # Surface failures in the notebook output instead of leaving them only in app.log
    print(f"⚠️ {len(failed_usernames)} KOL(s) failed, see app.log: {failed_usernames}")

Crawling TikTok KOLs: 100%|██████████| 2/2 [01:39<00:00, 49.94s/user]

🏁 Hoàn tất crawling tất cả KOL.





In [None]:
# Manual progress bookmark; the message reads "Scraped up to hoangminhngoc21, person no. 21"
print('Đã cào đến hoangminhngoc21, người số 21')

Đã cào đến hoangminhngoc21, người số 21
