In [7]:
import requests
import pandas as pd
import time
import re
from tqdm import tqdm
import json
import glob
import os

print("‚úÖ Setup Complete. Libraries for API requests are installed.")

‚úÖ Setup Complete. Libraries for API requests are installed.


In [11]:
from google.colab import drive
import os

print("Mounting Google Drive...")
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted successfully.")

OUTPUT_FOLDER = '/content/drive/MyDrive/TikTok_Scraping_Output'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
print(f"All output files will be saved to: {OUTPUT_FOLDER}")

Mounting Google Drive...
Mounted at /content/drive
‚úÖ Google Drive mounted successfully.
All output files will be saved to: /content/drive/MyDrive/TikTok_Scraping_Output


In [13]:
# =============================================================================
# Cell 3: Modular Functions (Updated for Drive)
# =============================================================================

def get_processed_ids(output_folder):
    """Scans the specified folder in Google Drive for existing batch files."""
    processed_ids = set()
    batch_files = glob.glob(os.path.join(output_folder, "comments_batch_*.csv"))
    if not batch_files:
        return processed_ids, 0

    latest_batch_num = 0
    print(f"Found {len(batch_files)} existing batch files in Drive. Loading to resume...")
    for f in batch_files:
        try:
            batch_num = int(re.search(r'comments_batch_(\d+).csv', f).group(1))
            if batch_num > latest_batch_num:
                latest_batch_num = batch_num

            df = pd.read_csv(f)
            processed_ids.update(df['video_id'].astype(str).unique())
        except Exception as e:
            print(f"Could not read or parse {f}: {e}")

    print(f"Loaded {len(processed_ids)} already processed video IDs.")
    return processed_ids, latest_batch_num

def fetch_comments_page(video_id, cursor=0, count=50):
    # This function remains the same
    API_ENDPOINT = "https://www.tiktok.com/api/comment/list/"
    params = {"aid": "1988", "aweme_id": video_id, "count": count, "cursor": cursor}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        response = requests.get(API_ENDPOINT, params=params, headers=headers)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException:
        return None

def extract_data_from_response(response_json, video_id):
    # This function remains the same
    comments_data = []
    if not response_json or "comments" not in response_json or response_json["comments"] is None:
        return comments_data, 0, False
    for comment in response_json["comments"]:
        comments_data.append({
            'comment_id': comment.get('cid', 'N/A'),
            'video_id': video_id,
            'username': comment.get('user', {}).get('nickname', 'N/A'),
            'comment_text': comment.get('text', ''),
            'digg_count': comment.get('digg_count', 0),
            'create_time': comment.get('create_time', 0)
        })
    new_cursor = response_json.get("cursor", 0)
    has_more = response_json.get("has_more", 0) == 1
    return comments_data, new_cursor, has_more

def save_batch_to_csv(data, batch_num, output_folder):
    """Saves a batch of data to the specified folder in Google Drive."""
    if not data:
        print("Batch is empty, nothing to save.")
        return

    filename = f"comments_batch_{batch_num}.csv"
    full_path = os.path.join(output_folder, filename)
    df = pd.DataFrame(data)
    df.to_csv(full_path, index=False, encoding='utf-8-sig')
    print(f"‚úÖüíæ Batch {batch_num} saved successfully with {len(data)} comments to '{full_path}'.")

In [14]:
# =============================================================================
# Cell 4: Main Execution with Keep-Alive Loop
# =============================================================================

# --- Configuration ---
INPUT_FILE = "link1.txt"
BATCH_SIZE = 20
MAX_COMMENTS_PER_VIDEO = 100
DELAY_BETWEEN_REQUESTS = 0.5

# --- Main Execution Logic ---
# The OUTPUT_FOLDER is defined in Cell 1
all_comments_for_batch = []

processed_ids, last_batch_num = get_processed_ids(OUTPUT_FOLDER)
start_batch_num = last_batch_num + 1

try:
    with open(INPUT_FILE, 'r') as f:
        all_video_ids = {line.strip() for line in f if line.strip()}
    videos_to_process = sorted(list(all_video_ids - processed_ids))
    if not videos_to_process:
        print("‚úÖ All video IDs from the input file have already been processed.")
    else:
        print(f"Found {len(all_video_ids)} total IDs. {len(processed_ids)} already processed.")
        print(f"Starting to scrape {len(videos_to_process)} new video IDs.")
except FileNotFoundError:
    print(f"‚ùå ERROR: Input file not found at '{INPUT_FILE}'. Please upload it.")
    videos_to_process = []

if videos_to_process:
    # ... (Main scraping loop is the same)
    batch_video_count = 0
    current_batch_num = start_batch_num
    for video_id in tqdm(videos_to_process, desc="Processing videos"):
        comments_for_this_video = []
        current_cursor = 0
        has_more_comments = True
        while len(comments_for_this_video) < MAX_COMMENTS_PER_VIDEO and has_more_comments:
            remaining_needed = MAX_COMMENTS_PER_VIDEO - len(comments_for_this_video)
            count_to_fetch = min(50, remaining_needed)
            response_data = fetch_comments_page(video_id, cursor=current_cursor, count=count_to_fetch)
            if response_data:
                comments_page, new_cursor, has_more_comments = extract_data_from_response(response_data, video_id)
                if not comments_page: break
                comments_for_this_video.extend(comments_page)
                current_cursor = new_cursor
                time.sleep(DELAY_BETWEEN_REQUESTS)
            else:
                print(f"\nSkipping video {video_id} due to API request failure.")
                break
        all_comments_for_batch.extend(comments_for_this_video)
        batch_video_count += 1
        if batch_video_count >= BATCH_SIZE:
            save_batch_to_csv(all_comments_for_batch, current_batch_num, OUTPUT_FOLDER)
            all_comments_for_batch = []
            batch_video_count = 0
            current_batch_num += 1
    if all_comments_for_batch:
        print("\nSaving final batch of leftover videos...")
        save_batch_to_csv(all_comments_for_batch, current_batch_num, OUTPUT_FOLDER)
    print(f"\n‚úÖ‚úÖ‚úÖ All Scraping finished!")



Found 2786 total IDs. 0 already processed.
Starting to scrape 2786 new video IDs.


Processing videos:   1%|          | 20/2786 [00:38<1:20:27,  1.75s/it]

‚úÖüíæ Batch 1 saved successfully with 1577 comments to '/content/drive/MyDrive/TikTok_Scraping_Output/comments_batch_1.csv'.


Processing videos:   1%|‚ñè         | 40/2786 [01:17<1:16:12,  1.67s/it]

‚úÖüíæ Batch 2 saved successfully with 1843 comments to '/content/drive/MyDrive/TikTok_Scraping_Output/comments_batch_2.csv'.


Processing videos:   2%|‚ñè         | 53/2786 [01:43<1:29:06,  1.96s/it]


KeyboardInterrupt: 