In [1]:
# =============================================================================
# Cell 1: Mount Google Drive & Define Paths
# =============================================================================
import glob
import os
import re
import unicodedata

import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Register tqdm for pandas .progress_apply()
tqdm.pandas()

print("Mounting Google Drive...")
drive.mount('/content/drive')
print("✅ Google Drive mounted successfully.")

# --- Configuration ---
# Define the folder where your raw batch files are stored
SOURCE_FOLDER = '/content/drive/MyDrive/TikTok_Scraping_Output'

# Define the folder where the clean text files will be saved
DESTINATION_FOLDER = '/content/drive/MyDrive/Preprocessed_Comments'

# Create the destination folder if it doesn't already exist
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
print(f"Source folder:      {SOURCE_FOLDER}")
print(f"Destination folder: {DESTINATION_FOLDER}")

Mounting Google Drive...
Mounted at /content/drive
✅ Google Drive mounted successfully.
Source folder:      /content/drive/MyDrive/TikTok_Scraping_Output
Destination folder: /content/drive/MyDrive/Preprocessed_Comments


In [8]:
# =============================================================================
# Cell 2: The Preprocessing Function
# =============================================================================

def preprocess_text(text):
    """
    Clean a single raw comment string.

    Steps, in order:
      1. Normalize to Unicode NFC so letters written as "base letter +
         combining marks" are composed into single code points.
      2. Remove tagging (@username).
      3. Convert to lowercase.
      4. Drop every character outside the allowed set (this is what removes
         emojis and non-Vietnamese scripts). Allowed set: a-z, Vietnamese
         accented letters, digits, whitespace, and the keyboard
         punctuation/symbols listed in ``allowed_chars``.
      5. Collapse runs of whitespace into one space and strip both ends.

    Args:
        text: The raw comment. Any non-``str`` value (e.g. ``None`` or a
            float NaN from pandas) is treated as an empty comment.

    Returns:
        str: The cleaned comment, or ``""`` if nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    # 1. Compose decomposed characters first. The whitelist below only holds
    #    precomposed letters, so without this step text that arrives in NFD
    #    form would silently lose its diacritics ("tiếng" -> "tieng"), and the
    #    mention pattern would stop at the first combining mark.
    text = unicodedata.normalize('NFC', text)

    # 2. Remove tagging (@username)
    text = re.sub(r'@[\w\.]+', '', text)

    # 3. Convert to lowercase (the whitelist only contains lowercase letters)
    text = text.lower()

    # 4. Remove any character that is NOT:
    #    - a-z (basic Latin alphabet)
    #    - Vietnamese accented characters
    #    - 0-9 (numbers)
    #    - Whitespace
    #    - Standard keyboard punctuation and symbols
    # A single negated character class handles emoji removal and
    # non-Vietnamese character removal in one pass.
    allowed_chars = r'a-zàáãạảăằắẵặẳâầấẫậẩđèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹ0-9\s!@#$%^&*()_+\-={}\[\]~`,./<>?:;\"\''
    text = re.sub(f'[^{allowed_chars}]', '', text)

    # 5. Removing characters can leave gaps; squeeze them and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [9]:
# =============================================================================
# Cell 3: Main Execution - Load, Process, and Save as CSV
# =============================================================================

# Pipeline: load every raw batch CSV, de-duplicate by comment_id, clean the
# text with preprocess_text, drop empty and repeated comments, then write the
# result back out in fixed-size CSV batches.
# Reads columns 'comment_id' and 'comment_text'; adds column 'clean_text'.

# --- 1. Load all raw data ---
# glob returns matches in arbitrary order. Sort them so the keep-first
# de-duplication below retains the same rows on every run. (The sort is
# lexicographic, so "batch_10" comes before "batch_2"; it only needs to be
# stable, not numeric.)
all_batch_files = sorted(glob.glob(os.path.join(SOURCE_FOLDER, "comments_batch_*.csv")))

if not all_batch_files:
    print("❌ No batch files found in the source folder. Please check the path.")
else:
    print(f"Found {len(all_batch_files)} batch files. Loading and concatenating...")
    # Read comment_id as text rather than letting pandas infer a numeric type.
    # If any file has a missing id the inferred dtype becomes float64, which
    # cannot represent long integer ids exactly and would make distinct ids
    # compare equal. (Assumes ids are long digit strings - TODO confirm.)
    df_list = [pd.read_csv(f, dtype={'comment_id': str}) for f in all_batch_files]
    df = pd.concat(df_list, ignore_index=True)

    # Fail early with a readable message instead of a bare KeyError later on.
    missing_columns = [col for col in ('comment_id', 'comment_text') if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Batch files are missing required column(s): {missing_columns}")

    initial_count = len(df)
    print(f"✅ Loaded a total of {initial_count} comments.")

    # --- 2. Remove duplicates based on comment_id ---
    # This is a fast way to get rid of identical rows scraped from overlapping runs.
    df = df.drop_duplicates(subset=['comment_id'])
    count_after_id_dedupe = len(df)
    print(f"Removed {initial_count - count_after_id_dedupe} duplicate rows based on comment_id.")

    # --- 3. Preprocess the comments ---
    print("\nStarting preprocessing...")
    # assign() builds a new frame, so the column is never written into a slice.
    df = df.assign(clean_text=df['comment_text'].progress_apply(preprocess_text))
    print("✅ Preprocessing complete.")

    # --- 4. Remove rows where the clean_text is now empty ---
    has_text = df['clean_text'].notna() & (df['clean_text'] != '')
    df = df[has_text]
    count_after_empty_removal = len(df)
    print(f"Removed {count_after_id_dedupe - count_after_empty_removal} comments that became empty after cleaning.")

    # --- 5. Remove duplicates based on the clean comment content ---
    # This keeps the first occurrence of a comment if multiple people posted the exact same thing.
    df = df.drop_duplicates(subset=['clean_text'], keep='first')
    final_count = len(df)
    print(f"Removed {count_after_empty_removal - final_count} duplicate comments based on clean_text content.")
    print(f"✅ Final dataset contains {final_count} unique, non-empty comments.")

    # --- 6. Save the final DataFrame in batches ---
    print("\nSaving clean data to new CSV files...")
    BATCH_SIZE = 1000
    num_files = -(-final_count // BATCH_SIZE)  # ceiling division
    written_names = set()

    for i, start_index in enumerate(range(0, final_count, BATCH_SIZE)):
        end_index = start_index + BATCH_SIZE
        batch_df = df.iloc[start_index:end_index]

        output_filename = os.path.join(DESTINATION_FOLDER, f"clean_comments_batch_{i+1:04d}.csv")
        # utf-8-sig writes a BOM at the start of each file.
        batch_df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        written_names.add(os.path.basename(output_filename))

    print(f"\n✅✅✅ All done! Saved {num_files} CSV files to '{DESTINATION_FOLDER}'.")

    # A previous run that produced more batches leaves its extra files behind.
    # They are not deleted here, only reported, so nothing is lost by surprise.
    existing_names = {
        os.path.basename(path)
        for path in glob.glob(os.path.join(DESTINATION_FOLDER, "clean_comments_batch_*.csv"))
    }
    stale_names = sorted(existing_names - written_names)
    if stale_names:
        print(f"⚠️ {len(stale_names)} older batch file(s) from a previous run are still in the destination folder: {stale_names}")

Found 2 batch files. Loading and concatenating...
✅ Loaded a total of 3420 comments.
Removed 81 duplicate rows based on comment_id.

Starting preprocessing...


100%|██████████| 3339/3339 [00:00<00:00, 140383.33it/s]

✅ Preprocessing complete.
Removed 983 comments that became empty after cleaning.
Removed 206 duplicate comments based on clean_text content.
✅ Final dataset contains 2150 unique, non-empty comments.

Saving clean data to new CSV files...

✅✅✅ All done! Saved 3 CSV files to '/content/drive/MyDrive/Preprocessed_Comments'.



