## Cleaning Data

#### 1. Menfilter Hanya Komentar Berbahasa Inggris

In [1]:
#Buat folder output filter_bahasa_inggris jika belum ada

import os

output_dir = "filter_bahasa_inggris"
os.makedirs(output_dir, exist_ok=True)
print(f"Folder penyimpanan siap: {output_dir}")

Folder penyimpanan siap: filter_bahasa_inggris


In [None]:
# Ekstrak & filter komentar dari MongoDB

import sys

# Tambahkan path parent folder ke sys.path supaya bisa impor collect_data.connection.py nya
sys.path.append(os.path.abspath(".."))

import csv
from langdetect import detect, DetectorFactory, LangDetectException
from tqdm.notebook import tqdm
from collect_data.connection import get_db

# Supaya hasil deteksi bahasa konsisten
DetectorFactory.seed = 0

# Fungsi deteksi bahasa, default ke Inggris jika tidak terdeteksi
def is_english(text):
    try:
        return detect(text) == "en"
    except LangDetectException:
        # Jika tidak bisa deteksi, anggap sebagai bahasa Inggris
        return True
    except:
        return False

# Fungsi ekstraksi komentar dari satu koleksi
def extract_english_comments(collection_name):
    print(f"Mengekstrak dari koleksi: {collection_name}")
    db = get_db("db_data_kotor")
    collection = db[collection_name]

    english_comments = []
    cursor = collection.find()

    # Mengambil Data Dokument pada Collection nya
    for doc in tqdm(cursor, desc=f"Memfilter komentar di {collection_name}"):
        try:
            comment_data = doc["snippet"]["topLevelComment"]["snippet"]
            comment_text = comment_data["textDisplay"]
            if is_english(comment_text):
                english_comments.append({
                    "video_id": doc["snippet"]["videoId"],
                    "author_name": comment_data["authorDisplayName"],
                    "comment_text": comment_text,
                    "published_at": comment_data["publishedAt"],
                    "updated_at": comment_data["updatedAt"]
                })
        except KeyError:
            continue

    return english_comments

# Daftar koleksi 
collection_names = [
    "video_1", "video_2", "video_3", "video_4",
    "video_5", "video_6", "video_7", "video_8"
]

# Proses setiap koleksi dan simpan ke CSV terpisah
for name in collection_names:
    comments = extract_english_comments(name)

    # jika ada komentarnya maka akan menulis file csv sesuai nama collectionnya
    if comments:
        csv_filename = os.path.join(output_dir, f"{name}.csv")
        with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=["video_id", "author_name", "comment_text", "published_at", "updated_at"])
            writer.writeheader()
            writer.writerows(comments)

        print(f"Disimpan: {csv_filename} ({len(comments)} komentar)")
    else:
        print(f"Tidak ada komentar Inggris pada: {name}")


Mengekstrak dari koleksi: video_1


Memfilter komentar di video_1: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_1.csv (4520 komentar)
Mengekstrak dari koleksi: video_2


Memfilter komentar di video_2: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_2.csv (2812 komentar)
Mengekstrak dari koleksi: video_3


Memfilter komentar di video_3: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_3.csv (5134 komentar)
Mengekstrak dari koleksi: video_4


Memfilter komentar di video_4: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_4.csv (3391 komentar)
Mengekstrak dari koleksi: video_5


Memfilter komentar di video_5: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_5.csv (4087 komentar)
Mengekstrak dari koleksi: video_6


Memfilter komentar di video_6: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_6.csv (4089 komentar)
Mengekstrak dari koleksi: video_7


Memfilter komentar di video_7: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_7.csv (3488 komentar)
Mengekstrak dari koleksi: video_8


Memfilter komentar di video_8: 0it [00:00, ?it/s]

Disimpan: filter_bahasa_inggris\video_8.csv (4819 komentar)


In [4]:
import pandas as pd

# Tampilkan 10 komentar pertama dari salah satu file
sample_file = os.path.join(output_dir, f"{collection_names[0]}.csv")
if os.path.exists(sample_file):
    df = pd.read_csv(sample_file)
    print(f"Contoh 10 komentar pertama dari", collection_names[0])
    display(df.head(10))
else:
    print("File contoh tidak ditemukan.")

Contoh 10 komentar pertama dari video_1


Unnamed: 0,video_id,author_name,comment_text,published_at,updated_at
0,fK85SQzm0Z0,@areyoufreaked13,5:54:02 🤣🤣🤣,2025-05-25T16:30:34Z,2025-05-25T16:30:34Z
1,fK85SQzm0Z0,@jolsonomaha,Ishowspeed can I draw a Luffy for you,2025-05-25T14:28:43Z,2025-05-25T14:28:43Z
2,fK85SQzm0Z0,@Hoshinomykisah,2:39:54,2025-05-23T21:15:52Z,2025-05-23T21:15:52Z
3,fK85SQzm0Z0,@astro-zeet4028,he still got that luffy cover,2025-05-21T11:17:19Z,2025-05-21T11:17:19Z
4,fK85SQzm0Z0,@JENNIEKIMSWORLD,Koreans sparked a revolution a new movement a ...,2025-05-19T04:16:16Z,2025-05-19T04:16:16Z
5,fK85SQzm0Z0,@noeldenis404,😅😶‍🌫️,2025-05-17T12:51:55Z,2025-05-17T12:51:55Z
6,fK85SQzm0Z0,@noeldenis404,456,2025-05-17T12:50:51Z,2025-05-17T12:50:51Z
7,fK85SQzm0Z0,@noeldenis404,991,2025-05-17T12:50:36Z,2025-05-17T12:50:36Z
8,fK85SQzm0Z0,@noeldenis404,7-6@134533,2025-05-17T12:48:03Z,2025-05-17T12:48:03Z
9,fK85SQzm0Z0,@richabiswakarma3137,Speed and Jackson wang together I don't want...,2025-05-17T08:36:17Z,2025-05-17T08:36:17Z


#### 2. Membersihkan Komentar dari Emoji, Tanda Baca, Angka,Menghapus Duplikasi pada Komentar, Normalisasi Tanggal, dan Komentar Kosong

In [None]:
import os
import csv
import re
from datetime import datetime
import pandas as pd
from tqdm import tqdm

# Folder input dan output
input_folder = "filter_bahasa_inggris"
output_folder = "cleaned_english_comments"
os.makedirs(output_folder, exist_ok=True)

# Pola emoji custom
CUSTOM_EMOJI_PATTERNS = [
    r'\b(?:smiling|face|eyes|tears|lol|clap|fire|heart|laugh|rolling|cry|joy|wow|omg|cool|grin|blush|wink|sad|angry|love|shock|sleep|zzz|party|thinking|Thicc|BenNo|BenYes|Gyatt|WellWell|Sewey2|Demon|GodisGood|Banana|Box|football|Gay|harold|LLL|pikatchu|sus|monkey|uno|TIMEOUT)[a-z]*\b'
]
# Membuat Regex bedasarkan pola emoji
emoji_regex = re.compile("|".join(CUSTOM_EMOJI_PATTERNS), re.IGNORECASE)

# Pola URL/link
URL_PATTERN = re.compile(r'(https?://\S+|www\.\S+|\S+\.(com|org|net|ly|io|co|me)\b)', re.IGNORECASE)

def clean_comment(text):
    text = text.lower()
    text = emoji_regex.sub('', text) #Hapus Emoji
    text = re.sub(r'\d+', '', text) #Hapus angka
    text = re.sub(URL_PATTERN, '', text)  # Hapus URL
    text = re.sub(r'[^\w\s]', '', text) #Hapus Tanda Baca
    text = re.sub(r'\s+', ' ', text).strip() #Hapus spasi berlebihan
    return text

def normalize_date(date_str):
    try:
        return datetime.fromisoformat(date_str.replace("Z", "")).strftime("%Y-%m-%d")
    except:
        return date_str

# Proses pembersihan
# Proses Looping Untuk Membaca file csv
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        df = pd.read_csv(input_path)

        required_columns = {"video_id", "author_name", "comment_text", "published_at", "updated_at"}
        if not required_columns.issubset(set(df.columns)):
            print(f"Kolom tidak lengkap di file: {filename}")
            continue

        print(f"Memproses: {filename}")
        
        # Proses Bersihkan komentar
        df["comment_text"] = [clean_comment(t) for t in tqdm(df["comment_text"].astype(str), desc="Membersihkan komentar")]

        # Hapus komentar kosong
        df = df[df["comment_text"].str.strip() != ""]

        # Hapus komentar duplikat berdasarkan user, komentar, dan video
        df = df.drop_duplicates(subset=["video_id", "author_name", "comment_text"])

        # Normalisasi tanggal
        df["published_at"] = df["published_at"].astype(str).apply(normalize_date)
        df["updated_at"] = df["updated_at"].astype(str).apply(normalize_date)

        # Simpan hasil
        output_path = os.path.join(output_folder, filename)
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"Disimpan: {output_path} ({len(df)} komentar)")


Memproses: video_1.csv


Membersihkan komentar: 100%|██████████| 4520/4520 [00:00<00:00, 23992.54it/s]


Disimpan: cleaned_english_comments\video_1.csv (4033 komentar)
Memproses: video_2.csv


Membersihkan komentar: 100%|██████████| 2812/2812 [00:00<00:00, 16157.46it/s]


Disimpan: cleaned_english_comments\video_2.csv (2542 komentar)
Memproses: video_3.csv


Membersihkan komentar: 100%|██████████| 5134/5134 [00:00<00:00, 18974.50it/s]


Disimpan: cleaned_english_comments\video_3.csv (4672 komentar)
Memproses: video_4.csv


Membersihkan komentar: 100%|██████████| 3391/3391 [00:00<00:00, 18209.74it/s]


Disimpan: cleaned_english_comments\video_4.csv (3013 komentar)
Memproses: video_5.csv


Membersihkan komentar: 100%|██████████| 4087/4087 [00:00<00:00, 17973.48it/s]


Disimpan: cleaned_english_comments\video_5.csv (3690 komentar)
Memproses: video_6.csv


Membersihkan komentar: 100%|██████████| 4089/4089 [00:00<00:00, 14373.26it/s]


Disimpan: cleaned_english_comments\video_6.csv (3690 komentar)
Memproses: video_7.csv


Membersihkan komentar: 100%|██████████| 3488/3488 [00:00<00:00, 17650.75it/s]


Disimpan: cleaned_english_comments\video_7.csv (3224 komentar)
Memproses: video_8.csv


Membersihkan komentar: 100%|██████████| 4819/4819 [00:00<00:00, 16508.03it/s]


Disimpan: cleaned_english_comments\video_8.csv (4461 komentar)


In [2]:
import pandas as pd
# Folder hasil
output_folder = "cleaned_english_comments"

# Ganti nama file sesuai yang ingin dilihat
filename = "video_1.csv"  # misalnya file hasil bersih dari video_1
file_path = os.path.join(output_folder, filename)

# Baca dan tampilkan 10 komentar
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print(f"Contoh 10 komentar dari: {filename}")
    display(df[["video_id", "author_name", "comment_text", "published_at", "updated_at"]].head(10))
else:
    print("File tidak ditemukan:", filename)


Contoh 10 komentar dari: video_1.csv


Unnamed: 0,video_id,author_name,comment_text,published_at,updated_at
0,fK85SQzm0Z0,@jolsonomaha,ishowspeed can i draw a luffy for you,2025-05-25,2025-05-25
1,fK85SQzm0Z0,@astro-zeet4028,he still got that luffy cover,2025-05-21,2025-05-21
2,fK85SQzm0Z0,@JENNIEKIMSWORLD,koreans sparked a revolution a new movement a ...,2025-05-19,2025-05-19
3,fK85SQzm0Z0,@richabiswakarma3137,speed and jackson wang together i dont want an...,2025-05-17,2025-05-17
4,fK85SQzm0Z0,@in_uruguay,translator is assssh,2025-05-15,2025-05-15
5,fK85SQzm0Z0,@JENNIEKIMSWORLD,koreans sparked a revolution a new movement a ...,2025-05-09,2025-05-09
6,fK85SQzm0Z0,@Sam_A_Sam,this makes me really chinese people,2025-05-05,2025-05-05
7,fK85SQzm0Z0,@Sam_A_Sam,the basketball street guy was the most interes...,2025-05-04,2025-05-04
8,fK85SQzm0Z0,@Lucky123-h5r,china tour the best thing that happened to ish...,2025-05-04,2025-05-04
9,fK85SQzm0Z0,@fallen_angel_CZ,i go to china every year and youre telling me ...,2025-05-04,2025-05-04


#### 3. Menghapus Spam dan Komentar Kosong

In [None]:

# Folder input dan output baru
input_folder = "cleaned_english_comments"
output_folder = "cleaned_no_spam_comments"
os.makedirs(output_folder, exist_ok=True)

# Daftar kata kunci spam
spam_keywords = {
    "subscribe", "visit my channel", "check my channel", "follow me", "watch my video",
    "support me", "free money", "win prize", "get rich", "earn cash", "free gift",
    "click here", "claim now", "link in bio", "see below", "more info", "check link",
    "visit link", "pls like", "spam alert", "bot comment", "auto comment", "buy now",
    "shop here", "deal today", "limited offer", "exclusive content", "giveaway", "promo code",
    "fast cash", "big win", "you won", "click this", "don't miss", "make money", "cash app",
    "bitcoin giveaway", "100% free", "dm me", "message me", "contact me", "act now",
    "urgent offer", "join now", "download now", "hot girls", "xxx", "onlyfans", "telegram group",
    "vip access", "earn bitcoin", "credit card", "loan approval", "investment opportunity",
    "referral code", "deal", "100% legit", "check bio", "cheap price",
    "lowest price", "guaranteed", "earn daily", "instantly rich",
    "get followers", "boost followers", "click the link", "like back", "real", "legit",
    "don't skip", "no scam", "real account", "free followers", "click below", "amazing offer",
    "too good to miss", "sponsored post", "giveaway now", "investment tips", "buy crypto",
    "free promo", "vip group", "early access", "tap the link", "unlock content",
    "sign up now", "get verified", "sfs", "f4f", "l4l"
}

# Membuat Kamus Spam menjadi regex
spam_patterns = re.compile("|".join(re.escape(word) for word in spam_keywords), re.IGNORECASE)

# Fungsi cek spam
def is_spam(text):
    return bool(spam_patterns.search(text))

# Proses Looping untuk membaca setiap file di folder input
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        df = pd.read_csv(input_path)

        # Jika comment textnya 
        if "comment_text" not in df.columns:
            print(f"Kolom 'comment_text' tidak ditemukan di {filename}")
            continue

        print(f"Memproses: {filename}")
        
        # Hapus komentar kosong
        df = df.dropna(subset=["comment_text"])
        df["comment_text"] = df["comment_text"].astype(str).str.strip()
        df = df[df["comment_text"] != ""]

        # Hapus komentar spam
        # hanya komentar yang bukan spam yang akan disimpan ke dalam 
        df = df[~df["comment_text"].apply(is_spam)]

        # Simpan hasil
        output_path = os.path.join(output_folder, filename)
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"Disimpan: {output_path} ({len(df)} komentar tersisa)")


Memproses: video_1.csv
Disimpan: cleaned_no_spam_comments\video_1.csv (3815 komentar tersisa)
Memproses: video_2.csv
Disimpan: cleaned_no_spam_comments\video_2.csv (2340 komentar tersisa)
Memproses: video_3.csv
Disimpan: cleaned_no_spam_comments\video_3.csv (4119 komentar tersisa)
Memproses: video_4.csv
Disimpan: cleaned_no_spam_comments\video_4.csv (2756 komentar tersisa)
Memproses: video_5.csv
Disimpan: cleaned_no_spam_comments\video_5.csv (3364 komentar tersisa)
Memproses: video_6.csv
Disimpan: cleaned_no_spam_comments\video_6.csv (3381 komentar tersisa)
Memproses: video_7.csv
Disimpan: cleaned_no_spam_comments\video_7.csv (2975 komentar tersisa)
Memproses: video_8.csv
Disimpan: cleaned_no_spam_comments\video_8.csv (3990 komentar tersisa)


In [54]:
import pandas as pd

# Folder hasil
output_folder = "cleaned_no_spam_comments"

# Ganti nama file sesuai yang ingin dilihat
filename = "video_1.csv"  # misalnya file hasil bersih dari video_1
file_path = os.path.join(output_folder, filename)

# Baca dan tampilkan 10 komentar
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print(f"Contoh 10 komentar dari: {filename}")
    display(df[["video_id", "author_name", "comment_text", "published_at", "updated_at"]].head(10))
else:
    print("File tidak ditemukan:", filename)


Contoh 10 komentar dari: video_1.csv


Unnamed: 0,video_id,author_name,comment_text,published_at,updated_at
0,fK85SQzm0Z0,@jolsonomaha,ishowspeed can i draw a luffy for you,2025-05-25,2025-05-25
1,fK85SQzm0Z0,@astro-zeet4028,he still got that luffy cover,2025-05-21,2025-05-21
2,fK85SQzm0Z0,@JENNIEKIMSWORLD,koreans sparked a revolution a new movement a ...,2025-05-19,2025-05-19
3,fK85SQzm0Z0,@richabiswakarma3137,speed and jackson wang together i dont want an...,2025-05-17,2025-05-17
4,fK85SQzm0Z0,@in_uruguay,translator is assssh,2025-05-15,2025-05-15
5,fK85SQzm0Z0,@JENNIEKIMSWORLD,koreans sparked a revolution a new movement a ...,2025-05-09,2025-05-09
6,fK85SQzm0Z0,@Sam_A_Sam,the basketball street guy was the most interes...,2025-05-04,2025-05-04
7,fK85SQzm0Z0,@Lucky123-h5r,china tour the best thing that happened to ish...,2025-05-04,2025-05-04
8,fK85SQzm0Z0,@fallen_angel_CZ,i go to china every year and youre telling me ...,2025-05-04,2025-05-04
9,fK85SQzm0Z0,@Nikofutbol,ishow speed,2025-05-04,2025-05-04


#### 4. Menormalisasi Kata Kata Slank

In [55]:
import os
import pandas as pd
import re
from tqdm import tqdm

# Folder input dan output
input_folder = "cleaned_no_spam_comments"
output_folder = "normalized_comments"
os.makedirs(output_folder, exist_ok=True)

# Kamus slang satu huruf
slang_dict = {
    "lol": "laughing",
    "lmao": "laughing",
    "rofl": "laughing",
    "brb": "returning",
    "idk": "unknown",
    "omg": "surprised",
    "smh": "disappointed",
    "btw": "anyway",
    "tbh": "honestly",
    "imo": "opinion",
    "imho": "opinion",
    "fyi": "information",
    "asap": "quickly",
    "bff": "friend",
    "np": "ok",
    "ttyl": "later",
    "ikr": "agreed",
    "afk": "away",
    "gg": "good",
    "idc": "indifferent",
    "thx": "thanks",
    "ty": "thanks",
    "u": "you",
    "r": "are",
    "y": "why",
    "nvm": "nevermind",
    "ootd": "outfit",
    "pov": "perspective",
    "fomo": "anxiety",
    "goat": "best",
    "fr": "real",
    "ngl": "honestly",
    "hmu": "contact",
    "irl": "real",
    "tmi": "excessive",
    "wbu": "you",
    "wyd": "doing",
    "hbd": "birthday",
    "g2g": "go",
    "msg": "message",
    "plz": "please",
    "ppl": "people",
    "sry": "sorry",
    "xoxo": "kisses",
    "yolo": "live",
    "rn": "now",
    "idgaf": "indifferent",
    "istg": "swear",
    "ftw": "win",
    "tbf": "fairly",
    "otp": "pairing",
    "rt": "retweet",
    "dm": "message",
    "pm": "message",
    "vc": "video",
    "wip": "progress",
    "w": "good", 
    "slay": "excellent",
    "simp": "flatterer",
    "ghosting": "abandoning",
    "flex": "boasting",
    "sus": "suspicious",
    "bet": "agreed",
    "lit": "amazing",
    "salty": "annoyed",
    "cringe": "awkward",
    "drip": "stylish",
    "cap": "lie",
    "lowkey": "quietly",
    "highkey": "openly",
    "yeet": "throw",
    "rizz": "charm",
    "bussin'": "delicious",
    "gucci": "good",
    "cheugy": "outdated",
    "delulu": "delusional",
    "iykyk": "understood",
    "mood": "relatable",
    "sheesh": "wow",
    "periodt": "final",
    "stan": "fanatic",
    "squad": "group",
    "fam": "friends",
    "troll": "provoke",
    "spam": "flood",
    "pwned": "defeated",
    "noob": "novice",
    "roflmao": "laughing",
    "ftfy": "fixed",
    "icymi": "missed",
    "tldr": "summary",
    "wfh": "remote",
    "af": "very",
    "chad": "confident",
    "cringey": "awkward",
    "savage": "impressive",
    "based": "opinionated",
    "boujee": "luxurious",
    "woke": "aware",
    "mid": "average",
    "tea": "gossip",
    "dope": "cool",
    "ain't": "isn't",
    "gonna": "going",
    "wanna": "want",
    "gotta": "must",
    "lemme": "let",
    "gimme": "give",
    "kinda": "somewhat",
    "sorta": "somewhat",
    "dunno": "unknown",
    "cuz": "because",
    "tho": "though",
    "thru": "through",
    "nite": "night",
    "pls": "please",
    "txt": "text",
    "ur": "your",
    "bc": "because",
    "bf": "boyfriend",
    "gf": "girlfriend",
    "jk": "joking",
    "k": "ok",
    "kk": "ok",
    "wby": "you",
    "wtf": "shocked",
    "2nite": "tonight",
    "4ever": "forever",
    "b4": "before",
    "gr8": "great",
    "l8r": "later",
    "gud": "good",
    "luv": "love",
    "ne": "any",
    "rly": "really",
    "sum1": "someone",
    "u're": "you",
    "y?": "why",
    "yep": "yes",
    "nope": "no",
    "xd": "funny"
}

def normalize_text(text, slang_dict):
    """
    Normalizes text by converting to lowercase, removing punctuation,
    and replacing slang words based on the provided dictionary.

    Args:
        text (str): The input text to normalize.
        slang_dict (dict): A dictionary mapping slang words to their normalized forms.

    Returns:
        str: The normalized text.
    """
    text = text.lower()
    # Remove punctuation using regex, keeping only word characters and spaces
    text = re.sub(r'[^\w\s]', '', text)
    # Split text into words, ensuring proper word boundaries
    words = re.findall(r'\b\w+\b', text)
    # Replace words with their normalized forms if found in slang_dict, otherwise keep original word
    normalized_words = [slang_dict.get(word, word) for word in words]
    # Join the normalized words back into a single string
    return ' '.join(normalized_words)

# Proses semua file di folder
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        try:
            df = pd.read_csv(input_path)
        except Exception as e:
            print(f"Gagal membaca file {filename}: {e}")
            continue

        if "comment_text" not in df.columns:
            print(f"Kolom 'comment_text' tidak ditemukan di file: {filename}")
            continue

        print(f"Memproses file: {filename}")
        df["comment_text"] = df["comment_text"].astype(str)
        # Menggunakan fungsi normalize_text yang benar dan meneruskan slang_dict
        # Kolom 'comment_text' sekarang akan langsung diperbarui dengan teks yang dinormalisasi
        df["comment_text"] = [
            normalize_text(text, slang_dict) for text in tqdm(df["comment_text"], desc="Normalisasi slang")
        ]

        output_path = os.path.join(output_folder, filename)
        try:
            df.to_csv(output_path, index=False, encoding="utf-8")
            print(f"Disimpan ke: {output_path}")
        except Exception as e:
            print(f"Gagal menyimpan file {output_path}: {e}")



Memproses file: video_1.csv


Normalisasi slang: 100%|██████████| 3815/3815 [00:00<00:00, 133001.44it/s]


Disimpan ke: normalized_comments\video_1.csv
Memproses file: video_2.csv


Normalisasi slang: 100%|██████████| 2340/2340 [00:00<00:00, 122392.71it/s]


Disimpan ke: normalized_comments\video_2.csv
Memproses file: video_3.csv


Normalisasi slang: 100%|██████████| 4119/4119 [00:00<00:00, 150446.19it/s]


Disimpan ke: normalized_comments\video_3.csv
Memproses file: video_4.csv


Normalisasi slang: 100%|██████████| 2756/2756 [00:00<00:00, 142101.14it/s]


Disimpan ke: normalized_comments\video_4.csv
Memproses file: video_5.csv


Normalisasi slang: 100%|██████████| 3364/3364 [00:00<00:00, 157607.33it/s]


Disimpan ke: normalized_comments\video_5.csv
Memproses file: video_6.csv


Normalisasi slang: 100%|██████████| 3381/3381 [00:00<00:00, 148323.80it/s]


Disimpan ke: normalized_comments\video_6.csv
Memproses file: video_7.csv


Normalisasi slang: 100%|██████████| 2975/2975 [00:00<00:00, 154902.98it/s]


Disimpan ke: normalized_comments\video_7.csv
Memproses file: video_8.csv


Normalisasi slang: 100%|██████████| 3990/3990 [00:00<00:00, 141410.90it/s]


Disimpan ke: normalized_comments\video_8.csv


In [61]:
# Folder hasil
output_folder = "normalized_comments"

# Ganti nama file sesuai yang ingin dilihat
filename = "video_1.csv"  # misalnya file hasil bersih dari video_1
file_path = os.path.join(output_folder, filename)

# Baca dan tampilkan 10 komentar
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print(f" Contoh 10 komentar dari: {filename}")
    display(df[["video_id", "author_name", "comment_text", "published_at", "updated_at"]].tail(10))
else:
    print("File tidak ditemukan:", filename)


 Contoh 10 komentar dari: video_1.csv


Unnamed: 0,video_id,author_name,comment_text,published_at,updated_at
3805,fK85SQzm0Z0,@IyKash,aint no way a greenapple on his shirt good str...,2025-03-24,2025-03-24
3806,fK85SQzm0Z0,@KINDLY7,good longest real stream,2025-03-24,2025-03-24
3807,fK85SQzm0Z0,@kaicenatleftcheek11,good stream even though speed in jail right now,2025-03-24,2025-03-24
3808,fK85SQzm0Z0,@Cartiverynice,jail stream when,2025-03-24,2025-03-24
3809,fK85SQzm0Z0,@cristiannegrete8127,he got a green apple shirt now,2025-03-24,2025-03-24
3810,fK85SQzm0Z0,@Fusion-v1,i was here,2025-03-24,2025-03-24
3811,fK85SQzm0Z0,@kiilee5963,one of the best streams good china,2025-03-24,2025-03-24
3812,fK85SQzm0Z0,@jcpesa19,henry turn the music off speed still vibing to...,2025-03-24,2025-03-24
3813,fK85SQzm0Z0,@lcfcalvin,now you dont know why this comment has so many...,2025-03-24,2025-04-16
3814,fK85SQzm0Z0,@5jqs,congrats on m good speed good hour china stream,2025-03-24,2025-03-24


#### 5. Menghapus Kata Kata Non Bahasa Inggris

In [57]:
import os
import pandas as pd
import re
from tqdm import tqdm
import nltk
from nltk.corpus import words

# Download kamus sekali saja jika belum
nltk.download("words")
english_words = set(words.words())

# Folder input/output
input_folder = "normalized_comments"
output_folder = "english_only_comments"
os.makedirs(output_folder, exist_ok=True)

def keep_english_words(text):
    if pd.isna(text):
        return ""
    tokens = re.findall(r'\b[a-zA-Z]+\b', text)  # Ambil kata alfabet
    filtered = [word for word in tokens if word.lower() in english_words]
    return " ".join(filtered)

# Proses file CSV
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        df = pd.read_csv(input_path)

        if "comment_text" not in df.columns:
            print(f"Kolom 'comment_text' tidak ditemukan di {filename}")
            continue

        print(f"Memproses: {filename}")
        tqdm.pandas(desc="Memfilter kata Inggris dari campuran")
        df["comment_text"] = df["comment_text"].astype(str).progress_apply(keep_english_words)

        # Hapus komentar kosong/NaN
        df = df[df["comment_text"].str.strip() != ""]
        df = df.dropna(subset=["comment_text"])

        output_path = os.path.join(output_folder, filename)
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"Disimpan: {output_path} ({len(df)} komentar bersih)")


[nltk_data] Downloading package words to C:\Users\Agus
[nltk_data]     Handika\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


Memproses: video_1.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 3815/3815 [00:00<00:00, 174465.41it/s]


Disimpan: english_only_comments\video_1.csv (3759 komentar bersih)
Memproses: video_2.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 2340/2340 [00:00<00:00, 148443.99it/s]


Disimpan: english_only_comments\video_2.csv (2309 komentar bersih)
Memproses: video_3.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 4119/4119 [00:00<00:00, 156729.91it/s]


Disimpan: english_only_comments\video_3.csv (4044 komentar bersih)
Memproses: video_4.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 2756/2756 [00:00<00:00, 147314.85it/s]


Disimpan: english_only_comments\video_4.csv (2720 komentar bersih)
Memproses: video_5.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 3364/3364 [00:00<00:00, 115856.00it/s]


Disimpan: english_only_comments\video_5.csv (3324 komentar bersih)
Memproses: video_6.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 3381/3381 [00:00<00:00, 144712.35it/s]


Disimpan: english_only_comments\video_6.csv (3341 komentar bersih)
Memproses: video_7.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 2975/2975 [00:00<00:00, 167609.90it/s]


Disimpan: english_only_comments\video_7.csv (2932 komentar bersih)
Memproses: video_8.csv


Memfilter kata Inggris dari campuran: 100%|██████████| 3990/3990 [00:00<00:00, 162279.86it/s]

Disimpan: english_only_comments\video_8.csv (3943 komentar bersih)





In [60]:
# Folder hasil
output_folder = "english_only_comments"

# Ganti nama file sesuai yang ingin dilihat
filename = "video_1.csv"  # misalnya file hasil bersih dari video_1
file_path = os.path.join(output_folder, filename)

# Baca dan tampilkan 10 komentar
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print(f"Contoh 10 komentar dari: {filename}")
    display(df[["video_id", "author_name", "comment_text", "published_at", "updated_at"]].tail(10))
else:
    print("File tidak ditemukan:", filename)


Contoh 10 komentar dari: video_1.csv


Unnamed: 0,video_id,author_name,comment_text,published_at,updated_at
3749,fK85SQzm0Z0,@IyKash,aint no way a on his shirt good stream though ...,2025-03-24,2025-03-24
3750,fK85SQzm0Z0,@KINDLY7,good real stream,2025-03-24,2025-03-24
3751,fK85SQzm0Z0,@kaicenatleftcheek11,good stream even though speed in jail right now,2025-03-24,2025-03-24
3752,fK85SQzm0Z0,@Cartiverynice,jail stream when,2025-03-24,2025-03-24
3753,fK85SQzm0Z0,@cristiannegrete8127,he got a green apple shirt now,2025-03-24,2025-03-24
3754,fK85SQzm0Z0,@Fusion-v1,i was here,2025-03-24,2025-03-24
3755,fK85SQzm0Z0,@kiilee5963,one of the best good china,2025-03-24,2025-03-24
3756,fK85SQzm0Z0,@jcpesa19,henry turn the music off speed still to nae na...,2025-03-24,2025-03-24
3757,fK85SQzm0Z0,@lcfcalvin,now you dont know why this comment so many,2025-03-24,2025-04-16
3758,fK85SQzm0Z0,@5jqs,on m good speed good hour china stream,2025-03-24,2025-03-24


#### 6. Simpan Data ke MongoDB

In [62]:
import pandas as pd

input_folder = "english_only_comments"
db = get_db("db_komentar_bersih")

def save_df_to_mongodb(df, collection):
    records = df.to_dict(orient='records')
    if records:
        collection.insert_many(records)
        print(f"Berhasil menyimpan {len(records)} data ke MongoDB.")

for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        file_path = os.path.join(input_folder, filename)
        df = pd.read_csv(file_path)

        required_columns = {"video_id", "author_name", "comment_text", "published_at", "updated_at"}
        if not required_columns.issubset(set(df.columns)):
            print(f"Kolom tidak lengkap di file: {filename}, lewati file ini.")
            continue

        collection_name = os.path.splitext(filename)[0]
        collection = db[collection_name]

        print(f"Memproses file: {filename} → collection: {collection_name}")
        save_df_to_mongodb(df, collection)

print("Semua file selesai diproses.")


Memproses file: video_1.csv → collection: video_1
Berhasil menyimpan 3759 data ke MongoDB.
Memproses file: video_2.csv → collection: video_2
Berhasil menyimpan 2309 data ke MongoDB.
Memproses file: video_3.csv → collection: video_3
Berhasil menyimpan 4044 data ke MongoDB.
Memproses file: video_4.csv → collection: video_4
Berhasil menyimpan 2720 data ke MongoDB.
Memproses file: video_5.csv → collection: video_5
Berhasil menyimpan 3324 data ke MongoDB.
Memproses file: video_6.csv → collection: video_6
Berhasil menyimpan 3341 data ke MongoDB.
Memproses file: video_7.csv → collection: video_7
Berhasil menyimpan 2932 data ke MongoDB.
Memproses file: video_8.csv → collection: video_8
Berhasil menyimpan 3943 data ke MongoDB.
Semua file selesai diproses.


#### 6. Gabungkan Collection untuk Proses Labeling

In [63]:

def merge_collections_between_dbs(
    db_source, db_merge, 
    target_collection_name="merge_data"
):
    # Hapus collection target di db_merge jika sudah ada
    if target_collection_name in db_merge.list_collection_names():
        db_merge[target_collection_name].drop()
        print(f"Collection '{target_collection_name}' di db merge dihapus dulu agar fresh.")

    all_data = []

    # Ambil semua collection di db_source
    collections = db_source.list_collection_names()
    print(f"Menggabungkan data dari {len(collections)} collection di db sumber...")

    for col_name in collections:
        collection = db_source[col_name]
        docs = list(collection.find())
        print(f" - Mengambil {len(docs)} dokumen dari collection '{col_name}'")
        all_data.extend(docs)

    if all_data:
        # Hapus _id supaya tidak konflik saat insert di db_merge
        for doc in all_data:
            if "_id" in doc:
                del doc["_id"]

        db_merge[target_collection_name].insert_many(all_data)
        print(f"Berhasil menggabungkan total {len(all_data)} dokumen ke collection '{target_collection_name}' di db merge")
    else:
        print("Tidak ada data untuk digabungkan.")

# Contoh pemakaian:
db_source = get_db("db_komentar_bersih")   # ganti dengan nama db sumbermu
db_merge = get_db("db_komentar_merge")     # ganti dengan nama db merge (target)

merge_collections_between_dbs(db_source, db_merge, target_collection_name="merge_data")


Menggabungkan data dari 8 collection di db sumber...
 - Mengambil 2720 dokumen dari collection 'video_4'
 - Mengambil 2309 dokumen dari collection 'video_2'
 - Mengambil 3324 dokumen dari collection 'video_5'
 - Mengambil 3943 dokumen dari collection 'video_8'
 - Mengambil 3341 dokumen dari collection 'video_6'
 - Mengambil 3759 dokumen dari collection 'video_1'
 - Mengambil 4044 dokumen dari collection 'video_3'
 - Mengambil 2932 dokumen dari collection 'video_7'
Berhasil menggabungkan total 26372 dokumen ke collection 'merge_data' di db merge
