In [27]:
#pip install langdetect
#pip install emoji
#pip install pandas

In [28]:
import pandas as pd
import re
import emoji
from langdetect import detect, DetectorFactory

# Fix langdetect's seed so language detection gives the same result on every
# run (per the langdetect README, detection is otherwise non-deterministic).
DetectorFactory.seed = 0

In [29]:
def remove_emoji(text):
    """Return ``text`` coerced to ``str`` with every emoji replaced by ''.

    The actual replacement is delegated to ``emoji.replace_emoji``.
    """
    as_string = str(text)
    return emoji.replace_emoji(as_string, replace='')

def remove_non_ascii(text):
    """Keep only printable ASCII characters, turning whitespace into spaces.

    Whitespace of any kind (newline, tab, non-breaking space, ...) is replaced
    by a single ASCII space instead of being deleted. Deleting it would fuse
    the neighbouring words together (two words separated only by a newline
    would become one token). Every other character that is non-ASCII or
    non-printable is removed.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; runs of spaces are NOT collapsed here.
    """
    kept = []
    for char in text:
        if char.isspace():
            # Preserve the word boundary; callers may collapse repeated spaces.
            kept.append(' ')
        elif ord(char) < 128 and char.isprintable():
            kept.append(char)
    return ''.join(kept)

def remove_non_alpha(text):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Digits, punctuation and non-ASCII letters are removed outright (nothing is
    inserted in their place); whitespace is left exactly as it was.
    """
    letters_only = re.sub(
        pattern=r'[^a-zA-Z\s]',
        repl='',
        string=text,
    )
    return letters_only

def clean_text(text):
    """Run the full text-cleaning sequence on a single value.

    Steps, in order: coerce to ``str`` and lowercase, then apply
    ``remove_emoji``, ``remove_non_ascii`` and ``remove_non_alpha``, and
    finally collapse every whitespace run to one space and trim both ends.
    """
    cleaned = str(text).lower()
    for step in (remove_emoji, remove_non_ascii, remove_non_alpha):
        cleaned = step(cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()

def is_indonesian(text):
    """Return True when ``detect`` classifies ``text`` as Indonesian ('id').

    Any ``Exception`` raised by ``detect`` is treated as "not Indonesian" and
    yields False, keeping the filter best-effort. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not caught, so a long-running ``apply``
    over this function can still be interrupted from the notebook.
    """
    try:
        return detect(text) == 'id'
    except Exception:
        return False

def clean_dataframe(df, text_column):
    """Clean a text column and keep only the rows detected as Indonesian.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: DataFrame holding the raw comments.
        text_column: Name of the column in ``df`` that contains the text.

    Returns:
        A new DataFrame with ``text_column`` cast to ``str``, an added
        ``clean_text`` column (output of ``clean_text``), only the rows for
        which ``is_indonesian(clean_text)`` is true, and a fresh 0..n-1 index.
    """
    # Drop missing comments first: astype(str) would otherwise turn NaN into
    # the literal text "nan" and feed it through cleaning and detection.
    # .copy() guarantees the caller's frame is never modified.
    result = df.dropna(subset=[text_column]).copy()
    result[text_column] = result[text_column].astype(str)
    result['clean_text'] = result[text_column].apply(clean_text)

    # Force a boolean dtype so an empty frame is still filtered by mask
    # (an empty non-boolean mask would be interpreted as a list of labels).
    keep = result['clean_text'].apply(is_indonesian).astype(bool)
    return result[keep].reset_index(drop=True)

In [30]:
# PARSING TIKTOK (SEMICOLON PARSING)
import csv

def parse_tiktok_file(file_path):
    """Read a semicolon-delimited TikTok export into a DataFrame.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) and
    malformed lines are skipped rather than raising.
    """
    read_options = {
        'delimiter': ';',
        'quoting': csv.QUOTE_NONE,
        'on_bad_lines': 'skip',
    }
    return pd.read_csv(file_path, **read_options)

In [31]:
# LOAD & PARSE FILE
# Raw input file paths (relative to the notebook's working directory)

news_comments_path = '../data/raw_comment_data.csv'
news_path = '../data/raw_data.csv'  # <- not processed for now
tiktok_path_1 = '../data/tiktok_comments_raw_1.csv'
tiktok_path_2 = '../data/tiktok_comments_raw_2.csv'
tiktok_path_3 = '../data/tiktok_comments_raw_3.csv'
youtube_path = '../data/youtube_comments.csv'

In [32]:
# 1️⃣ Parse News Comments
# Load the raw news comments, then clean the 'komentar' column and keep only
# the rows whose cleaned text is detected as Indonesian.
df_news_comments = pd.read_csv(news_comments_path, encoding='utf-8')
df_news_comments_clean = clean_dataframe(df_news_comments, text_column='komentar')

In [33]:
# 2️⃣ Parse YouTube
# Load the YouTube comments, then clean the 'comment' column and keep only
# the rows whose cleaned text is detected as Indonesian.
df_youtube = pd.read_csv(youtube_path, encoding='utf-8')
df_youtube_clean = clean_dataframe(df_youtube, text_column='comment')

In [34]:
# 3️⃣ Parse TikTok files
# Each export is read with parse_tiktok_file (semicolon-delimited parsing).
tiktok_1 = parse_tiktok_file(tiktok_path_1)
tiktok_2 = parse_tiktok_file(tiktok_path_2)
tiktok_3 = parse_tiktok_file(tiktok_path_3)

# Drop rows of TikTok file 3 that have no value in the 'Comment' column.
# NOTE(review): only file 3 is filtered here; files 1 and 2 are passed on
# as-is — confirm that is intended.
tiktok_3 = tiktok_3.dropna(subset=['Comment'])

In [35]:
# Combine TikTok dataframes
# Stack the three exports into one frame with a fresh index, then clean the
# 'Comment' column and keep only rows detected as Indonesian.
tiktok_all = pd.concat([tiktok_1, tiktok_2, tiktok_3], ignore_index=True)
df_tiktok_clean = clean_dataframe(tiktok_all, text_column='Comment')

In [36]:
# SAVE OUTPUT
# Write each cleaned dataframe to ../data as CSV, without the index column.


df_news_comments_clean.to_csv('../data/news_comments_clean.csv', index=False)
df_youtube_clean.to_csv('../data/youtube_comments_clean.csv', index=False)
df_tiktok_clean.to_csv('../data/tiktok_comments_clean.csv', index=False)


# Confirmation message shown once all three files have been written.
print("✅ All datas already parsed and cleaned!")

✅ All datas already parsed and cleaned!


In [37]:
# pip install pandas emoji langdetect