In [10]:
import pandas as pd
import re
import nltk
from indoNLP.preprocessing import (
    pipeline,
    replace_slang,
    replace_word_elongation,
    emoji_to_words,
    remove_html,
    remove_url,
    remove_stopwords
)
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [11]:
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load the 'punkt' package, while NLTK 3.9+ looks up 'punkt_tab'
# instead, so fetch both to keep the notebook runnable on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Build the Sastrawi stemmer once at module level; preprocess() reuses this
# single instance for every token instead of creating a new one per call.
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

In [13]:
# IndoNLP preprocessing steps, collected in one list and handed to pipeline()
_text_pipe_steps = [
    # emoji -> words (Indonesian, lang='id')
    lambda text: emoji_to_words(text, lang='id'),
    # strip HTML tags
    remove_html,
    # strip URLs
    remove_url,
    # swap slang for its standard form
    replace_slang,
    # collapse elongated words
    replace_word_elongation,
    # drop stop words
    remove_stopwords,
]
text_pipe = pipeline(_text_pipe_steps)

In [14]:
def clean_text(text):
    """
    Additional cleaning not covered by IndoNLP.

    - Missing values (None / float NaN) become an empty string
    - Non-ASCII characters are deleted
    - Every character that is not an ASCII letter, digit or whitespace is
      replaced by a single space
    - Leading/trailing whitespace is stripped

    Args:
        text: Value to clean; anything that is not a string is converted
            with str() (except missing values, see above).

    Returns:
        The cleaned string. Inner runs of whitespace are NOT collapsed.
    """
    # A missing CSV cell arrives from pandas as float NaN; str() would turn it
    # into the literal text "nan" (and None into "None"), which downstream code
    # could no longer tell apart from a real comment. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    # Delete anything outside the 7-bit ASCII range (emoji, accented letters, ...)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Blank out punctuation/symbols; a space (not '') keeps neighbours apart
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text.strip()

In [15]:
def preprocess(text, stem=True):
    """
    Full preprocessing of a single comment.

    Steps, in order:
    1. Missing values (None / float NaN) short-circuit to "".
    2. Emoji -> words, HTML removal and URL removal on the raw text.
    3. clean_text(): drop remaining non-ASCII characters and punctuation.
    4. IndoNLP pipeline (text_pipe).
    5. Lowercase.
    6. Tokenize with NLTK.
    7. Optional stemming of every token.

    Args:
        text: Raw comment; non-string values are converted with str().
        stem: When True (default), every token is passed through the
            module-level stemmer.

    Returns:
        The processed tokens joined by single spaces ("" for missing input).
    """
    # A missing CSV cell arrives from pandas as float NaN; str() would turn it
    # into the literal token "nan", which would survive as if it were real
    # text. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # These steps rely on characters that clean_text() removes: emoji are
    # non-ASCII, tags need '<' and '>', URLs need ':' '/' '.'. They therefore
    # have to see the raw text, otherwise they never match and fragments such
    # as tag names or URL pieces leak through as ordinary tokens.
    text = emoji_to_words(text, lang='id')
    text = remove_html(text)
    text = remove_url(text)

    text = clean_text(text)

    # text_pipe lists the three steps above again, but after clean_text() only
    # ASCII letters, digits and whitespace remain, so they should find nothing
    # to match. The pipeline is still run unchanged so that its word-level
    # steps see the same kind of cleaned input as before.
    text = text_pipe(text)
    text = text.lower()

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Optional stemming
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]

    return " ".join(tokens)


In [16]:
def main(file_name, output_file="comments_preprocessed_post_labeling.csv"):
    """
    Preprocess the comment column of a CSV file and write the result to disk.

    The 'Preprocessed_comment' column is run through preprocess() and stored
    as 'Post_Preprocessed_Comments'; the source column is removed. Rows whose
    processed comment is empty, or whose 'label' is missing, are dropped
    before saving.

    Args:
        file_name: Path of the input CSV; must contain the columns
            'Preprocessed_comment' and 'label'.
        output_file: Path of the CSV to write (without the index).

    Returns:
        None. Prints a confirmation line once the file has been written.
    """
    source = pd.read_csv(file_name)

    # Empty results are turned into NA so one dropna() handles both filters
    processed = source['Preprocessed_comment'].apply(preprocess).replace('', pd.NA)

    result = (
        source
        .drop(columns=['Preprocessed_comment'])
        .assign(Post_Preprocessed_Comments=processed)
        .dropna(subset=['Post_Preprocessed_Comments', 'label'])
    )

    result.to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")

In [17]:
# Preprocess the labeled comments; no output_file is given, so the result is
# written to main()'s default output path.
main("comments_labeled.csv")

Preprocessed data saved to comments_preprocessed_post_labeling.csv
