In [37]:
import pandas as pd
import re
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# Tokenizer models required by nltk.word_tokenize (used in preprocess below).
# Older NLTK releases load the 'punkt' resource; recent releases (3.9+) load
# 'punkt_tab' instead and raise LookupError without it, so fetch both to keep
# the notebook runnable on a fresh environment regardless of NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
# Indonesian language resources from Sastrawi, built once at module level
# and reused by every preprocess() call.
stem_factory = StemmerFactory()
stop_factory = StopWordRemoverFactory()

# Stemmer instance used to stem individual tokens.
stemmer = stem_factory.create_stemmer()
# Stopwords are kept in a set so token membership checks are fast.
stopwords = set(stop_factory.get_stop_words())

In [40]:
def clean_text(text):
    """Strip social-media noise from a comment.

    The result contains only ASCII letters, digits and whitespace. Casing is
    preserved and internal runs of whitespace are not collapsed.

    Args:
        text: The raw comment. Missing values (``None``, ``NaN``, ``pd.NA``)
            are treated as empty text; any other non-string value is
            converted with ``str``.

    Returns:
        str: The cleaned text with leading/trailing whitespace removed, or
        ``''`` when the input is missing.
    """
    # Missing values must not be stringified: str(float('nan')) is 'nan' and
    # str(None) is 'None', which would survive cleaning as bogus words.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''

    text = str(text)

    # Replace emojis (astral-plane code points) with a space rather than
    # nothing, so words separated only by an emoji are not fused together.
    text = re.sub(r'[\U00010000-\U0010ffff]', ' ', text)

    # Remove @mentions and #hashtags
    text = re.sub(r'[@#]\w+', '', text)

    # Remove non ASCII (foreign symbols, chinese, etc)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Replace punctuation & symbols with a space; only ASCII letters, digits
    # and whitespace are kept
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    return text.strip()

def preprocess(text):
    """Normalize a comment into a space-separated string of stemmed tokens.

    The text is cleaned with ``clean_text``, lowercased and tokenized with
    ``nltk.word_tokenize``. Tokens found in the module-level ``stopwords``
    set are discarded and the remaining ones are stemmed with the
    module-level ``stemmer``.

    Args:
        text: The raw comment; passed straight to ``clean_text``.

    Returns:
        str: The stemmed tokens joined by single spaces (``''`` when no
        token survives).
    """
    normalized = clean_text(text).lower()

    # Filter stopwords first so only the surviving tokens are stemmed.
    stems = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(normalized)
        if word not in stopwords
    ]

    return " ".join(stems)

In [41]:
def main(file_name, output_name="comments_preprocessed_post_labeled.csv"):
    """Preprocess the comments in a labeled CSV and save the result.

    Reads ``file_name``, runs ``preprocess`` on the ``Preprocessed_comment``
    column, stores the result in ``Post_Preprocessed_Comments`` (replacing
    the source column) and writes the remaining rows to ``output_name``.

    Rows are discarded when the comment or the ``label`` is missing, or when
    nothing is left of the comment after preprocessing.

    Args:
        file_name: Path of the input CSV; it must contain the columns
            ``Preprocessed_comment`` and ``label``.
        output_name: Path of the CSV to write. Defaults to
            ``comments_preprocessed_post_labeled.csv``.

    Returns:
        None. The result is written to ``output_name`` without the index.
    """
    df = pd.read_csv(file_name)

    # Drop incomplete rows before preprocessing: a missing comment would
    # otherwise be stringified by the text cleaning step (NaN -> 'nan') and
    # slip through the empty-text filter below as a fake comment.
    df = df.dropna(subset=['Preprocessed_comment', 'label'])

    processed = df['Preprocessed_comment'].apply(preprocess)

    # An empty string means every token was removed; mark it as missing so
    # the row is dropped together with the other incomplete rows.
    df = (
        df.drop(columns=['Preprocessed_comment'])
        .assign(Post_Preprocessed_Comments=processed.replace('', pd.NA))
        .dropna(subset=['Post_Preprocessed_Comments'])
    )

    # save to csv
    df.to_csv(output_name, index=False)

In [42]:
# Run the pipeline on the labeled comments file; main() writes the result to
# comments_preprocessed_post_labeled.csv (a relative path).
main("comments_labeled.csv")