In [35]:
import pandas as pd
import re
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
df = pd.read_csv("comments_data.csv")

In [38]:
df = df[['Comment']].dropna()

# Prepare Indonesian stopwords & stemmer
stop_factory = StopWordRemoverFactory()
stopwords = set(stop_factory.get_stop_words())
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

In [39]:
def clean_text(text):
    text = str(text)

    # Remove emojis (unicode ranges)
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)

    # Remove URL
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove @mentions and #hashtags
    text = re.sub(r'[@#]\w+', '', text)

    # Remove non ASCII (foreign symbols, chinese, etc)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove punctuation & symbols except letters and space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Remove extra whitespace and digits (optional: remove \d+ to keep numbers)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def preprocess(text):
    text = clean_text(text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    tokens = [t for t in tokens if t not in stopwords]

    # Stemming
    tokens = [stemmer.stem(t) for t in tokens]
    
    return " ".join(tokens)

In [40]:
df['Processed_Comment'] = df['Comment'].apply(preprocess)


df.drop(columns=['Comment'], inplace=True)

df['Processed_Comment'].replace('', pd.NA, inplace=True)  # ubah string kosong jadi NaN
df.dropna(subset=['Processed_Comment'], inplace=True)      

print(df.head())

df.to_csv("comments_preprocessed.csv", index=False)

                                   Processed_Comment
0  sonang jalan keluo eee korek jelah segaleeee j...
1  amat bijak publik kacau malah kayak amat titip...
2  yg keruk untung vie proyek woosh mulyono san l...
3        perintah jokowi jga macem macem sama jokowi
4  jangan bias bahas bukan masalah bayar bunga bi...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Processed_Comment'].replace('', pd.NA, inplace=True)  # ubah string kosong jadi NaN
