In [1]:
# Load the scraped Instagram comments (hasil_scraping_ig.csv) into a DataFrame.
import pandas as pd

RAW_CSV_PATH = "hasil_scraping_ig.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [2]:
# Preview the loaded comments (collected for the #menterikeuangan hashtag, per the original note).
# The frame holds the post URL in "Link Post" and the comment text in "Komentar".
df

Unnamed: 0,Link Post,Komentar
0,https://www.instagram.com/p/DQ0E5IdE0Et/,Instagram Lite
1,https://www.instagram.com/p/DQ0E5IdE0Et/,Ikut dolar
2,https://www.instagram.com/p/DQ0E5IdE0Et/,Meta in Indonesia
3,https://www.instagram.com/p/DQ0E5IdE0Et/,See translation
4,https://www.instagram.com/p/DQ0E5IdE0Et/,"November 9, 2025"
...,...,...
3768,https://www.instagram.com/p/DRHB1XGks3A/,Alhamdulillah❤️❤️❤️ memang seharusnya begitu❤️
3769,https://www.instagram.com/p/DRHB1XGks3A/,Daftar bang @antojoe199
3770,https://www.instagram.com/p/DRHB1XGks3A/,Nah gini bener
3771,https://www.instagram.com/p/DRHB1XGks3A/,Emang harusnya kan sesuai sama jobdesk nya


In [3]:
import re
# 1. Data Cleaning

# Posts whose comments must be excluded from the analysis.
unrelevant_posts = ["https://www.instagram.com/p/CL6tJ9lgOQf/",
                    "https://www.instagram.com/p/CQI2LTHtXml/",
                    "https://www.instagram.com/p/DQYxOOxk3dI/",
                    "https://www.instagram.com/p/CPSI50rDaB3/",
                    "https://www.instagram.com/p/DPogoVSlGTh/"]

# Case-insensitive substrings that mark a row for removal (scraped UI text such as
# "Instagram Lite" / "See translation", and any comment containing "@").
unrelevant_words = ["instagram", "like", "likes", "see", "reply", "replies",
                   "meta", "facebook", "contact", "@"]

# Escape every entry so it is always matched literally, even if a word containing
# a regex metacharacter (".", "+", "?", ...) is added to the list later.
noise_pattern = "|".join(re.escape(word) for word in unrelevant_words)

# Keep only comments from relevant posts. Missing comments are dropped up front:
# they would slip through the `na=False` filters below and then have no text to
# normalize. `.copy()` gives an independent frame to work on.
comments = (
    df.loc[~df['Link Post'].isin(unrelevant_posts)]
    .dropna(subset=['Komentar'])
    .copy()
)

# Drop noise rows and rows that contain any digit.
is_noise = comments['Komentar'].str.contains(noise_pattern, case=False, na=False)
has_digit = comments['Komentar'].str.contains(r'\d', na=False)
comments = comments[~(is_noise | has_digit)].reset_index(drop=True)

# Text normalization. Order matters:
#   1. case folding;
#   2. every whitespace run (tabs, newlines, multiple spaces) becomes one space,
#      so line breaks still separate words after step 3;
#   3. remove every character outside letters, digits, space, ":" and "_";
#   4. collapse runs of a repeated character to a single one (this also merges the
#      double spaces left behind by step 3);
#   5. strip last, so spaces exposed by the removals do not survive at the edges.
# NOTE(review): step 4 also shortens legitimate double letters
# ("alhamdulillah" -> "alhamdulilah"); confirm this is acceptable for the
# slang dictionary and the stemmer.
comments['Komentar'] = (
    comments['Komentar']
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace(r'[^A-Za-z0-9 :_]+', '', regex=True)
    .str.replace(r'(.)\1+', r'\1', regex=True)
    .str.strip()
)

# Comments that are empty after cleaning (e.g. emoji-only) carry no text, and an
# empty field is read back from the CSV as a missing value, so drop them here.
comments = comments[comments['Komentar'].str.len().gt(0)].reset_index(drop=True)

# `df` keeps pointing at the cleaned frame, as before, for any code that still reads it.
df = comments

df_cleaned = df[['Komentar']].rename(columns={'Komentar': 'cleaned_comment'})
df_cleaned.to_csv("data_pre_processing.csv", index=False)
df_cleaned

Unnamed: 0,cleaned_comment
0,ikut dolar
1,tombol yang mau ikut demo kalau pak purbaya di...
2,gimana ekonomi indonesia mau membaik dapet men...
3,nepalkan kalau pak purbaya di ganti
4,dukung purbaya
...,...
1558,ya emang harus
1559,atur aja pak
1560,alhamdulilah memang seharusnya begitu
1561,nah gini bener


In [4]:
# 2. Slang Replacement
# Read the cleaned comments saved by the data-cleaning step and the slang lookup table.
data_df = pd.read_csv("data_pre_processing.csv")
slang_df = pd.read_csv("slang.csv")

# Map each slang spelling to its formal form; if a slang word is listed more than
# once, the last row wins.
slang_dict = {
    slang: formal
    for slang, formal in zip(slang_df['slang'], slang_df['formal'])
}

def replace_slang(text, mapping=None):
    """Replace every slang word in ``text`` with its formal equivalent.

    Parameters
    ----------
    text : str
        Comment to normalize. It is split on whitespace, so the result is
        re-joined with single spaces. Non-string values (e.g. NaN/None for a
        missing comment) are returned unchanged instead of raising.
    mapping : dict, optional
        Slang -> formal lookup. Defaults to the module-level ``slang_dict``.

    Returns
    -------
    str
        The comment with known slang words replaced; unknown words are kept.
    """
    if not isinstance(text, str):
        return text
    if mapping is None:
        mapping = slang_dict

    def formalize(word):
        formal = mapping.get(word)
        # Keep the original word when there is no usable replacement, e.g. an
        # empty "formal" cell that was loaded as NaN (joining it would fail).
        return formal if isinstance(formal, str) else word

    return " ".join(formalize(word) for word in text.split())

# A comment that is empty in the CSV is read back as NaN, and `astype(str)` would
# turn it into the literal word "nan", which would then be processed as a real
# token. Drop such rows before normalizing.
data_df = data_df.dropna(subset=['cleaned_comment']).reset_index(drop=True)

data_df['slang_replaced_comment'] = data_df['cleaned_comment'].astype(str).apply(replace_slang)
data_df.to_csv("data_pre_processing.csv", index=False)

data_df[['cleaned_comment','slang_replaced_comment']]

Unnamed: 0,cleaned_comment,slang_replaced_comment
0,ikut dolar,ikut dolar
1,tombol yang mau ikut demo kalau pak purbaya di...,tombol yang mau ikut demo kalau bapak purbaya ...
2,gimana ekonomi indonesia mau membaik dapet men...,gimana ekonomi indonesia mau membaik dapat men...
3,nepalkan kalau pak purbaya di ganti,nepalkan kalau bapak purbaya di ganti
4,dukung purbaya,dukung purbaya
...,...,...
1557,ya emang harus,ya memang harus
1558,atur aja pak,atur saja bapak
1559,alhamdulilah memang seharusnya begitu,alhamdulilah memang seharusnya begitu
1560,nah gini bener,nah gini benar


In [5]:
# 3. Tokenizing
def tokenize(text):
    """Split a comment into its whitespace-separated tokens.

    Missing values (anything ``pd.isna`` reports as NA) are returned unchanged.
    """
    return text if pd.isna(text) else text.split()

# Tokenize the slang-normalized comments, store them in a new column and save.
tokenized = data_df['slang_replaced_comment'].apply(tokenize)
data_df['tekonized_comment'] = tokenized
data_df.to_csv("data_pre_processing.csv", index=False)

data_df[['slang_replaced_comment','tekonized_comment']]

Unnamed: 0,slang_replaced_comment,tekonized_comment
0,ikut dolar,"[ikut, dolar]"
1,tombol yang mau ikut demo kalau bapak purbaya ...,"[tombol, yang, mau, ikut, demo, kalau, bapak, ..."
2,gimana ekonomi indonesia mau membaik dapat men...,"[gimana, ekonomi, indonesia, mau, membaik, dap..."
3,nepalkan kalau bapak purbaya di ganti,"[nepalkan, kalau, bapak, purbaya, di, ganti]"
4,dukung purbaya,"[dukung, purbaya]"
...,...,...
1557,ya memang harus,"[ya, memang, harus]"
1558,atur saja bapak,"[atur, saja, bapak]"
1559,alhamdulilah memang seharusnya begitu,"[alhamdulilah, memang, seharusnya, begitu]"
1560,nah gini benar,"[nah, gini, benar]"


In [6]:
# 4. Stop Word Removal
import ast
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Sastrawi factory that provides the default stop word list.
factory = StopWordRemoverFactory()

# Negation words are taken out of the stop word list so that they stay in the comments.
negasi = ["tidak", "nggak", "jangan", "belum"]
stopwords = [word for word in factory.get_stop_words() if word not in negasi]

def remove_stopwords(tokens, stopword_list=None):
    """Return the tokens of a comment with stop words filtered out.

    Parameters
    ----------
    tokens : list of str, str, or missing value
        Tokens of one comment. A plain string is split on whitespace first
        (instead of being iterated character by character). ``None`` and float
        values such as NaN, which ``tokenize`` passes through for missing
        comments, are returned unchanged instead of raising ``TypeError``.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the module-level ``stopwords``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if tokens is None or isinstance(tokens, float):
        return tokens
    if isinstance(tokens, str):
        tokens = tokens.split()
    if stopword_list is None:
        stopword_list = stopwords
    # Set membership is constant-time, unlike scanning the list for every token.
    blocked = set(stopword_list)
    return [t for t in tokens if t not in blocked]

# Filter the stop words out of every tokenized comment, store the result and save.
without_stopwords = data_df['tekonized_comment'].apply(remove_stopwords)
data_df['no_stopwords_comment'] = without_stopwords
data_df.to_csv("data_pre_processing.csv", index=False)

data_df[['tekonized_comment','no_stopwords_comment']]

Unnamed: 0,tekonized_comment,no_stopwords_comment
0,"[ikut, dolar]","[ikut, dolar]"
1,"[tombol, yang, mau, ikut, demo, kalau, bapak, ...","[tombol, mau, ikut, demo, kalau, bapak, purbay..."
2,"[gimana, ekonomi, indonesia, mau, membaik, dap...","[gimana, ekonomi, indonesia, mau, membaik, men..."
3,"[nepalkan, kalau, bapak, purbaya, di, ganti]","[nepalkan, kalau, bapak, purbaya, ganti]"
4,"[dukung, purbaya]","[dukung, purbaya]"
...,...,...
1557,"[ya, memang, harus]",[memang]
1558,"[atur, saja, bapak]","[atur, bapak]"
1559,"[alhamdulilah, memang, seharusnya, begitu]","[alhamdulilah, memang]"
1560,"[nah, gini, benar]","[nah, gini, benar]"


In [8]:
# 5. Stemming - changing it to the base word
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm                  # progress bar
from joblib import Parallel, delayed   # parallel processing
from joblib import Memory

# Sastrawi stemmer used to reduce words to their base form.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# joblib cache stored in the "cachedir" folder.
memory = Memory(verbose=0, location='cachedir')

# Wrapped with the joblib `memory` cache, so a term that was already stemmed is
# looked up in the cache instead of being stemmed again.
@memory.cache
def stemmed_wrapper(term):
    """Return the result of stemming ``term`` with the module-level ``stemmer``."""
    return stemmer.stem(term)

# Gather the vocabulary: every distinct token that occurs in any comment.
unique_terms = set()
for document in tqdm(data_df['no_stopwords_comment'], desc="Collecting terms"):
    # A document is either a whitespace-separated string or an iterable of tokens.
    tokens = document.split() if isinstance(document, str) else document
    unique_terms.update(tokens)

print("Melakukan stemming untuk semua kata unik")
# Stem each distinct term once, spread over all available cores (n_jobs=-1).
stem_jobs = (
    delayed(stemmed_wrapper)(term)
    for term in tqdm(unique_terms, desc="Stemming terms")
)
results = Parallel(n_jobs=-1)(stem_jobs)

# Lookup table term -> stem. `results` follows the iteration order of
# `unique_terms`, which has not been modified in between.
term_dict = {term: stem for term, stem in zip(unique_terms, results)}

# Replace each word of a document by its stem.
def stemmingText(document):
    """Return the stems of all terms in ``document``.

    ``document`` is either a whitespace-separated string or an iterable of
    terms. Every term is looked up in the module-level ``term_dict``; a term
    that is not in the dictionary raises ``KeyError``.
    """
    terms = document.split() if isinstance(document, str) else document
    stemmed = []
    for term in terms:
        stemmed.append(term_dict[term])
    return stemmed

# Stem every comment, store the result in a new column and save.
stemmed_comments = data_df["no_stopwords_comment"].apply(stemmingText)
data_df["stemmed_comment"] = stemmed_comments
data_df.to_csv("data_pre_processing.csv", index=False)

data_df[["no_stopwords_comment", "stemmed_comment"]]

Mengumpulkan kata unik


Collecting terms: 100%|████████████████████████████████████████████████████████| 1562/1562 [00:00<00:00, 306632.17it/s]


Melakukan stemming untuk semua kata unik


Stemming terms: 100%|██████████████████████████████████████████████████████████████| 3775/3775 [02:09<00:00, 29.14it/s]


Unnamed: 0,no_stopwords_comment,stemmed_comment
0,"[ikut, dolar]","[ikut, dolar]"
1,"[tombol, mau, ikut, demo, kalau, bapak, purbay...","[tombol, mau, ikut, demo, kalau, bapak, purbay..."
2,"[gimana, ekonomi, indonesia, mau, membaik, men...","[gimana, ekonomi, indonesia, mau, baik, menter..."
3,"[nepalkan, kalau, bapak, purbaya, ganti]","[nepalkan, kalau, bapak, purbaya, ganti]"
4,"[dukung, purbaya]","[dukung, purbaya]"
...,...,...
1557,[memang],[memang]
1558,"[atur, bapak]","[atur, bapak]"
1559,"[alhamdulilah, memang]","[alhamdulilah, memang]"
1560,"[nah, gini, benar]","[nah, gin, benar]"
