In [1]:
# Combine the dataset
import pandas as pd
import glob

# Collect every CSV in the dataset folder. glob returns paths in arbitrary
# order, so sort them to make the row order of the merged file reproducible.
files = sorted(glob.glob("dataset/*.csv"))

# Keep only the "content" column; files without that column are skipped.
all_data = []
for f in files:
    df = pd.read_csv(f)
    if "content" in df.columns:
        all_data.append(df["content"])

# pd.concat([]) fails with an unhelpful "No objects to concatenate";
# report the actual cause instead.
if not all_data:
    raise ValueError(
        "No CSV file with a 'content' column was found in 'dataset/' "
        f"({len(files)} file(s) matched 'dataset/*.csv')"
    )

# Stack all review texts into a single column with a fresh 0..n-1 index.
combined = pd.concat(all_data, ignore_index=True)

# Save to a new CSV with a single "content" header.
combined.to_csv("marketPlacesData.csv", index=False, header=["content"])
print("new dataset has been created")

new dataset has been created


In [2]:
# Data Cleaning

import re
import emoji
from tabulate import tabulate

# Load the merged review file written by the dataset-combining cell
# (a single "content" column).
df = pd.read_csv("marketPlacesData.csv")

def clean_text(text):
    """Normalize one raw review into lowercase, letters-only text.

    Processing order:
      1. emoji are replaced by their Indonesian textual name,
      2. every character that is not an ASCII letter or whitespace
         (punctuation, digits, symbols) becomes a space,
      3. whitespace runs are collapsed and the ends are trimmed,
      4. the result is lowercased.

    Args:
        text: Raw review value. Missing values (None / NaN) give "".
            Non-string values are converted with ``str()`` first so that a
            cell parsed as a number does not raise.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isnull(text):
        return ""

    # A CSV cell can be parsed as a number; demojize/re.sub need a str.
    text = str(text)

    # Convert emoji to text (":wajah_malaikat:"-style names in Indonesian).
    # The colons/underscores are turned into spaces by the next step.
    text = emoji.demojize(text, language="id")

    # Remove special characters and digits in one pass: anything that is not
    # an ASCII letter or whitespace is replaced by a space.
    text = re.sub(r"[^A-Za-z\s]", " ", text)

    # Collapse repeated whitespace and trim.
    text = re.sub(r"\s+", " ", text).strip()

    # Case folding.
    return text.lower()

# Store the cleaned version of every review next to the raw text and
# persist the frame so later cells can reload it.
df["data_clean_content"] = df["content"].map(clean_text)
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Data cleaning Success")
df

Data cleaning Success


Unnamed: 0,content,data_clean_content
0,Udah sering belanja trs tapi setiap pengajuan ...,udah sering belanja trs tapi setiap pengajuan ...
1,Semenjak di upgrade.. SHOPEE JADI LEMOT,semenjak di upgrade shopee jadi lemot
2,Penyelesaian masalah sangat buruk,penyelesaian masalah sangat buruk
3,Apk engga😇 jls,apk engga wajah malaikat jls
4,Lelet stress. Udah update terbaru tetap aja lemot,lelet stress udah update terbaru tetap aja lemot
...,...,...
8140,Oke 👍👍👍👍,oke jempol ke atas jempol ke atas jempol ke at...
8141,Tokopedia memang ok,tokopedia memang ok
8142,sangat membantu,sangat membantu
8143,tokopedia is the best,tokopedia is the best


In [3]:
# Slang replacement

# Slang dictionary: the first CSV column holds the slang word, the second
# column holds its replacement.
slang_df = pd.read_csv("cleaning/slang_indo.csv")
slang_dict = {
    slang: replacement
    for slang, replacement in zip(slang_df.iloc[:, 0], slang_df.iloc[:, 1])
}

def normalize_slang(text, slang_dict):
    """Replace every slang word in ``text`` using ``slang_dict``.

    Args:
        text: Whitespace-separated text; a missing value (None / NaN)
            yields an empty string.
        slang_dict: Mapping from a slang word to its replacement.

    Returns:
        The words of ``text`` joined by single spaces, where each word found
        in ``slang_dict`` is substituted and every other word is kept as is.
    """
    if pd.isnull(text):
        return ""

    # Look each word up in the dictionary, falling back to the word itself.
    return " ".join(slang_dict.get(token, token) for token in text.split())

# Reload the cleaned reviews. Reviews whose cleaned text is empty were written
# as empty CSV fields and come back from read_csv as NaN.
df = pd.read_csv("cleaned_marketPlacesData.csv")

# Map missing values to "" BEFORE converting to str: str(NaN) is the literal
# text "nan", which would slip past normalize_slang's null guard and end up
# as a fake "nan" word in the corpus.
df["slang_replacement_content"] = df["data_clean_content"].apply(
    lambda x: normalize_slang("" if pd.isnull(x) else str(x), slang_dict)
)
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Slang Replacement Success")
df[["data_clean_content", "slang_replacement_content"]]

Slang Replacement Success


Unnamed: 0,data_clean_content,slang_replacement_content
0,udah sering belanja trs tapi setiap pengajuan ...,sudah sering belanja terus tapi setiap pengaj...
1,semenjak di upgrade shopee jadi lemot,semenjak di upgrade shopee jadi lemot
2,penyelesaian masalah sangat buruk,penyelesaian masalah sangat buruk
3,apk engga wajah malaikat jls,apk tidak wajah malaikat jls
4,lelet stress udah update terbaru tetap aja lemot,lelet stress sudah update terbaru tetap saja ...
...,...,...
8140,oke jempol ke atas jempol ke atas jempol ke at...,oke jempol ke atas jempol ke atas jempol ke at...
8141,tokopedia memang ok,tokopedia memang oke
8142,sangat membantu,sangat membantu
8143,tokopedia is the best,tokopedia is the best


In [4]:
# Tokenization
import nltk
import ast
from nltk.tokenize import word_tokenize

# Reload the intermediate file so this cell starts from what the previous
# steps saved to disk.
df = pd.read_csv("cleaned_marketPlacesData.csv")

def tokenize_text(text):
    """Split one review into word tokens.

    Args:
        text: The review text. Anything that is not a ``str`` (for example
            a NaN coming from an empty CSV field) is treated as "no text".

    Returns:
        The list produced by ``word_tokenize`` for a string, otherwise an
        empty list.
    """
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Tokenize the slang-normalized text of every review.
df["tokenized_content"] = df["slang_replacement_content"].map(tokenize_text)

# NOTE: CSV has no list type, so the token lists are written out as text;
# code that reloads this file gets strings back for this column, not lists.
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Tokenized Success")
df[["slang_replacement_content", "tokenized_content"]]

Tokenized Success


Unnamed: 0,slang_replacement_content,tokenized_content
0,sudah sering belanja terus tapi setiap pengaj...,"[sudah, sering, belanja, terus, tapi, setiap, ..."
1,semenjak di upgrade shopee jadi lemot,"[semenjak, di, upgrade, shopee, jadi, lemot]"
2,penyelesaian masalah sangat buruk,"[penyelesaian, masalah, sangat, buruk]"
3,apk tidak wajah malaikat jls,"[apk, tidak, wajah, malaikat, jls]"
4,lelet stress sudah update terbaru tetap saja ...,"[lelet, stress, sudah, update, terbaru, tetap,..."
...,...,...
8140,oke jempol ke atas jempol ke atas jempol ke at...,"[oke, jempol, ke, atas, jempol, ke, atas, jemp..."
8141,tokopedia memang oke,"[tokopedia, memang, oke]"
8142,sangat membantu,"[sangat, membantu]"
8143,tokopedia is the best,"[tokopedia, is, the, best]"


In [5]:
# Stopword removal

import ast
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Create the stopword remover factory that supplies the stopword list.
factory = StopWordRemoverFactory()

# Negation words are kept in the text, so they are taken out of the
# stopword list.
negasi = ["tidak", "nggak", "jangan", "belum"]
stopwords = list(
    filter(lambda word: word not in negasi, factory.get_stop_words())
)

def remove_stopwords(tokens):
    """Drop every stopword from one tokenized review.

    Args:
        tokens: Either a list of tokens, or a string. A string that looks
            like a list literal (``"['a', 'b']"`` - the form a token list
            takes after being saved to and reloaded from CSV) is parsed back
            into a list; any other string is split on whitespace. Missing
            values (None / NaN) are treated as an empty review.

    Returns:
        A new list with the tokens that are not in the module-level
        ``stopwords`` collection, in their original order.

    Raises:
        ValueError / SyntaxError: if a bracketed string is not a valid
            Python literal.
    """
    if isinstance(tokens, str):
        stripped = tokens.strip()
        if stripped.startswith("[") and stripped.endswith("]"):
            # Without this, iterating the string would yield single
            # characters instead of words.
            tokens = ast.literal_eval(stripped)
        else:
            tokens = stripped.split()
    elif tokens is None or (isinstance(tokens, float) and tokens != tokens):
        # None or NaN (NaN is the only float not equal to itself).
        return []

    return [t for t in tokens if t not in stopwords]

# Filter the stopwords out of every token list. This cell works on the `df`
# already in memory rather than reloading the CSV.
df["remove_stopwords_content"] = df["tokenized_content"].map(remove_stopwords)
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Stopword removal Success")
df[["tokenized_content", "remove_stopwords_content"]]

Stopword removal Success


Unnamed: 0,tokenized_content,remove_stopwords_content
0,"[sudah, sering, belanja, terus, tapi, setiap, ...","[sering, belanja, terus, pengajuan, tidak, lol..."
1,"[semenjak, di, upgrade, shopee, jadi, lemot]","[semenjak, upgrade, shopee, jadi, lemot]"
2,"[penyelesaian, masalah, sangat, buruk]","[penyelesaian, masalah, sangat, buruk]"
3,"[apk, tidak, wajah, malaikat, jls]","[apk, tidak, wajah, malaikat, jls]"
4,"[lelet, stress, sudah, update, terbaru, tetap,...","[lelet, stress, update, terbaru, tetap, lemot]"
...,...,...
8140,"[oke, jempol, ke, atas, jempol, ke, atas, jemp...","[oke, jempol, atas, jempol, atas, jempol, atas..."
8141,"[tokopedia, memang, oke]","[tokopedia, memang, oke]"
8142,"[sangat, membantu]","[sangat, membantu]"
8143,"[tokopedia, is, the, best]","[tokopedia, is, the, best]"


In [6]:
# Stemming - reduce each word to its base form

import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm                  # progress bar
from joblib import Parallel, delayed   # parallel processing
from joblib import Memory

# joblib cache rooted at the local 'cachedir' folder; verbose=0 keeps it quiet.
memory = Memory(location='cachedir', verbose=0)

# Initialize the stemmer.
# NOTE: this rebinds `factory`, which the stopword cell used for its
# StopWordRemoverFactory instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

@memory.cache
def stemmed_wrapper(term):
    """Return the stem of ``term`` using the module-level ``stemmer``.

    Wrapped with ``memory.cache`` so results are memoized by joblib.
    """
    return stemmer.stem(term)

def _as_terms(document):
    """Return the list of terms held by one `remove_stopwords_content` cell.

    Accepts a list of terms, a string that looks like a list literal (the form
    a token list takes after a CSV round trip), a plain whitespace-separated
    string, or a missing value (None / NaN, which gives an empty list).
    """
    if isinstance(document, str):
        stripped = document.strip()
        if stripped.startswith("[") and stripped.endswith("]"):
            # A plain split() here would produce junk such as "['sering',".
            return list(ast.literal_eval(stripped))
        return stripped.split()
    if document is None or (isinstance(document, float) and document != document):
        return []
    return list(document)


# Collect every distinct term in the dataset so each one is stemmed only once.
print("Mengumpulkan kata unik")
unique_terms = set()
for document in tqdm(df['remove_stopwords_content'], desc="Collecting terms"):
    unique_terms.update(_as_terms(document))

print("Melakukan stemming untuk semua kata unik")
results = Parallel(n_jobs=-1)(
    delayed(stemmed_wrapper)(term) for term in tqdm(unique_terms, desc="Stemming terms")
)

# Build the term -> stem lookup. `unique_terms` is not modified between the
# two iterations, so terms and results line up.
term_dict = dict(zip(unique_terms, results))

def stemmingText(document):
    """Return the stemmed terms of one document.

    Args:
        document: A list of terms, or a string form of it (see `_as_terms`).

    Returns:
        A list with the stem of every term, looked up in `term_dict`.
        The same parser is used here and when `term_dict` was built, so every
        term of the dataset has an entry.
    """
    return [term_dict[term] for term in _as_terms(document)]

df["stemmed_content"] = df["remove_stopwords_content"].apply(stemmingText)
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Stemming successful")
df[["remove_stopwords_content", "stemmed_content"]]

Mengumpulkan kata unik


Collecting terms: 100%|████████████████████████████████████████████████████████| 8145/8145 [00:00<00:00, 243183.11it/s]


Melakukan stemming untuk semua kata unik


Stemming terms: 100%|██████████████████████████████████████████████████████████████| 8975/8975 [07:01<00:00, 21.28it/s]


Stemming successful


Unnamed: 0,remove_stopwords_content,stemmed_content
0,"[sering, belanja, terus, pengajuan, tidak, lol...","[sering, belanja, terus, aju, tidak, lolos, te..."
1,"[semenjak, upgrade, shopee, jadi, lemot]","[semenjak, upgrade, shopee, jadi, lot]"
2,"[penyelesaian, masalah, sangat, buruk]","[selesai, masalah, sangat, buruk]"
3,"[apk, tidak, wajah, malaikat, jls]","[apk, tidak, wajah, malaikat, jls]"
4,"[lelet, stress, update, terbaru, tetap, lemot]","[lelet, stress, update, baru, tetap, lot]"
...,...,...
8140,"[oke, jempol, atas, jempol, atas, jempol, atas...","[oke, jempol, atas, jempol, atas, jempol, atas..."
8141,"[tokopedia, memang, oke]","[tokopedia, memang, oke]"
8142,"[sangat, membantu]","[sangat, bantu]"
8143,"[tokopedia, is, the, best]","[tokopedia, is, the, best]"
