In [40]:
# Combine the dataset
import pandas as pd
import glob

# glob returns matches in an order that depends on the file system; sorting
# keeps the row order of the combined dataset reproducible across runs/machines.
files = sorted(glob.glob("dataset/*.csv"))

# Take only the "content" column; files without that column are skipped
all_data = []
for csv_path in files:
    source_df = pd.read_csv(csv_path)
    if "content" in source_df.columns:
        all_data.append(source_df["content"])

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so report the actual cause instead.
if not all_data:
    raise ValueError(
        'No CSV file with a "content" column was found under dataset/*.csv'
    )

# Combine them into a single column with a fresh 0..n-1 index
combined = pd.concat(all_data, ignore_index=True)

# Save to new CSV
combined.to_csv("marketPlacesData.csv", index=False, header=["content"])
print("new dataset has been created")

new dataset has been created


In [41]:
# Data Cleaning

import re
import emoji

# Load the combined dataset from disk; the cleaning below reads its "content" column
df = pd.read_csv("marketPlacesData.csv")

def clean_text(text):
    """Normalize one raw text value into lowercase, letters-only text.

    Steps, in order: emoji are converted to text via ``emoji.demojize``
    (``language="id"``), every character that is not an ASCII letter or
    whitespace is replaced by a space, whitespace runs are collapsed and
    trimmed, and the result is lowercased.

    Args:
        text: Value to clean. Null values (per ``pd.isnull``) yield ``""``;
            other non-string scalars are converted with ``str`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isnull(text):
        return ""

    # pandas may hand back numbers for all-digit cells; the string helpers
    # below require str, so coerce instead of crashing.
    text = str(text)

    # Convert emoji to their textual names
    text = emoji.demojize(text, language="id")

    # Remove special characters and numbers in one pass: anything that is
    # neither an ASCII letter nor whitespace becomes a space.
    text = re.sub(r"[^A-Za-z\s]+", " ", text)

    # Collapse repeated whitespace and trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    # Case folding
    return text.lower()

# Run clean_text over every value of the "content" column
cleaned_column = df["content"].map(clean_text)
df["data_clean"] = cleaned_column

# Persist the frame, now including the cleaned column, without the index
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Data cleaning Success")

Data cleaning Success


In [42]:
# Slang replacement

# Slang dictionary: first column is the slang token, second column its replacement
slang_df = pd.read_csv("cleaning/slang_indo.csv")

# Rows with a missing token or replacement would put NaN into the mapping and
# later break " ".join(...), so drop them up front.
slang_pairs = slang_df.iloc[:, :2].dropna()

# Lookups are exact matches against whitespace-split tokens, so strip the keys
# and lowercase them; stray padding or capitals would otherwise never match
# lowercased input text.
slang_dict = dict(
    zip(
        slang_pairs.iloc[:, 0].astype(str).str.strip().str.lower(),
        slang_pairs.iloc[:, 1].astype(str).str.strip(),
    )
)

def normalize_slang(text, slang_dict):
    """Replace slang tokens in ``text`` using ``slang_dict``.

    The text is split on whitespace and each token is looked up as an exact
    key; tokens without an entry are kept unchanged. Tokens are re-joined with
    single spaces, so runs of whitespace collapse.

    Args:
        text: Text to normalize. Null values (per ``pd.isnull``) yield ``""``;
            other non-string scalars are converted with ``str`` first.
        slang_dict: Mapping of slang token -> replacement string.

    Returns:
        The normalized string.
    """
    if pd.isnull(text):
        return ""

    # Coerce non-string scalars so .split() is always available
    tokens = str(text).split()

    # Use the dictionary entry when the token is known slang, else keep the token
    return " ".join(slang_dict.get(token, token) for token in tokens)

df = pd.read_csv("cleaned_marketPlacesData.csv")

# read_csv turns empty fields into NaN, so rows whose cleaned text is empty come
# back as NaN. Casting those with str() would produce the literal word "nan";
# fill them with "" first so they stay empty after slang replacement.
cleaned_text = df["data_clean"].fillna("").astype(str)
df["slangWord_clean"] = cleaned_text.apply(lambda x: normalize_slang(x, slang_dict))
df.to_csv("cleaned_marketPlacesData.csv", index=False)

print("Slang Replacement Success")

Slang Replacement Success
