In [None]:
import re
import os
import string
import nltk
from datasets import load_dataset, DatasetDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the English stop-word list, the Punkt
# tokenizer data behind word_tokenize, and WordNet for the lemmatizer.
# Recent NLTK releases load the "punkt_tab" package instead of the older
# pickled "punkt" models, so both are requested to keep word_tokenize working
# regardless of the installed NLTK version.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

# Shared, module-level helpers reused by clean_text for every document.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize a piece of text and return it as a space-joined token string.

    Steps, in order: lowercase the input, delete every run of digits, delete
    all characters listed in ``string.punctuation``, tokenize with NLTK's
    ``word_tokenize``, drop tokens found in the module-level ``stop_words``
    set, and lemmatize the remaining tokens with the module-level
    ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)

    # Translation table that maps every ASCII punctuation character to nothing.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)

    kept_tokens = []
    for token in word_tokenize(without_punctuation):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

def preprocess_dataset(dataset_name="sobamchan/aclsum", text_columns=("document", "outcome")):
    """Load a dataset and run ``clean_text`` over selected text columns.

    Called with no arguments this behaves exactly as before: it loads
    "sobamchan/aclsum" and cleans the "document" and "outcome" fields.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        text_columns: Names of the columns whose values are replaced by their
            cleaned version. Columns not listed here are left untouched.

    Returns:
        The object returned by ``map`` on the loaded dataset, i.e. the dataset
        with the selected columns cleaned.
    """
    raw_dataset = load_dataset(dataset_name)

    def apply_cleaning(example):
        # Overwrite each selected field in place with its cleaned text.
        for column in text_columns:
            example[column] = clean_text(example[column])
        return example

    cleaned_dataset = raw_dataset.map(apply_cleaning)
    return cleaned_dataset

if __name__ == "__main__":
    # Destination folder, resolved relative to the current working directory.
    output_path = "../data/cleaned_aclsum"

    # Build the cleaned dataset and show its structure.
    dataset = preprocess_dataset()
    print(dataset)

    # Ensure the destination exists, then write the cleaned dataset to disk.
    os.makedirs(output_path, exist_ok=True)
    dataset.save_to_disk(output_path)

    print(f"Dataset yang telah dibersihkan berhasil disimpan di: {output_path}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'challenge', 'approach', 'outcome'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'document', 'challenge', 'approach', 'outcome'],
        num_rows: 50
    })
    test: Dataset({
        features: ['id', 'document', 'challenge', 'approach', 'outcome'],
        num_rows: 100
    })
})


Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 18139.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 7672.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 14276.05 examples/s]

✅ Dataset yang telah dibersihkan berhasil disimpan di: ../data/cleaned_aclsum



