# Preprocessing dan Reduksi Data

**Preprocessing**

In [None]:
!pip install Sastrawi
!pip install nltk

import pandas as pd
import re
import nltk
from google.colab import files
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

pd.set_option('display.max_rows', 10)

# Upload the review CSV through the Colab file picker.
uploaded = files.upload()

# Read the CSV from the bytes that were just uploaded instead of a fixed
# path. When a file with the same name already exists, Colab stores the new
# upload under a suffixed name (e.g. "ulasan1_thread (8).csv"), so reading
# 'ulasan1_thread.csv' would silently load an older copy.
import io

if uploaded:
    # files.upload() maps each uploaded file name to its raw bytes; the
    # first (normally only) uploaded file is used.
    uploaded_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (dialog cancelled): fall back to the copy on disk.
    df = pd.read_csv('ulasan1_thread.csv')

# Keep only the 'content' column
df = df[['content']].copy()

# 1. Cleaning text
def clean_text(text) -> str:
    """Return *text* reduced to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, emoji, ...) is replaced by a space, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is removed.

    Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty string.
    Any other non-string value is converted with ``str()`` first.
    """
    # A missing review must not go through str(): that would produce the
    # literal word "nan", which would survive cleaning as a real token.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', str(text))  # drop symbols & digits
    single_spaced = re.sub(r'\s+', ' ', letters_only)      # collapse whitespace
    return single_spaced.strip()

df['cleaned'] = df['content'].apply(clean_text)

# 2. Case folding (convert to lowercase)
df['case_folding'] = df['cleaned'].str.lower()

# 3. Tokenization
# Older NLTK releases load the 'punkt' resource, while recent releases make
# word_tokenize look up 'punkt_tab' and raise LookupError if it is missing.
# Fetching both keeps this cell working regardless of the installed version.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
df['tokens'] = df['case_folding'].apply(word_tokenize)

# 4. Stopword removal (Indonesian and English lists combined)
nltk.download('stopwords')
stop_words_indonesia = set(stopwords.words('indonesian'))
stop_words_english = set(stopwords.words('english'))
stop_words = stop_words_indonesia.union(stop_words_english)

df['filtered'] = df['tokens'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# 5. Stemming with the Sastrawi stemmer, applied token by token
factory = StemmerFactory()
stemmer = factory.create_stemmer()
df['stemmed'] = df['filtered'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Optional: drop rows containing missing values & reset the index
df = df.dropna().reset_index(drop=True)

# Show the result of every stage side by side
display(df[['content', 'cleaned', 'case_folding', 'tokens', 'filtered', 'stemmed']])

# Save to a CSV file
df.to_csv('preprocessed1_thread.csv', index=False)



Saving ulasan1_thread.csv to ulasan1_thread (8).csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,cleaned,case_folding,tokens,filtered,stemmed
0,"apk anti toxic sejauh ini, nyaman bgt make nyaa",apk anti toxic sejauh ini nyaman bgt make nyaa,apk anti toxic sejauh ini nyaman bgt make nyaa,"[apk, anti, toxic, sejauh, ini, nyaman, bgt, m...","[apk, anti, toxic, nyaman, bgt, make, nyaa]","[apk, anti, toxic, nyaman, bgt, make, nyaa]"
1,sukses selalu,sukses selalu,sukses selalu,"[sukses, selalu]",[sukses],[sukses]
2,baik,baik,baik,[baik],[],[]
3,mantap..... bossss,mantap bossss,mantap bossss,"[mantap, bossss]","[mantap, bossss]","[mantap, bossss]"
4,ok,ok,ok,[ok],[ok],[ok]
...,...,...,...,...,...,...
495,oke,oke,oke,[oke],[oke],[oke]
496,jelekk,jelekk,jelekk,[jelekk],[jelekk],[jelekk]
497,Menarik,Menarik,menarik,[menarik],[menarik],[tarik]
498,masalah dibagian update bio. mohon diperbaiki,masalah dibagian update bio mohon diperbaiki,masalah dibagian update bio mohon diperbaiki,"[masalah, dibagian, update, bio, mohon, diperb...","[dibagian, update, bio, mohon, diperbaiki]","[bagi, update, bio, mohon, baik]"


**Reduksi Data (Content=Stemmed)**

In [None]:
# Install library
!pip install Sastrawi
!pip install nltk

import pandas as pd
import re
import nltk
from google.colab import files
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Upload the CSV file through the Colab file picker.
uploaded = files.upload()

# Read the CSV (all columns kept) from the bytes that were just uploaded
# instead of a fixed path. When the name is already taken, Colab stores the
# new upload under a suffixed name (e.g. "ulasan1_thread (15).csv"), so
# reading 'ulasan1_thread.csv' would silently load an older copy.
import io

if uploaded:
    # files.upload() maps each uploaded file name to its raw bytes; the
    # first (normally only) uploaded file is used.
    uploaded_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (dialog cancelled): fall back to the copy on disk.
    df = pd.read_csv('ulasan1_thread.csv')
print("Kolom tersedia:", df.columns.tolist())  # list the available columns

# 1. Cleaning
def clean_text(text) -> str:
    """Return *text* reduced to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, emoji, ...) is replaced by a space, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is removed.

    Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty string.
    Any other non-string value is converted with ``str()`` first.
    """
    # A missing review must not go through str(): that would produce the
    # literal word "nan", which would end up as the review's content.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', str(text))
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

df['cleaned'] = df['content'].apply(clean_text)

# 2. Case folding
df['case_folding'] = df['cleaned'].str.lower()

# 3. Tokenization
# Older NLTK releases load the 'punkt' resource, while recent releases make
# word_tokenize look up 'punkt_tab' and raise LookupError if it is missing.
# Fetching both keeps this cell working regardless of the installed version.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
df['tokens'] = df['case_folding'].apply(word_tokenize)

# 4. Stopword removal (Indonesian and English lists combined)
nltk.download('stopwords')
stop_words_indonesia = set(stopwords.words('indonesian'))
stop_words_english = set(stopwords.words('english'))
stop_words = stop_words_indonesia.union(stop_words_english)
df['filtered'] = df['tokens'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# 5. Stemming with the Sastrawi stemmer, applied token by token
factory = StemmerFactory()
stemmer = factory.create_stemmer()
df['stemmed'] = df['filtered'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Join the stemmed tokens back into one space-separated string
df['stemmed_joined'] = df['stemmed'].apply(' '.join)

# Replace the original review text with its stemmed form
df['content'] = df['stemmed_joined']

# Keep only the columns needed downstream; a KeyError is raised here if any
# of them is missing from the uploaded CSV.
df_reduced = df[['userName', 'score', 'content', 'at']].copy()

# Display and save
display(df_reduced[['userName', 'score', 'content', 'at']])
df_reduced.to_csv('preprocessed_reduced3.csv', index=False)



Saving ulasan1_thread.csv to ulasan1_thread (15).csv
Kolom tersedia: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,userName,score,content,at
0,Fania,5,apk anti toxic nyaman bgt make nyaa,2025-05-14 07:21:10
1,Zay Coustom,5,sukses,2025-05-14 06:28:49
2,Ngatno Byl,4,,2025-05-14 05:48:18
3,cici Puspita,2,mantap bossss,2025-05-14 05:05:44
4,Joko Maryanto,5,ok,2025-05-14 03:48:56
...,...,...,...,...
495,Ridwan Zulmahendra,5,oke,2025-04-28 12:03:40
496,Kris Mulianto,1,jelekk,2025-04-28 11:48:55
497,Tambuntua Samosir,5,tarik,2025-04-28 11:29:44
498,D poipoipoiku,2,bagi update bio mohon baik,2025-04-28 11:26:52
