# In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk

# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# Stopword sources: Sastrawi's built-in list and NLTK's Indonesian list.
factory = StopWordRemoverFactory()
stop_words_sastrawi = set(factory.get_stop_words())
stop_words_nltk = set(stopwords.words('indonesian'))

# Extra hand-picked stopwords (the retweet marker and Shopee-related tokens).
manual_stopwords = {
    "rt", "shopee", "makinsayangshopee", "belanjadishopee", "shopeedihati",
}

# Additional stopwords stored in a text file.
txt_stopword_path = r"D:\Tugas Akhir\Final\stopwords.txt"
txt_stopword = pd.read_csv(txt_stopword_path, names=["stopwords"], header=None)
# Collect the whitespace-separated words of EVERY row. Reading only row 0
# silently ignored all other lines of the file; for a single-line file the
# resulting set is unchanged.
txt_stopwords = {
    word
    for row in txt_stopword["stopwords"].dropna().astype(str)
    for word in row.split()
}

# Union of all sources; this is the set consulted by remove_stopwords().
all_stopwords = stop_words_sastrawi | stop_words_nltk | manual_stopwords | txt_stopwords

# Stopword filter
def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``all_stopwords``.

    The surviving tokens keep their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in all_stopwords:
            continue
        kept.append(token)
    return kept

# Create the Sastrawi stemmer once at module level; preprocess_text() reuses it.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load the slang dictionary: a ';'-delimited, latin1-encoded CSV with the
# columns 'slang' and 'normalisasi', turned into a slang -> replacement map.
slang_dictionary_path = r"D:\Tugas Akhir\Final\Kamus Slang.csv"
slang_df = pd.read_csv(slang_dictionary_path, encoding='latin1', delimiter=';')
slang_dict = {
    slang: normalized
    for slang, normalized in zip(slang_df['slang'], slang_df['normalisasi'])
}

# Slang normalisation, applied before tokenisation
def normalize_slang(text):
    """Replace every word of *text* that has an entry in ``slang_dict``.

    The text is split on whitespace; words without an entry are kept as they
    are. The words are re-joined with single spaces, so runs of whitespace in
    the input collapse to one space.
    """
    replaced = []
    for token in text.split():
        replaced.append(slang_dict.get(token, token))
    return ' '.join(replaced)

# Main preprocessing function
def preprocess_text(text, scenario=1):
    """Clean one document and return it as a single space-joined string.

    Steps, in order: lowercase, regex clean-up (URLs, @mentions, repeated
    characters, punctuation, non a-z characters, single letters), slang
    normalisation, tokenisation, stopword removal, and optional stemming.

    Args:
        text: The raw document. Any non-string value yields "".
        scenario: 2 applies the Sastrawi stemmer to the result; any other
            value skips stemming.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Non-string input (for example a missing value) becomes an empty document.
    if not isinstance(text, str):
        return ""

    # Ordered (pattern, replacement) pairs. The order matters: mentions and
    # URLs must be removed while '@', ':' and '/' are still present.
    # NOTE(review): punctuation is deleted, not replaced by a space, so
    # "a,b" becomes "ab" -- confirm this word merging is intended.
    cleanup_steps = (
        (r'http\S+|www\S+', ''),    # URLs
        (r'@\w+', ''),              # @username mentions
        (r'(\w)\1{2,}', r'\1\1'),   # 3+ repeats of a character -> 2
        (r'[^\w\s]', ''),           # punctuation
        (r'[^a-z\s]', ''),          # anything but a-z and whitespace
        (r'\b[a-zA-Z]\b', ''),      # stand-alone single letters
    )

    cleaned = text.lower()
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Slang is normalised on the raw string, before tokenisation.
    cleaned = normalize_slang(cleaned)

    kept_tokens = remove_stopwords(word_tokenize(cleaned))
    result = ' '.join(kept_tokens)

    # Scenario 2 additionally stems the stopword-free text.
    if scenario == 2:
        result = stemmer.stem(result)

    return result.strip()

# Load the dataset
file_path = r"D:\Tugas Akhir\Final\tweetsfinal.csv"
df = pd.read_csv(file_path)

# Validate every column used below ('username', 'label', 'tweet'), not just
# 'tweet', so a missing column fails here with a clear message instead of a
# bare KeyError further down.
required_columns = ['username', 'label', 'tweet']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    missing_text = ", ".join(repr(col) for col in missing_columns)
    raise ValueError(f"Kolom {missing_text} tidak ditemukan dalam dataset.")

# Unify the spelling variants of the extroversion label
df['label'] = df['label'].replace({"Extraversion": "Extroversion", "Ekstraversion": "Extroversion"})

# Drop rows without tweet text: they would pass the retweet filter below and
# then make ' '.join(...) raise a TypeError.
df = df.dropna(subset=['tweet'])

# Drop retweets. The tweets are not lowercased at this point, so the prefix
# check is done case-insensitively ("RT ..." as well as "rt ...").
df = df[~df['tweet'].str.lower().str.startswith('rt ', na=False)]

# Keep at most the first 100 tweets per (username, label) pair
df_limited = df.groupby(['username', 'label']).head(100).reset_index(drop=True)

# Concatenate each user's tweets into one space-separated document
df_grouped = df_limited.groupby(['username', 'label'])['tweet'].apply(lambda x: ' '.join(x)).reset_index()

# One output file per scenario (scenario 1 -> first path, scenario 2 -> second).
# Raw strings take single backslashes, consistent with the other paths above.
output_paths = [
    r"D:\Tugas Akhir\Final\skenario_1_final.csv",
    r"D:\Tugas Akhir\Final\skenario_2_final.csv",
]

for scenario, output_path in enumerate(output_paths, start=1):
    # Run the preprocessing for this scenario
    df_grouped['processed_text'] = df_grouped['tweet'].apply(lambda x: preprocess_text(x, scenario=scenario))

    # Only these columns are written out
    df_subset = df_grouped[['username', 'label', 'processed_text']]

    # Save the result
    df_subset.to_csv(output_path, index=False)
    print(f"Skenario {scenario} berhasil disimpan ke: {output_path}")

print("\nPreprocessing untuk semua skenario selesai.")