In [None]:
# import library
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from wordcloud import WordCloud

In [None]:
# Load the raw dataset from its CSV file
raw_csv_path = '/content/drive/MyDrive/dataset/dataset.csv'
df = pd.read_csv(raw_csv_path)

# Emoji removal helper.
# The pattern is compiled once here rather than on every call.
# Every range below is a symbol/pictograph block. The previous single range
# U+24C2..U+1F251 also swallowed CJK ideographs, kana, Hangul and other
# ordinary letters, so it has been split into the symbol blocks it was meant
# to cover. Duplicate / overlapping ranges were dropped.
_EMOJI_PATTERN = re.compile(
    "["
    "\U000024C2"               # circled latin capital M
    "\U000025A0-\U000027BF"    # geometric shapes, miscellaneous symbols, dingbats
    "\U00002934\U00002935"     # curved arrows used as emoji
    "\U00002B00-\U00002BFF"    # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK symbols used as emoji
    "\U0000FE00-\U0000FE0F"    # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"    # mahjong/cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U0001F300-\U0001F5FF"    # symbols & pictographs
    "\U0001F600-\U0001F64F"    # emoticons
    "\U0001F680-\U0001F6FF"    # transport & map symbols
    "\U0001F900-\U0001F9FF"    # supplemental symbols and pictographs
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Matched characters are removed outright (not replaced by a space), so
    surrounding whitespace is left exactly as it was. Letters and ideographs
    of any script are preserved.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without any character matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub('', text)

# Text cleaning: case folding, punctuation removal, emoji removal and number removal
def preprocess_text(text):
    """Normalise one document for bag-of-words feature extraction.

    Steps, in order: lowercase, delete punctuation (including ``_``), delete
    emoji via ``remove_emoji``, delete digit runs, then collapse every run of
    whitespace to a single space.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. the NaN/None of a missing
        cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``""`` for non-string input.
    """
    # Missing cells reach .apply() as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ""
    # Case folding
    text = text.lower()
    # Punctuation removal; "_" is listed explicitly because \w counts it as a
    # word character, so [^\w\s] alone would keep it
    text = re.sub(r'[^\w\s]|_', '', text)
    # Emoji removal
    text = remove_emoji(text)
    # Number removal
    text = re.sub(r'\d+', '', text)
    # Collapse whitespace last so the removals above cannot leave double spaces
    text = re.sub(r'\s+', ' ', text)
    return text

# Run the cleaning function over every entry of the text column
cleaned_text = df['text'].apply(preprocess_text)
df['text'] = cleaned_text

# Write the cleaned dataframe to a CSV file, leaving out the index column
clean_csv_path = '/content/drive/MyDrive/dataset/datasetclear.csv'
df.to_csv(clean_csv_path, index=False)

In [None]:
# Load the cleaned dataset
df = pd.read_csv('/content/drive/MyDrive/dataset/datasetclear.csv')
# A document that was emptied by cleaning is stored as an empty CSV field,
# which read_csv parses as NaN by default. CountVectorizer refuses NaN
# documents, so turn those back into empty strings.
df['text'] = df['text'].fillna('')
X = df['text']
y = df['label']

In [None]:
# nas N-gram 80:20 Soft Voting
# Split the raw documents first (80% train / 20% test) so the held-out set
# cannot influence the vocabulary. The row partition depends only on the
# number of samples and random_state, so it is the same partition as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

# Feature extraction with n-grams (unigrams + bigrams): the vocabulary is
# learned from the training documents only, then reused to encode the test set
cv = CountVectorizer(ngram_range=(1, 2))
X_train = cv.fit_transform(X_train_text)
X_test = cv.transform(X_test_text)

# Base classifiers
svm = LinearSVC(max_iter=10000)
nb = MultinomialNB()
dt = DecisionTreeClassifier()

# Bagging ensemble around each base classifier (33 estimators each, fixed seed)
# NOTE(review): LinearSVC has no predict_proba, so soft voting below depends on
# whatever probabilities BaggingClassifier derives for it - confirm intended.
svm_bagging = BaggingClassifier(estimator=svm, n_estimators=33, random_state=22)
nb_bagging = BaggingClassifier(estimator=nb, n_estimators=33, random_state=22)
dt_bagging = BaggingClassifier(estimator=dt, n_estimators=33, random_state=22)

# Soft-voting ensemble over the three bagged models
ensemble_nas = VotingClassifier(
    estimators=[('svm', svm_bagging), ('nb', nb_bagging), ('dt', dt_bagging)],
    voting='soft',
)

# Train on the training split and predict the held-out split
ensemble_nas.fit(X_train, y_train)
y_pred = ensemble_nas.predict(X_test)

# Print the classification report (4 decimal places)
print("Ensemble Bagging:\n", classification_report(y_test, y_pred, digits=4))