### Load data

In [1]:
import pandas as pd
# Read the pre-split train and test sets from their CSV files.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../dataset/train_data.csv", "../dataset/test_data.csv")
)

# Pull the text column ("berita") and the target column ("label") out of each
# frame as plain Python lists.
train_text, train_label = (df_train[col].tolist() for col in ("berita", "label"))
test_text, test_label = (df_test[col].tolist() for col in ("berita", "label"))

In [2]:
# Evaluation metrics used by the model cells further down
from sklearn.metrics import accuracy_score, classification_report
# TF-IDF vectorizer used to turn the raw texts into a feature matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the test texts so the test set does not influence the features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

### 1. Logistic Regression Model
-> Memodelkan hubungan antara fitur (input) dengan probabilitas suatu keluaran diskrit tertentu. Digunakan untuk memprediksi terjadi atau tidaknya suatu kejadian berdasarkan nilai prediktor.

In [3]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be chained).
model = LogisticRegression().fit(X_train, train_label)

# Score the test set with the fitted model
predictions = model.predict(X_test)
accuracy = accuracy_score(test_label, predictions)
report = classification_report(test_label, predictions)

# Show the overall accuracy followed by the per-class report
print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

Accuracy: 0.9068627450980392
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       599
           1       0.91      0.90      0.91       625

    accuracy                           0.91      1224
   macro avg       0.91      0.91      0.91      1224
weighted avg       0.91      0.91      0.91      1224



### 2. SVM (Support Vector Machine) Model
-> Mencari hyperplane terbaik dengan memaksimalkan jarak antar kelas. Support Vector merupakan data terluar paling dekat dengan hyperplane tersebut.

In [4]:
from sklearn.svm import SVC

# Build a default-configured support vector classifier and fit it on the
# TF-IDF training matrix.
svm_model = SVC()
svm_model.fit(X=X_train, y=train_label)

# Label the test set, then compare against the true labels
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=test_label, y_pred=svm_predictions)
svm_report = classification_report(y_true=test_label, y_pred=svm_predictions)

# Show the overall accuracy followed by the per-class report
print(f"SVM Accuracy: {svm_accuracy}")
print("SVM Classification Report:", svm_report, sep="\n")

SVM Accuracy: 0.9174836601307189
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       599
           1       0.93      0.91      0.92       625

    accuracy                           0.92      1224
   macro avg       0.92      0.92      0.92      1224
weighted avg       0.92      0.92      0.92      1224



### 3. Random Forest Model
-> Menggabungkan hasil beberapa decision tree untuk mencapai satu hasil

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model.
# random_state is pinned because the forest is built from random bootstrap
# samples and random feature subsets; without a fixed seed the accuracy
# reported below changes on every Restart & Run All.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, train_label)

# Predict on the test set
rf_predictions = rf_model.predict(X_test)

# Evaluate the model against the true test labels
rf_accuracy = accuracy_score(test_label, rf_predictions)
rf_report = classification_report(test_label, rf_predictions)

print(f"Random Forest Accuracy: {rf_accuracy}")
print("Random Forest Classification Report:")
print(rf_report)

Random Forest Accuracy: 0.863562091503268
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.87       599
           1       0.90      0.83      0.86       625

    accuracy                           0.86      1224
   macro avg       0.87      0.86      0.86      1224
weighted avg       0.87      0.86      0.86      1224



### 4. GBM (Gradient Boosting Machines) Model
-> Membangun model prediktif secara bertahap, biasanya dalam bentuk decision tree. Pada setiap iterasi, GBM menambahkan model baru yang mengoreksi kesalahan prediksi model sebelumnya.

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Create and train the GBM model.
# random_state is pinned so that the estimator's internal randomness is
# seeded and the accuracy reported below is reproducible across
# Restart & Run All.
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train, train_label)

# Predict on the test set
gbm_predictions = gbm_model.predict(X_test)

# Evaluate the model against the true test labels
gbm_accuracy = accuracy_score(test_label, gbm_predictions)
gbm_report = classification_report(test_label, gbm_predictions)

print(f"GBM Accuracy: {gbm_accuracy}")
print("GBM Classification Report:")
print(gbm_report)

GBM Accuracy: 0.8014705882352942
GBM Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       599
           1       0.90      0.68      0.78       625

    accuracy                           0.80      1224
   macro avg       0.82      0.80      0.80      1224
weighted avg       0.82      0.80      0.80      1224



### 5. LSTM (Long Short-Term Memory) Model
-> Merupakan lapisan RNN (Recurrent Neural Network) yang mempelajari ketergantungan jangka panjang antar langkah waktu pada data deret waktu dan data sekuensial.

In [7]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.optimizers import Adam

# Map every text to a list of integer word indices. The tokenizer is fitted
# on the training texts only and is configured with num_words=512.
tokenizer = Tokenizer(split=' ', num_words=512)
tokenizer.fit_on_texts(train_text)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (train_text, test_text)
)

# Pad the training sequences to a common length, then force the test
# sequences to that same length so both match the model's input size.
X_train_pad = pad_sequences(X_train_seq)
X_test_pad = pad_sequences(X_test_seq, maxlen=X_train_pad.shape[1])

# Convert the label lists to NumPy arrays before handing them to Keras
trainArr_label, testArr_label = np.array(train_label), np.array(test_label)

# Assemble the network: embedding -> spatial dropout -> LSTM -> sigmoid unit.
# NOTE: `model` was bound to the logistic-regression estimator earlier in the
# notebook; from this point on it refers to the LSTM network instead.
model = Sequential([
    Embedding(input_dim=512, output_dim=128, input_length=X_train_pad.shape[1]),
    SpatialDropout1D(0.5),
    LSTM(100, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs. The test split is passed as validation data, so the
# per-epoch val_* figures in the log are computed on the test set.
model.fit(
    X_train_pad,
    trainArr_label,
    validation_data=(X_test_pad, testArr_label),
    epochs=5,
    batch_size=16,
    verbose=2,
)

# Final test-set loss and accuracy.
# NOTE: this rebinds `accuracy`, which previously held the
# logistic-regression score.
loss, accuracy = model.evaluate(X_test_pad, testArr_label, verbose=2)
print(f"LSTM Accuracy: {accuracy}")

Epoch 1/5
268/268 - 9s - loss: 0.6925 - accuracy: 0.5303 - val_loss: 0.6917 - val_accuracy: 0.5940 - 9s/epoch - 35ms/step
Epoch 2/5
268/268 - 5s - loss: 0.6912 - accuracy: 0.5581 - val_loss: 0.6904 - val_accuracy: 0.6078 - 5s/epoch - 20ms/step
Epoch 3/5
268/268 - 5s - loss: 0.6897 - accuracy: 0.6008 - val_loss: 0.6887 - val_accuracy: 0.6495 - 5s/epoch - 19ms/step
Epoch 4/5
268/268 - 5s - loss: 0.6879 - accuracy: 0.6265 - val_loss: 0.6866 - val_accuracy: 0.6773 - 5s/epoch - 19ms/step
Epoch 5/5
268/268 - 5s - loss: 0.6848 - accuracy: 0.6573 - val_loss: 0.6836 - val_accuracy: 0.7075 - 5s/epoch - 19ms/step
39/39 - 0s - loss: 0.6836 - accuracy: 0.7075 - 178ms/epoch - 5ms/step
LSTM Accuracy: 0.7075163125991821


In [None]:
# Collect every model's test accuracy into one table and export it to Excel.
from pathlib import Path

# `accuracy` does NOT hold the logistic-regression score at this point: the
# LSTM cell rebinds it via `loss, accuracy = model.evaluate(...)`. Using
# `accuracy` for both rows would silently report the LSTM score as the
# logistic-regression score. The logistic-regression `predictions` are never
# rebound, so recompute its accuracy from them, and give the LSTM value an
# unambiguous name.
lr_accuracy = accuracy_score(test_label, predictions)
lstm_accuracy = accuracy

results = {
    "Model": ["Logistic Regression", "SVM", "Random Forest", "GBM", "LSTM"],
    "Accuracy": [lr_accuracy, svm_accuracy, rf_accuracy, gbm_accuracy, lstm_accuracy]
}

# Create a DataFrame to compare all the models
results_df = pd.DataFrame(results)

# Save the results to an Excel file. to_excel does not create missing
# folders, so make sure the target directory exists first.
output_path = Path("../model/model_comparison.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_excel(output_path, index=False)