In [82]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

import re
import os
from tqdm import tqdm
import joblib
from gensim.models import KeyedVectors

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AdamW

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

In [27]:
df = pd.read_csv("/kaggle/input/deepseekreviews/datasets_reviews_clean.csv")
df_trans = pd.read_csv("/kaggle/input/deepseekreviews/datasets_reviews_augmented.csv")

# Skema 1 : DistilBERT(Transformer Model) + DistilBertTokenizerFast(Tokenizer)

In [3]:
df_trans.head()

Unnamed: 0,content,score,sentiment
0,I like this app it's really helpful and it's e...,5.0,positive
1,absolute beast.,5.0,positive
2,not fully good once start using it 😕,1.0,negative
3,"all time good app , it's great piece of mind 😉",5.0,positive
4,amazing apps. I love using this but it's hard ...,4.0,positive


In [4]:
df_trans['sentiment'].value_counts()

sentiment
positive    11033
negative     6785
neutral      6785
Name: count, dtype: int64

In [5]:
df_trans["content"] = df_trans["content"].str.lower()
df_trans["content"] = df_trans["content"].str.strip()
df_trans["content"] = df_trans["content"].apply(lambda x: re.sub(r'[^a-zA-Z0-9.,!?\' ]+', '', x))
df_trans["content"] = df_trans["content"].apply(lambda x: re.sub(r"http\S+|www\S+|@\S+", "", x))

In [6]:
sentiment_mapping = {
    'positive': 2,
    'neutral': 1,
    'negative': 0
}

In [7]:
df_trans['sentiment'] = df_trans['sentiment'].map(sentiment_mapping)

In [8]:
df_trans = df_trans.drop(columns=['score'], axis = 1)

In [9]:
df_trans.head()

Unnamed: 0,content,sentiment
0,i like this app it's really helpful and it's e...,2
1,absolute beast.,2
2,not fully good once start using it,0
3,"all time good app , it's great piece of mind",2
4,amazing apps. i love using this but it's hard ...,2


In [10]:
df_trans["content_length"] = df_trans["content"].apply(lambda x: len(x.split()))
df_trans["content_length"].describe()

count    24603.000000
mean        15.005446
std         16.381338
min          1.000000
25%          5.000000
50%          9.000000
75%         18.000000
max        112.000000
Name: content_length, dtype: float64

In [11]:
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [13]:
X = df_trans["content"].tolist()
y = df_trans["sentiment"].tolist()   

In [14]:
# Split 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
max_length = 32  # Sesuaikan dengan distribusi panjang teks dataset
encoded_train = tokenizer(X_train, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
encoded_test = tokenizer(X_test, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

In [16]:
# Convert ke TensorDataset
train_dataset = TensorDataset(encoded_train["input_ids"], encoded_train["attention_mask"], torch.tensor(y_train))
test_dataset = TensorDataset(encoded_test["input_ids"], encoded_test["attention_mask"], torch.tensor(y_test))

In [17]:
batch_size = 32  # Ubah sesuai kapasitas GPU
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [18]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, weight=None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.weight = weight

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, weight=self.weight, reduction="none")
        p_t = torch.exp(-ce_loss)  # Probabilitas prediksi benar
        focal_loss = ((1 - p_t) ** self.gamma) * ce_loss
        return focal_loss.mean()

In [19]:
class_weights = compute_class_weight(class_weight="balanced", classes=[0,1,2], y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [20]:
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = FocalLoss(gamma=3, weight=class_weights.to(device))
# criterion = FocalLoss(gamma=4)
# criterion = nn.CrossEntropyLoss()



In [21]:
num_epochs = 5  # Jumlah epoch
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct, total = 0, 0

    loop = tqdm(train_loader, leave=True)  # Progress bar
    for batch in loop:
        inputs, masks, labels = batch
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=inputs, attention_mask=masks).logits
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Logging
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        # Update tqdm description
        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item(), acc=100 * correct / total)

    print(f"Epoch {epoch+1} Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")

Epoch [1/5]: 100%|██████████| 616/616 [00:56<00:00, 10.91it/s, acc=83.4, loss=0.00249]


Epoch 1 Loss: 0.1034, Accuracy: 83.45%


Epoch [2/5]: 100%|██████████| 616/616 [01:03<00:00,  9.75it/s, acc=91.3, loss=0.00113]


Epoch 2 Loss: 0.0492, Accuracy: 91.35%


Epoch [3/5]: 100%|██████████| 616/616 [01:07<00:00,  9.18it/s, acc=95.3, loss=0.175]   


Epoch 3 Loss: 0.0251, Accuracy: 95.34%


Epoch [4/5]: 100%|██████████| 616/616 [01:06<00:00,  9.25it/s, acc=96.4, loss=0.00151] 


Epoch 4 Loss: 0.0180, Accuracy: 96.38%


Epoch [5/5]: 100%|██████████| 616/616 [01:06<00:00,  9.28it/s, acc=97.9, loss=8.72e-6] 

Epoch 5 Loss: 0.0092, Accuracy: 97.94%





In [22]:
def evaluate(model, dataloader, dataset_type="Test"):
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []
    eval_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

            outputs = model(input_ids=inputs, attention_mask=masks).logits
            loss = criterion(outputs, labels)
            eval_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = 100 * correct / total
    print(f"{dataset_type} Loss: {eval_loss/len(dataloader):.4f}, Accuracy: {accuracy:.2f}%")
    print(f"Classification Report ({dataset_type}):\n")
    print(classification_report(all_labels, all_preds, digits=4))


In [23]:
# Evaluasi untuk Train dan Test
evaluate(model, train_loader, dataset_type="Train")
evaluate(model, test_loader, dataset_type="Test")

Train Loss: 0.0057, Accuracy: 99.22%
Classification Report (Train):

              precision    recall  f1-score   support

           0     0.9853    0.9982    0.9917      5428
           1     0.9915    0.9838    0.9876      5428
           2     0.9969    0.9937    0.9953      8826

    accuracy                         0.9922     19682
   macro avg     0.9912    0.9919    0.9915     19682
weighted avg     0.9922    0.9922    0.9922     19682

Test Loss: 0.1086, Accuracy: 92.14%
Classification Report (Test):

              precision    recall  f1-score   support

           0     0.9095    0.9035    0.9065      1357
           1     0.8801    0.8924    0.8862      1357
           2     0.9545    0.9502    0.9523      2207

    accuracy                         0.9214      4921
   macro avg     0.9147    0.9153    0.9150      4921
weighted avg     0.9216    0.9214    0.9214      4921



In [24]:
# Simpan state_dict dari model
torch.save(model.state_dict(), '/kaggle/working/distilbert_sentiment_state_dict.pt')

# Simpan tokenizer HuggingFace
tokenizer.save_pretrained('/kaggle/working/tokenizer')

('/kaggle/working/tokenizer/tokenizer_config.json',
 '/kaggle/working/tokenizer/special_tokens_map.json',
 '/kaggle/working/tokenizer/vocab.txt',
 '/kaggle/working/tokenizer/added_tokens.json',
 '/kaggle/working/tokenizer/tokenizer.json')

# Skema 2 : Linear SVC vs Random Forest vs Logistic Regression (Machine Learning) + TF-IDF (Vectorizer)

In [28]:
df.head()

Unnamed: 0,cleaned_content,sentiment
0,like app really helpful easy use lot feature h...,2
1,absolute beast,2
2,not fully good start using,0
3,time good app great piece mind,2
4,amazing apps love using hard save long convers...,2


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24597 entries, 0 to 24602
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cleaned_content  24597 non-null  object
 1   sentiment        24597 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 576.5+ KB


In [32]:
df = df.dropna()

In [48]:
df["content_length"] = df["cleaned_content"].apply(lambda x: len(x.split()))
df["content_length"].describe()

count    24597.000000
mean         8.477741
std          8.814165
min          1.000000
25%          3.000000
50%          5.000000
75%         10.000000
max         65.000000
Name: content_length, dtype: float64

In [35]:
X_ml = df['cleaned_content']
y_ml = df['sentiment']

In [36]:
tfidf = TfidfVectorizer(
    max_features=7000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.90
)

In [37]:
X_tfidf = tfidf.fit_transform(X_ml)

In [38]:
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_tfidf, y_ml, test_size=0.2, stratify=y_ml, random_state=42)

In [39]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Linear SVM": LinearSVC()
}

In [41]:
best_model = None
best_acc = 0

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_ml, y_train_ml)
    y_pred_ml = model.predict(X_test_ml)
    acc = accuracy_score(y_test_ml, y_pred_ml)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test_ml, y_pred_ml, target_names=["Negative", "Neutral", "Positive"]))

    if acc > best_acc:
        best_acc = acc
        best_model = model
        best_model_name = name


Training Logistic Regression...
Logistic Regression Accuracy: 0.8624
              precision    recall  f1-score   support

    Negative       0.85      0.84      0.85      1357
     Neutral       0.78      0.82      0.80      1356
    Positive       0.92      0.91      0.91      2207

    accuracy                           0.86      4920
   macro avg       0.85      0.85      0.85      4920
weighted avg       0.86      0.86      0.86      4920


Training Random Forest...
Random Forest Accuracy: 0.8913
              precision    recall  f1-score   support

    Negative       0.87      0.87      0.87      1357
     Neutral       0.85      0.89      0.87      1356
    Positive       0.93      0.90      0.92      2207

    accuracy                           0.89      4920
   macro avg       0.88      0.89      0.89      4920
weighted avg       0.89      0.89      0.89      4920


Training Linear SVM...
Linear SVM Accuracy: 0.8764
              precision    recall  f1-score   support

   

In [43]:
print(f"\nModel terbaik: {best_model_name} dengan akurasi: {best_acc:.4f}")


Model terbaik: Random Forest dengan akurasi: 0.8913


# Skema 3 : BLSTM (Deep Learning) + Tensorflow Tokenizer (Tokenizer) + Embedding

In [46]:
X_dl = df['cleaned_content'].values
y_dl = df['sentiment'].values

In [47]:
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_dl, y_dl, test_size=0.2, stratify=y_dl, random_state=42)


In [49]:
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_dl)

In [50]:
X_train_seq = tokenizer.texts_to_sequences(X_train_dl)
X_test_seq = tokenizer.texts_to_sequences(X_test_dl)


In [51]:
max_len = 32  # sesuai distribusi data kamu sebelumnya
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [81]:
!wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip -q wiki-news-300d-1M.vec.zip

In [83]:
embedding_dim = 300  # FastText default dim
fasttext_model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]


In [86]:
embedding_dim = 300
lstm_units = 128
dropout_rate = 0.5
l2_reg = 1e-3
learning_rate = 1e-4

In [95]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False  # Set True jika ingin fine-tune
    ),
    tf.keras.layers.SpatialDropout1D(0.2),                                   
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True)), 
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units//2, return_sequences=False)),
    tf.keras.layers.GlobalMaxPool1D(),                                       
    tf.keras.layers.BatchNormalization(),                                     
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)),
    tf.keras.layers.Dropout(dropout_rate),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [96]:
optimizer = Adam(learning_rate=learning_rate)

In [97]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [98]:
history = model.fit(
    X_train_pad, y_train_dl,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_pad, y_test_dl),
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    verbose=1
)

Epoch 1/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5377 - loss: 1.1137 - val_accuracy: 0.7638 - val_loss: 0.6776
Epoch 2/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7308 - loss: 0.7269 - val_accuracy: 0.8053 - val_loss: 0.5779
Epoch 3/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7646 - loss: 0.6562 - val_accuracy: 0.8081 - val_loss: 0.5494
Epoch 4/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7848 - loss: 0.6105 - val_accuracy: 0.8114 - val_loss: 0.5482
Epoch 5/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7913 - loss: 0.5880 - val_accuracy: 0.8130 - val_loss: 0.5192
Epoch 6/50
[1m615/615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8046 - loss: 0.5605 - val_accuracy: 0.8240 - val_loss: 0.5042
Epoch 7/50
[1m615/615[0m 

In [99]:
y_pred_probs = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [100]:
print(classification_report(y_test_dl, y_pred_classes, digits=4))

              precision    recall  f1-score   support

           0     0.9016    0.8504    0.8752      1357
           1     0.8310    0.8665    0.8484      1356
           2     0.9232    0.9311    0.9271      2207

    accuracy                         0.8911      4920
   macro avg     0.8852    0.8827    0.8836      4920
weighted avg     0.8918    0.8911    0.8911      4920

