In [1]:
import os, sys
# Add the parent directory to the import path and make it the working directory.
# The './dataset/...' paths used later in the notebook are resolved against this directory.
# NOTE(review): both statements are relative to the *current* working directory, so
# re-running this cell without restarting the kernel moves one more level up each time.
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, BertTokenizer
# from nltk.tokenize import TweetTokenizer

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils_kazee3 import DocumentSentimentDataset, DocumentSentimentDataLoader


In [2]:
# Report the CUDA environment seen by this kernel.
print("Is cuda available?", torch.cuda.is_available())
print("Device count?", torch.cuda.device_count())
# The device-specific queries initialise CUDA and raise when no GPU is usable,
# so only run them when CUDA is actually available.
if torch.cuda.is_available():
    print("Current device?", torch.cuda.current_device())
    print("Device name? ", torch.cuda.get_device_name(torch.cuda.current_device()))

Is cuda available? True
Device count? 4
Current device? 0
Device name?  NVIDIA RTX A5000


In [3]:
# Make GPU index 2 the current CUDA device for this process.
# NOTE(review): hard-coded index; the output above reports 4 visible devices.
torch.cuda.set_device(2)

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies the same integer seed, in order, to Python's ``random`` module,
    NumPy's global RNG, PyTorch's default generator and PyTorch's CUDA RNG.

    Args:
        seed: integer seed value.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)


def count_param(module, trainable=False):
    """Return the total number of scalar parameters in ``module``.

    Args:
        module: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: when truthy, count only parameters whose ``requires_grad`` is set.

    Returns:
        Sum of ``numel()`` over the selected parameters.
    """
    params = module.parameters()
    if trainable:
        params = filter(lambda p: p.requires_grad, params)
    return sum(p.numel() for p in params)


def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: object exposing an iterable ``param_groups`` of dicts with an ``'lr'`` key.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are no groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)


def metrics_to_string(metric_dict):
    """Render a metrics mapping as space-separated ``NAME:value`` pairs.

    Args:
        metric_dict: mapping of metric name to a numeric value.

    Returns:
        A single string with each value formatted to two decimals, in the
        mapping's iteration order; an empty string for an empty mapping.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [5]:
# Seed all RNGs once, ahead of the model-loading and data-loader cells below.
set_seed(25072024)

## Load Model

In [6]:
# Checkpoint the tokenizer, config and model are loaded from.
# The commented-out line is an alternative model identifier.
# base_model = 'indobenchmark/indobert-lite-large-p1'
base_model = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/save_model/sent_statement_llp2_240724'
# Directory the training cells save checkpoints to.
# NOTE(review): identical to `base_model`, so saving overwrites the checkpoint that was loaded.
model_save = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/save_model/sent_statement_llp2_240724'

# Train / validation / test splits (read below as headerless, tab-separated files).
train_dataset_path = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/data_train/train_data_statement.tsv'
valid_dataset_path = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/data_train/valid_data_statement_baru.tsv'
test_dataset_path = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/data_train/test_data_statement_baru.tsv'
# test_dataset_path = './dataset/ormrev/test_news.tsv'

In [7]:
def _read_split(tsv_path):
    """Read one headerless, tab-separated split and name its two columns."""
    frame = pd.read_csv(tsv_path, sep='\t', header=None)
    frame.columns = ['statements', 'sentiment']
    return frame


# Load the three splits in the same order as before: train, validation, test.
train = _read_split(train_dataset_path)
valid = _read_split(valid_dataset_path)
test = _read_split(test_dataset_path)

In [8]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256 entries, 0 to 1255
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   statements  1256 non-null   object
 1   sentiment   1256 non-null   object
dtypes: object(2)
memory usage: 19.8+ KB


In [9]:
# Column dtypes and non-null counts of the validation split.
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   statements  157 non-null    object
 1   sentiment   157 non-null    object
dtypes: object(2)
memory usage: 2.6+ KB


In [10]:
# Column dtypes and non-null counts of the test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   statements  158 non-null    object
 1   sentiment   158 non-null    object
dtypes: object(2)
memory usage: 2.6+ KB


In [11]:
# Load the tokenizer and config from the checkpoint directory `base_model`.
# NOTE(review): BertTokenizer is used explicitly although the loaded model is an ALBERT
# classifier (see the model repr printed below) — presumably intentional for this
# checkpoint; confirm.
tokenizer = BertTokenizer.from_pretrained(base_model)
config = AutoConfig.from_pretrained(base_model)
# Set the number of output labels to the dataset's label count.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence-classification model from the same checkpoint, using the config above.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model, config=config)

In [12]:
# Display the module tree of the loaded model.
model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=1024, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_featur

In [13]:
# Total parameter count of the model (trainable defaults to False, so all parameters are counted).
count_param(model)

17687043

In [14]:
# Build dataset objects for the train and validation TSV files, sharing the tokenizer loaded above.
# lowercase=True is forwarded to DocumentSentimentDataset — presumably the text is
# lowercased before tokenization; confirm in utils.data_utils_kazee3.
train_dataset = DocumentSentimentDataset(
    train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(
    valid_dataset_path, tokenizer, lowercase=True)

In [24]:
# Batch the datasets: training uses batch_size=16 with shuffling, validation uses
# batch_size=8 without shuffling. Both pass max_seq_len=512 and num_workers=2.
# NOTE(review): how max_seq_len is applied (truncation vs. padding) is decided inside
# DocumentSentimentDataLoader — confirm there.
train_loader = DocumentSentimentDataLoader(
    dataset=train_dataset, max_seq_len=512, batch_size=16, num_workers=2, shuffle=True)
valid_loader = DocumentSentimentDataLoader(
    dataset=valid_dataset, max_seq_len=512, batch_size=8, num_workers=2, shuffle=False)

In [16]:
# Label-name -> index and index -> label-name lookup tables exposed by the dataset class.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Train Model

In [25]:
# Move the model to the current CUDA device *before* building the optimizer, so the
# optimizer is constructed over the parameters in the location they will be trained in.
model = model.cuda()
# Adam with a small, constant learning rate (no scheduler is attached in this notebook).
optimizer = optim.Adam(model.parameters(), lr=2e-6)

In [35]:
# Train
# Fine-tune `model` for `n_epochs` epochs. After each epoch, evaluate on the validation
# loader and save model/tokenizer/config whenever the validation F1 beats the best value
# seen so far in this run.
# NOTE(review): the recorded output below shows 314 training batches per epoch, but the
# loader cell as written (batch_size=16 over 1256 rows) yields 79, and this cell's
# execution count (35) is higher than the next training cell's (26). The output was
# presumably produced out of order with a different loader configuration — confirm with
# Restart & Run All.
n_epochs = 10
# Reset on every execution of this cell, so epoch 1 saves as soon as its F1 is above 0.
best_f1 = 0
for epoch in range(n_epochs):
    # Training phase: train mode, autograd on (validation below switches it off).
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the helper)
        loss, batch_hyp, batch_label = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Accumulate predictions and gold labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far.
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch + 1),
                                                                                   total_train_loss / (i + 1), get_lr(optimizer)))

    # Calculate train metric
    # `i` is the last batch index from the loop above, so i + 1 is the number of batches.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch + 1),
                                                             total_train_loss / (i + 1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    # NOTE(review): autograd is switched off globally here and only re-enabled at the top
    # of the next epoch, so it is still disabled when this cell finishes.
    torch.set_grad_enabled(False)

    # NOTE(review): total_correct and total_labels are initialised but never used in this cell.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # Last batch element; assigned but not used below.
        batch_seq = batch_data[-1]
        loss, batch_hyp, batch_label = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device='cuda')

        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        # Recomputed over everything accumulated so far, to refresh the progress bar text.
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(
            total_loss / (i + 1), metrics_to_string(metrics)))

    # Final validation metrics for this epoch over the full validation set.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch + 1),
                                                   total_loss / (i + 1), metrics_to_string(metrics)))

    # Keep the checkpoint with the highest validation F1. The comparison uses the unrounded
    # value, which is why "current best" can appear next to an unchanged two-decimal F1 in the log.
    if metrics['F1'] > best_f1:
        best_f1 = metrics['F1']
        # NOTE(review): `model_save` is the same directory as `base_model`, so this
        # overwrites the checkpoint the run started from.
        model.save_pretrained(model_save)
        tokenizer.save_pretrained(model_save)
        config.save_pretrained(model_save)

        print('current best')

(Epoch 1) TRAIN LOSS:0.0269 LR:0.00000200: 100%|██████████| 314/314 [00:39<00:00,  7.86it/s]


(Epoch 1) TRAIN LOSS:0.0269 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8293 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83: 100%|██████████| 20/20 [00:01<00:00, 11.98it/s]


(Epoch 1) VALID LOSS:0.8293 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83
current best


(Epoch 2) TRAIN LOSS:0.0211 LR:0.00000200: 100%|██████████| 314/314 [00:36<00:00,  8.63it/s]


(Epoch 2) TRAIN LOSS:0.0211 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8866 ACC:0.79 F1:0.79 REC:0.81 PRE:0.81: 100%|██████████| 20/20 [00:01<00:00, 11.15it/s]


(Epoch 2) VALID LOSS:0.8866 ACC:0.79 F1:0.79 REC:0.81 PRE:0.81


(Epoch 3) TRAIN LOSS:0.0204 LR:0.00000200: 100%|██████████| 314/314 [00:33<00:00,  9.37it/s]


(Epoch 3) TRAIN LOSS:0.0204 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.7094 ACC:0.83 F1:0.82 REC:0.82 PRE:0.84: 100%|██████████| 20/20 [00:01<00:00, 11.23it/s]


(Epoch 3) VALID LOSS:0.7094 ACC:0.83 F1:0.82 REC:0.82 PRE:0.84
current best


(Epoch 4) TRAIN LOSS:0.0205 LR:0.00000200: 100%|██████████| 314/314 [00:35<00:00,  8.74it/s]


(Epoch 4) TRAIN LOSS:0.0205 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.6510 ACC:0.83 F1:0.83 REC:0.82 PRE:0.84: 100%|██████████| 20/20 [00:02<00:00,  8.30it/s]


(Epoch 4) VALID LOSS:0.6510 ACC:0.83 F1:0.83 REC:0.82 PRE:0.84
current best


(Epoch 5) TRAIN LOSS:0.0187 LR:0.00000200: 100%|██████████| 314/314 [00:35<00:00,  8.97it/s]


(Epoch 5) TRAIN LOSS:0.0187 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.7724 ACC:0.80 F1:0.81 REC:0.82 PRE:0.82: 100%|██████████| 20/20 [00:01<00:00, 10.59it/s]


(Epoch 5) VALID LOSS:0.7724 ACC:0.80 F1:0.81 REC:0.82 PRE:0.82


(Epoch 6) TRAIN LOSS:0.0193 LR:0.00000200: 100%|██████████| 314/314 [00:35<00:00,  8.83it/s]


(Epoch 6) TRAIN LOSS:0.0193 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.6824 ACC:0.83 F1:0.83 REC:0.83 PRE:0.83: 100%|██████████| 20/20 [00:01<00:00, 15.00it/s]


(Epoch 6) VALID LOSS:0.6824 ACC:0.83 F1:0.83 REC:0.83 PRE:0.83
current best


(Epoch 7) TRAIN LOSS:0.0204 LR:0.00000200: 100%|██████████| 314/314 [00:38<00:00,  8.13it/s]


(Epoch 7) TRAIN LOSS:0.0204 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.6581 ACC:0.87 F1:0.86 REC:0.87 PRE:0.86: 100%|██████████| 20/20 [00:02<00:00,  9.59it/s]


(Epoch 7) VALID LOSS:0.6581 ACC:0.87 F1:0.86 REC:0.87 PRE:0.86
current best


(Epoch 8) TRAIN LOSS:0.0177 LR:0.00000200: 100%|██████████| 314/314 [00:32<00:00,  9.54it/s]


(Epoch 8) TRAIN LOSS:0.0177 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.7620 ACC:0.85 F1:0.85 REC:0.86 PRE:0.84: 100%|██████████| 20/20 [00:02<00:00,  8.65it/s]


(Epoch 8) VALID LOSS:0.7620 ACC:0.85 F1:0.85 REC:0.86 PRE:0.84


(Epoch 9) TRAIN LOSS:0.0186 LR:0.00000200: 100%|██████████| 314/314 [00:33<00:00,  9.43it/s]


(Epoch 9) TRAIN LOSS:0.0186 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.6683 ACC:0.86 F1:0.86 REC:0.86 PRE:0.85: 100%|██████████| 20/20 [00:01<00:00, 10.95it/s]


(Epoch 9) VALID LOSS:0.6683 ACC:0.86 F1:0.86 REC:0.86 PRE:0.85


(Epoch 10) TRAIN LOSS:0.0191 LR:0.00000200: 100%|██████████| 314/314 [00:34<00:00,  8.97it/s]


(Epoch 10) TRAIN LOSS:0.0191 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.6872 ACC:0.85 F1:0.84 REC:0.85 PRE:0.84: 100%|██████████| 20/20 [00:02<00:00,  9.77it/s]

(Epoch 10) VALID LOSS:0.6872 ACC:0.85 F1:0.84 REC:0.85 PRE:0.84





In [26]:
from sklearn.metrics import classification_report

# Compute and print the per-class classification report.
def display_classification_report(list_hyp, list_label):
    """Print scikit-learn's classification report for string-labelled predictions.

    Args:
        list_hyp: predicted label names; each must be a key of
            ``DocumentSentimentDataset.LABEL2INDEX``.
        list_label: gold label names, aligned element-wise with ``list_hyp``.

    Raises:
        KeyError: if a label name is not present in ``LABEL2INDEX``.
    """
    label2index = DocumentSentimentDataset.LABEL2INDEX
    list_hyp_idx = [label2index[hyp] for hyp in list_hyp]
    list_label_idx = [label2index[label] for label in list_label]

    class_indices = list(range(DocumentSentimentDataset.NUM_LABELS))
    target_names = [DocumentSentimentDataset.INDEX2LABEL[i] for i in class_indices]

    print("\nClassification Report:")
    # Pass labels= explicitly so the report always covers every known class. Without it,
    # scikit-learn infers the classes from the data and rejects `target_names` when a
    # class is missing from both the gold labels and the predictions.
    print(classification_report(
        list_label_idx, list_hyp_idx, labels=class_indices, target_names=target_names))

# Train
# Fine-tune `model` for `n_epochs` epochs, validating after each epoch. Whenever the
# validation F1 beats the best value of this run, save model/tokenizer/config and print a
# per-class classification report.
# NOTE(review): apart from the report, this loop duplicates the previous training cell and
# uses the same global `model` and `optimizer`; running both cells trains the same model
# for further epochs rather than starting a fresh run.
n_epochs = 10
# Reset on every execution of this cell, so epoch 1 saves as soon as its F1 is above 0.
best_f1 = 0

for epoch in range(n_epochs):
    # Training phase: train mode, autograd on (validation below switches it off).
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the helper)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss += tr_loss

        # Accumulate predictions and gold labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far.
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch + 1),
                                                                                   total_train_loss / (i + 1), get_lr(optimizer)))

    # Calculate train metric
    # `i` is the last batch index from the loop above, so i + 1 is the number of batches.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch + 1),
                                                             total_train_loss / (i + 1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    # NOTE(review): autograd is switched off globally here and only re-enabled at the top
    # of the next epoch, so it is still disabled when this cell finishes.
    torch.set_grad_enabled(False)

    # NOTE(review): total_correct and total_labels are initialised but never used in this cell.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # Last batch element; assigned but not used below.
        batch_seq = batch_data[-1]
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Calculate total loss
        valid_loss = loss.item()
        total_loss += valid_loss

        # Calculate evaluation metrics
        # Recomputed over everything accumulated so far, to refresh the progress bar text.
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(
            total_loss / (i + 1), metrics_to_string(metrics)))

    # Final validation metrics for this epoch over the full validation set.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch + 1),
                                                   total_loss / (i + 1), metrics_to_string(metrics)))

    # Keep the checkpoint with the highest validation F1 (compared on the unrounded value).
    if metrics['F1'] > best_f1:
        best_f1 = metrics['F1']
        # NOTE(review): `model_save` is the same directory as `base_model`, so this
        # overwrites the checkpoint the run started from.
        model.save_pretrained(model_save)
        tokenizer.save_pretrained(model_save)
        config.save_pretrained(model_save)

        print('current best')
        # Display classification report for the current best model
        display_classification_report(list_hyp, list_label)


(Epoch 1) TRAIN LOSS:0.0191 LR:0.00000200: 100%|██████████| 79/79 [00:25<00:00,  3.09it/s]


(Epoch 1) TRAIN LOSS:0.0191 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.7894 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83: 100%|██████████| 20/20 [00:02<00:00,  9.69it/s]


(Epoch 1) VALID LOSS:0.7894 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83
current best

Classification Report:
              precision    recall  f1-score   support

    positive       0.85      0.90      0.87        73
     neutral       0.76      0.74      0.75        42
    negative       0.89      0.81      0.85        42

    accuracy                           0.83       157
   macro avg       0.83      0.82      0.82       157
weighted avg       0.84      0.83      0.83       157



(Epoch 2) TRAIN LOSS:0.0191 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.36it/s]


(Epoch 2) TRAIN LOSS:0.0191 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8080 ACC:0.83 F1:0.82 REC:0.82 PRE:0.82: 100%|██████████| 20/20 [00:01<00:00, 10.59it/s]


(Epoch 2) VALID LOSS:0.8080 ACC:0.83 F1:0.82 REC:0.82 PRE:0.82


(Epoch 3) TRAIN LOSS:0.0175 LR:0.00000200: 100%|██████████| 79/79 [00:24<00:00,  3.29it/s]


(Epoch 3) TRAIN LOSS:0.0175 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8380 ACC:0.82 F1:0.81 REC:0.81 PRE:0.82: 100%|██████████| 20/20 [00:01<00:00, 14.50it/s]


(Epoch 3) VALID LOSS:0.8380 ACC:0.82 F1:0.81 REC:0.81 PRE:0.82


(Epoch 4) TRAIN LOSS:0.0182 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.32it/s]


(Epoch 4) TRAIN LOSS:0.0182 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8313 ACC:0.83 F1:0.82 REC:0.82 PRE:0.82: 100%|██████████| 20/20 [00:02<00:00,  8.52it/s]


(Epoch 4) VALID LOSS:0.8313 ACC:0.83 F1:0.82 REC:0.82 PRE:0.82


(Epoch 5) TRAIN LOSS:0.0172 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.38it/s]


(Epoch 5) TRAIN LOSS:0.0172 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8829 ACC:0.82 F1:0.82 REC:0.83 PRE:0.82: 100%|██████████| 20/20 [00:02<00:00,  9.69it/s]


(Epoch 5) VALID LOSS:0.8829 ACC:0.82 F1:0.82 REC:0.83 PRE:0.82


(Epoch 6) TRAIN LOSS:0.0171 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.31it/s]


(Epoch 6) TRAIN LOSS:0.0171 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8455 ACC:0.83 F1:0.82 REC:0.81 PRE:0.83: 100%|██████████| 20/20 [00:01<00:00, 10.70it/s]


(Epoch 6) VALID LOSS:0.8455 ACC:0.83 F1:0.82 REC:0.81 PRE:0.83


(Epoch 7) TRAIN LOSS:0.0180 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.34it/s]


(Epoch 7) TRAIN LOSS:0.0180 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8865 ACC:0.82 F1:0.82 REC:0.83 PRE:0.82: 100%|██████████| 20/20 [00:01<00:00, 10.54it/s]


(Epoch 7) VALID LOSS:0.8865 ACC:0.82 F1:0.82 REC:0.83 PRE:0.82


(Epoch 8) TRAIN LOSS:0.0165 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.37it/s]


(Epoch 8) TRAIN LOSS:0.0165 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.9744 ACC:0.80 F1:0.80 REC:0.81 PRE:0.80: 100%|██████████| 20/20 [00:01<00:00, 14.44it/s]


(Epoch 8) VALID LOSS:0.9744 ACC:0.80 F1:0.80 REC:0.81 PRE:0.80


(Epoch 9) TRAIN LOSS:0.0175 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.41it/s]


(Epoch 9) TRAIN LOSS:0.0175 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8405 ACC:0.82 F1:0.81 REC:0.81 PRE:0.82: 100%|██████████| 20/20 [00:02<00:00,  9.21it/s]


(Epoch 9) VALID LOSS:0.8405 ACC:0.82 F1:0.81 REC:0.81 PRE:0.82


(Epoch 10) TRAIN LOSS:0.0176 LR:0.00000200: 100%|██████████| 79/79 [00:23<00:00,  3.33it/s]


(Epoch 10) TRAIN LOSS:0.0176 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000200


VALID LOSS:0.8639 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83: 100%|██████████| 20/20 [00:01<00:00, 12.79it/s]


(Epoch 10) VALID LOSS:0.8639 ACC:0.83 F1:0.82 REC:0.82 PRE:0.83


In [38]:
# Write `df_results` to CSV without the index column.
# NOTE(review): `df_results` is not defined in any cell visible in this part of the
# notebook — presumably it came from a deleted or later cell; this fails on a fresh
# kernel unless it is defined first. Judging by the file name it holds per-example
# validation results; confirm.
df_results.to_csv('dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/data_train/valid_data_statement_baru_results.csv', index=False)

## Test Model for Sentence

In [39]:
text = 'polri bertugas secara profesional dan tak pandang bulu.'

# Encode the sentence and reshape it into a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set inference mode explicitly instead of relying on the eval mode and globally disabled
# autograd that the training cell happens to leave behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class for the single input row.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label together with its softmax probability as a percentage.
print(
    f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: polri bertugas secara profesional dan tak pandang bulu. | Label : positive (99.949%)


In [40]:
text = 'semuanya masih proses awal belum masuk dalam rapat dpp dan belum dilaporkan kepada ibu ketua umum'

# Encode the sentence and reshape it into a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set inference mode explicitly instead of relying on the eval mode and globally disabled
# autograd that the training cell happens to leave behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class for the single input row.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label together with its softmax probability as a percentage.
print(
    f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: semuanya masih proses awal belum masuk dalam rapat dpp dan belum dilaporkan kepada ibu ketua umum | Label : neutral (99.986%)


In [41]:
text = 'enggak ada ancaman, cuman dibilangnya percuma punya teman punya saudara jadi pj gubernur, tapi gak ada gunanya'

# Encode the sentence and reshape it into a (1, seq_len) batch on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set inference mode explicitly instead of relying on the eval mode and globally disabled
# autograd that the training cell happens to leave behind.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class for the single input row.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label together with its softmax probability as a percentage.
print(
    f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')



Text: enggak ada ancaman, cuman dibilangnya percuma punya teman punya saudara jadi pj gubernur, tapi gak ada gunanya | Label : negative (99.972%)


In [21]:
#save_model = './dataset/emot_emotion-twitter/dataset_smsa_sentiments_kazee/save_model/sent_statement_llp2_240724_acc86'
# Save the in-memory model, tokenizer and config to `model_save`.
# NOTE(review): this is the same directory the training loops write their best-F1
# checkpoint to. When run after training, it replaces that checkpoint with the model's
# current weights — confirm this is intended. The commented-out path above suggests a
# separate directory was considered.
model.save_pretrained(model_save)
tokenizer.save_pretrained(model_save)
config.save_pretrained(model_save)

## TEST FOR DATA TEST_SENTIMENT