<a href="https://colab.research.google.com/github/TienNam97/phoBert_Sentiment_Analysis/blob/main/Sentiment_Analysis_VSFC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TRAIN

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/BERT/SA')

Mounted at /content/drive/


In [None]:
%%capture
!pip install --upgrade pip setuptools wheel
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install underthesea

In [None]:
import numpy as np
import pandas as pd
import torch
import random

def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed_everything(86)

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
    default="PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")

In [None]:
# def preprocess(df):
#   for i in df.index:
#     if type(df['content'].at[i]) != str:
#       df = df.drop(i) #delete row
#   df = df.reset_index(drop=True)  #update index

#   return df

In [None]:
# import numpy as np
# import pandas as pd
# df = pd.read_csv('data.csv')
# df = preprocess(df)
# df = df.replace({'NEG': 0, 'POS': 2, 'NEU' : 1})
# df = df[['content','label']]
# df_train = df[:-2000]
# df_test = df[-2000:]
# df_test = df_test.reset_index()
# df_test = df_test.drop(['index'], axis=1)

In [None]:
import numpy as np
import pandas as pd
df_train = pd.read_csv('VSFC/train/sents.txt', sep=".", header=None, names = ['comment', 'label'])
df_train['label'] = pd.read_csv('VSFC/train/sentiments.txt', header=None)
df_train.loc[:,'comment'] = df_train['comment'].str.lower()

df_val = pd.read_csv('VSFC/dev/sents.txt', sep=".", header=None, names = ['comment', 'label'])
df_val['label'] = pd.read_csv('VSFC/dev/sentiments.txt', header=None)
df_val.loc[:,'comment'] = df_val['comment'].str.lower()

df_test = pd.read_csv('VSFC/test/sents.txt', sep=".", header=None, names = ['comment', 'label'])
df_test['label'] = pd.read_csv('VSFC/test/sentiments.txt', header=None)
df_test.loc[:,'comment'] = df_test['comment'].str.lower()

In [None]:
from underthesea import word_tokenize

train_text = []
train_labels = []
for i in df_train.index:
  train_text.append(word_tokenize(df_train.iloc[i]['comment'], format = 'text'))
  train_labels.append(df_train.iloc[i]['label'])

val_text = []
val_labels = []
for i in df_train.index:
  val_text.append(word_tokenize(df_train.iloc[i]['comment'], format = 'text'))
  val_labels.append(df_train.iloc[i]['label'])

test_text = []
test_labels = []
for i in df_test.index:
  test_text.append(word_tokenize(df_test.iloc[i]['comment'], format = 'text'))
  test_labels.append(df_test.iloc[i]['label'])


In [None]:
from sklearn.model_selection import train_test_split

# train_sents, val_sents, train_labels, val_labels = train_test_split(train_text, train_labels, test_size=0.2)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 125

train_ids = []
for sent in train_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    train_ids.append(encoded_sent)

val_ids = []
for sent in val_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    val_ids.append(encoded_sent)

train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [None]:
train_masks = []
for sent in train_ids:
    mask = [int(token_id > 0) for token_id in sent]
    train_masks.append(mask)

val_masks = []
for sent in val_ids:
    mask = [int(token_id > 0) for token_id in sent]

    val_masks.append(mask)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

# config = RobertaConfig.from_pretrained(
#     "PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 3, output_hidden_states=False,
# )
# BERT_SA = RobertaForSequenceClassification.from_pretrained(
#     "PhoBERT_base_transformers/model.bin",
#     config=config
# )

# config = RobertaConfig.from_pretrained(
#     "/content/drive/MyDrive/BERT/SA/bert_pretrain/config.json")
# BERT_SA = RobertaForSequenceClassification.from_pretrained(
#     "/content/drive/MyDrive/BERT/SA/bert_pretrain/",
#     config=config
# )

config = RobertaConfig.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold/config.json")
BERT_SA = RobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold/",
    config=config
)

In [None]:
BERT_SA.cuda()
print('Done')

Done


In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    F1_score = f1_score(pred_flat, labels_flat, average='weighted')

    return accuracy_score(pred_flat, labels_flat), F1_score

In [None]:
import random
from tqdm.notebook import tqdm
device = 'cuda'
epochs = 10

param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

best_eval_f1_score = 0
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    BERT_SA.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0

    for step, batch in tqdm(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        BERT_SA.zero_grad()
        outputs = BERT_SA(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1

        loss.backward()
        torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    BERT_SA.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    for batch in tqdm(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = BERT_SA(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1

    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

    if epoch_i == 0:
      BERT_SA.save_pretrained('bert_pretrain2/')
      print('Saved Pretrain!')
      best_eval_f1_score = eval_f1/nb_eval_steps
    else:
      if best_eval_f1_score < eval_f1/nb_eval_steps:
        BERT_SA.save_pretrained('bert_pretrain_fold_VSFC/')
        print('Update Saved Pretrain!')
        best_eval_f1_score = eval_f1/nb_eval_steps

print("Training complete!")

Training...




0it [00:00, ?it/s]

 Accuracy: 0.8925
 F1 score: 0.9051
 Average training loss: 0.3149
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9397
 F1 score: 0.9458
Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9326
 F1 score: 0.9373
 Average training loss: 0.2116
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9564
 F1 score: 0.9604
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9467
 F1 score: 0.9488
 Average training loss: 0.1719
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9680
 F1 score: 0.9695
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9587
 F1 score: 0.9611
 Average training loss: 0.1461
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9680
 F1 score: 0.9707
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9658
 F1 score: 0.9670
 Average training loss: 0.1240
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9771
 F1 score: 0.9792
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9715
 F1 score: 0.9728
 Average training loss: 0.1056
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9843
 F1 score: 0.9851
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9768
 F1 score: 0.9777
 Average training loss: 0.0925
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9850
 F1 score: 0.9863
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9783
 F1 score: 0.9791
 Average training loss: 0.0819
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9859
 F1 score: 0.9874
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9818
 F1 score: 0.9829
 Average training loss: 0.0734
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9906
 F1 score: 0.9912
Update Saved Pretrain!
Training...


0it [00:00, ?it/s]

 Accuracy: 0.9833
 F1 score: 0.9841
 Average training loss: 0.0623
Running Validation...


  0%|          | 0/358 [00:00<?, ?it/s]

 Accuracy: 0.9900
 F1 score: 0.9909
Training complete!


# LOAD

In [None]:
config2 = RobertaConfig.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold_VSFC/config.json")
BERT_SA2 = RobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/BERT/SA/bert_pretrain_fold_VSFC/",
    config=config2
)

In [None]:
BERT_SA2.cuda()
print('Done')

Done


In [None]:
def test(test_dataloader):
    test_loss, test_accuracy = 0, 0
    nb_test_steps, nb_test_examples = 0, 0
    test_f1 = 0
    for batch in tqdm(test_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = BERT_SA2(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_test_accuracy, tmp_test_f1 = flat_accuracy(logits, label_ids)

            test_accuracy += tmp_test_accuracy
            test_f1 += tmp_test_f1
            nb_test_steps += 1

    print(" Accuracy: {0:.4f}".format(test_accuracy/nb_test_steps))
    print(" F1 score: {0:.4f}".format(test_f1/nb_test_steps))

In [None]:
def get_mask(data):
	masks = []
	for sen in data:
		mask = [int(token>0) for token in sen]
		masks.append(mask)
	return masks

In [None]:
test_ids = []
for sent in test_text:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    test_ids.append(encoded_sent)

test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

test_inputs = torch.tensor(test_ids)

test_labels = torch.tensor(test_labels)

test_masks = get_mask(test_ids)

test_masks = torch.tensor(test_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [None]:
test(test_dataloader)

  0%|          | 0/99 [00:00<?, ?it/s]

 Accuracy: 0.9049
 F1 score: 0.9108


In [None]:
def predict(text):
  #test(test_dataloader)
  # while True:
  #   text = input("Nhập:")
  text = bpe.encode(' '.join(word_tokenize((text)[0], format = 'text')))
  encode_ = vocab.encode_line('<s> ' + text + ' </s>',append_eos=True, add_if_not_exist=False).long().tolist()
  encode_text = pad_sequences([encode_], maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

  test_masks = get_mask(encode_text)
  test_masks = torch.tensor(test_masks,dtype = torch.int64)
  test_inputs = torch.tensor(encode_text)

  test_inputs = test_inputs.to('cuda')
  test_masks = test_masks.to('cuda')
  with torch.no_grad():
    outputs = BERT_SA2(test_inputs, token_type_ids=None, attention_mask=test_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    predict = np.argmax(logits)

    if predict == 0:
      cls = "Negative"
    elif predict == 2:
      cls = "Positive"
    elif predict == 1:
      cls = "Neutral"
    return cls


In [None]:
sentence = 'tạm chấp nhận'

In [None]:
predict(sentence)

'Neutral'