In [None]:
# Mount Google Drive so files stored under MyDrive are readable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install the tokenizer/model dependencies into the runtime; -q keeps pip's output quiet.
!pip install -q transformers sentencepiece

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# One seed shared by NumPy, TensorFlow and the train/val/test splits below.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)


In [None]:
# Load the labelled CSV; only the raw text and its 'intent' label are used downstream.
DATA_PATH = "/content/drive/MyDrive/DataSet_PBL5/10000_Positive_Negative_Danh_Gia_Homestay.csv"
raw_df = pd.read_csv(DATA_PATH)

# Clean before splitting:
#   * rows with a missing text/label would break the tokenizer or the encoder;
#   * exact duplicate texts are dropped so the same sentence cannot end up in
#     both the training and the evaluation splits (leakage inflates test scores);
#   * .copy() makes `df` an independent frame, so adding the 'label' column
#     below is not an assignment on a slice of `raw_df`.
df = (
    raw_df[['text', 'intent']]
    .dropna(subset=['text', 'intent'])
    .drop_duplicates(subset='text')
    .copy()
)
df['text'] = df['text'].astype(str)

# Map the string labels to integer class ids (kept in `label_encoder.classes_`).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['intent'])

# 90/10 train+val / test, then 90/10 train / val. Stratifying keeps the class
# ratio identical across the three splits.
train_val, test = train_test_split(
    df, test_size=0.1, random_state=SEED, stratify=df['label']
)
train, val = train_test_split(
    train_val, test_size=0.1, random_state=SEED, stratify=train_val['label']
)

assert len(train) + len(val) + len(test) == len(df), "splits must partition the data"


In [None]:
# Checkpoint shared by the tokenizer here and the classifier loaded later.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def convert_data(df):
    """Wrap every row of ``df`` in a single-sentence ``InputExample``.

    Args:
        df: DataFrame with a ``'text'`` column (the input sentence) and a
            ``'label'`` column (the class id).

    Returns:
        The result of ``df.apply(..., axis=1)``: one ``InputExample`` per row,
        with ``guid`` and ``text_b`` left as ``None``.
    """
    def _row_to_example(row):
        # Single-sentence task: only text_a is populated.
        return InputExample(
            guid=None,
            text_a=row['text'],
            text_b=None,
            label=row['label'],
        )

    return df.apply(_row_to_example, axis=1)

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects exposing ``text_a`` (the sentence to
            encode) and ``label`` (the class id).
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``({"input_ids", "attention_mask"}, label)``: two int32 vectors of
        length ``max_length`` and a scalar int64 label.
    """
    # Tokenize once, eagerly, so the generator below only replays cached features.
    features = []
    for e in examples:
        inputs = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length'
        )
        input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
        features.append(InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      label=e.label))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask
                },
                f.label,
            )

    # `output_signature` replaces the deprecated `output_types`/`output_shapes`
    # arguments. Because of padding='max_length' + truncation=True every
    # sequence has exactly `max_length` tokens, so the shape can be declared
    # statically instead of as [None].
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
                'attention_mask': tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )

BATCH_SIZE = 32

train_data = convert_data(train)
val_data = convert_data(val)
test_data = convert_data(test)

# Shuffle with a buffer covering the whole training set: a small buffer only
# permutes neighbouring examples, so every epoch would see almost the same
# batches. The op-level seed keeps the order reproducible. max(..., 1) guards
# against shuffle() rejecting a zero-sized buffer.
train_dataset = (
    convert_examples_to_tf_dataset(train_data, tokenizer)
    .shuffle(max(len(train_data), 1), seed=SEED)
    .batch(BATCH_SIZE)
)
# Evaluation sets are never shuffled so predictions stay aligned with the
# row order of `val` / `test`.
val_dataset = convert_examples_to_tf_dataset(val_data, tokenizer).batch(BATCH_SIZE)
test_dataset = convert_examples_to_tf_dataset(test_data, tokenizer).batch(BATCH_SIZE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Fine-tuning hyperparameters.
NUM_LABELS = len(label_encoder.classes_)
LEARNING_RATE = 3e-5
EPOCHS = 3

# Pretrained encoder plus a classification head sized to the label set.
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the loss applies the softmax itself, on integer class-id targets.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7ceeda43cc50>

In [None]:
# Score the held-out split: take the highest-logit class for each example.
preds = model.predict(test_dataset).logits
pred_labels = preds.argmax(axis=1)

# `test_dataset` is not shuffled, so predictions line up with the rows of `test`.
true_labels = test['label'].tolist()
report = classification_report(
    true_labels,
    pred_labels,
    target_names=label_encoder.classes_,
)
print(report)


              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       508
    positive       1.00      1.00      1.00       492

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [None]:

import os
import shutil

from google.colab import files

# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
model_path = "/content/bert_intent_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Build the archive with absolute paths so the cell does not depend on the
# current working directory. make_archive recreates the zip on every run,
# whereas `zip -r` onto an existing archive would keep stale entries.
# The archive keeps the top-level "bert_intent_model/" folder.
archive_path = shutil.make_archive(
    base_name=model_path,
    format="zip",
    root_dir=os.path.dirname(model_path),
    base_dir=os.path.basename(model_path),
)

files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>