**Library**

In [18]:
import sys
import os
import yaml
import json
import pandas as pd
from sklearn_crfsuite import metrics

In [19]:
# Make the parent directory importable so the `src` package resolves.
# The path is resolved against the current working directory, so this assumes
# the kernel runs from the notebook's own folder — TODO confirm.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PROJECT_ROOT)
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens
from src.models.machine_learning import CRFModel, FlatModelWrapper
from src.data_loader.dataset import convert_label_studio_to_ner_data

**Load Config**

In [20]:
# Read the experiment configuration. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform's default encoding.
with open('../configs/ml_config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; fall back to an empty dict
    # so that config.get(...) lookups keep working.
    config = yaml.safe_load(f) or {}

**Load Data**

In [21]:
print("Loading data files...")
TRAIN_PATH = '../data/04_model_input/train_dataset.json'
TEST_PATH = '../data/04_model_input/test_dataset.json'
DEV_PATH = '../data/04_model_input/dev_dataset.json'


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# One raw document collection per split, read in train / test / dev order.
train_json = _read_json(TRAIN_PATH)
test_json = _read_json(TEST_PATH)
dev_json = _read_json(DEV_PATH)

print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [22]:
print("\nConverting to BIO format...")
# Run the Label Studio conversion over each split, in train / test / dev order.
train_sents, test_sents, dev_sents = [
    convert_label_studio_to_ner_data(raw_docs)
    for raw_docs in (train_json, test_json, dev_json)
]


Converting to BIO format...


Converting with Pyvi: 100%|██████████| 797/797 [00:01<00:00, 749.46it/s]
Converting with Pyvi: 100%|██████████| 114/114 [00:00<00:00, 762.88it/s]
Converting with Pyvi: 100%|██████████| 228/228 [00:00<00:00, 712.78it/s]


In [23]:
# Preview the first 20 (token, tag) pairs of the first converted training sentence.
train_sents[0][:20]

[('Bán', 'O'),
 ('lô', 'B-TYPE'),
 ('đất', 'I-TYPE'),
 ('100m²', 'B-AREA'),
 ('ngay', 'O'),
 ('cạnh', 'O'),
 ('nhà', 'O'),
 ('thi_đấu', 'B-ATTR'),
 ('Thủ_Thừa', 'I-ATTR'),
 (',', 'O'),
 ('view', 'B-ATTR'),
 ('hồ', 'I-ATTR'),
 ('sinh_thái', 'I-ATTR'),
 ('mát_mẻ', 'I-ATTR'),
 (',', 'O'),
 ('giá', 'O'),
 ('mềm', 'O'),
 (',', 'O'),
 ('xây_dựng', 'O'),
 ('ở', 'O')]

**Extracting features**

In [24]:
print("\nExtracting features")
# Feature sequences (X) and label sequences (y) are built one sentence at a time.
# NOTE(review): no features are extracted for dev_sents in this cell — confirm
# whether the dev split is meant to be used.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

print(f"-> X_train shape: {len(X_train)} sentences")
print(f"-> X_test shape: {len(X_test)} sentences")


Extracting features
-> X_train shape: 797 sentences
-> X_test shape: 114 sentences


In [25]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a banner, the flat accuracy, and a per-label report for one model.

    Args:
        y_true: Gold tag sequences; an iterable of sentences, each an iterable
            of tag strings.
        y_pred: Predicted tag sequences, handed to the same metric calls as
            `y_true`.
        model_name: Name shown in the banner line.

    Returns:
        None. All results are printed.
    """
    print(f"\n{'='*20} EVALUATION: {model_name} {'='*20}")

    # Collect the tags that occur in the gold data, excluding 'O' so that the
    # reported scores reflect entity tags only.
    # NOTE(review): only the exact tag 'O' is dropped; tags such as 'B-O' / 'I-O'
    # (present in the recorded reports) are kept — confirm that is intended.
    labels = sorted({tag for sent in y_true for tag in sent} - {'O'})

    print(f"Accuracy: {metrics.flat_accuracy_score(y_true, y_pred):.4f}")

    # With no entity tags in the gold data there is nothing for the report to
    # tabulate, so say so explicitly instead of passing an empty label list.
    if not labels:
        print("No entity labels found in y_true; skipping classification report.")
        return

    print(metrics.flat_classification_report(
        y_true, y_pred, labels=labels, digits=4
    ))

**CRF (Conditional Random Fields)**

In [26]:
print("Training CRF model")
# Settings come from the 'crf' section of the config (empty dict when absent).
crf_params = config.get('crf', {})
crf = CRFModel(crf_params)
crf.train(X_train, y_train)

Training CRF model


In [27]:
# Tag the test sentences with the CRF, then print its scores.
y_pred_crf = crf.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_crf, model_name="CRF")


Accuracy: 0.7199
              precision    recall  f1-score   support

      B-AREA     0.6285    0.6136    0.6209       295
      B-ATTR     0.5905    0.4856    0.5329       900
       B-LOC     0.5942    0.6253    0.6094       363
         B-O     0.1613    0.1250    0.1408        40
       B-ORG     0.6667    0.1579    0.2553        38
       B-PER     0.3750    0.2432    0.2951        37
     B-PRICE     0.7670    0.8438    0.8036       160
      B-TYPE     0.6389    0.5779    0.6069       199
      I-AREA     0.7290    0.6431    0.6834       297
      I-ATTR     0.5584    0.5054    0.5306      1296
       I-LOC     0.5817    0.6158    0.5983       393
         I-O     0.1471    0.0806    0.1042        62
       I-ORG     0.7368    0.6087    0.6667        23
       I-PER     0.3750    0.2222    0.2791        27
     I-PRICE     0.7143    0.8789    0.7881       256
      I-TYPE     0.3801    0.4333    0.4050       150

   micro avg     0.5944    0.5551    0.5741      4536
   macro

In [28]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# crf.save(os.path.join(OUTPUT_MODEL_DIR, 'crf_model.pkl'))

**SVM (Support Vector Machine)**

In [29]:
print("Training SVM Model")
# Settings come from the 'svm' section of the config (empty dict when absent).
svm_params = config.get('svm', {})
svm = FlatModelWrapper('svm', svm_params)
svm.train(X_train, y_train)

Training SVM Model


In [30]:
# Predict & Evaluate
y_pred_svm = svm.predict(X_test)
evaluate_model(y_test, y_pred_svm, "SVM")


Accuracy: 0.7404
              precision    recall  f1-score   support

      B-AREA     0.5407    0.6983    0.6095       295
      B-ATTR     0.6180    0.5178    0.5635       900
       B-LOC     0.6597    0.6088    0.6332       363
         B-O     0.2105    0.1000    0.1356        40
       B-ORG     0.5714    0.2105    0.3077        38
       B-PER     0.5000    0.3243    0.3934        37
     B-PRICE     0.7594    0.8875    0.8184       160
      B-TYPE     0.6250    0.6533    0.6388       199
      I-AREA     0.7269    0.6094    0.6630       297
      I-ATTR     0.6325    0.5046    0.5614      1296
       I-LOC     0.5753    0.6997    0.6315       393
         I-O     0.1200    0.0484    0.0690        62
       I-ORG     0.4167    0.2174    0.2857        23
       I-PER     0.5882    0.3704    0.4545        27
     I-PRICE     0.7576    0.8789    0.8137       256
      I-TYPE     0.4934    0.5000    0.4967       150

   micro avg     0.6252    0.5769    0.6001      4536
   macro

In [31]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# svm.save(os.path.join(OUTPUT_MODEL_DIR, 'svm_model.pkl'))

**MaxEnt (Maximum Entropy)**

In [32]:
print("Training MaxEnt Model")
# Settings come from the 'maxent' section of the config (empty dict when absent).
maxent_params = config.get('maxent', {})
maxent = FlatModelWrapper('maxent', maxent_params)
maxent.train(X_train, y_train)

Training MaxEnt Model




In [33]:
# Tag the test sentences with the MaxEnt model, then print its scores.
y_pred_maxent = maxent.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_maxent, model_name="MaxEnt")


Accuracy: 0.7294
              precision    recall  f1-score   support

      B-AREA     0.6355    0.6441    0.6397       295
      B-ATTR     0.6241    0.4833    0.5448       900
       B-LOC     0.5877    0.6556    0.6198       363
         B-O     0.1176    0.1000    0.1081        40
       B-ORG     0.4000    0.0526    0.0930        38
       B-PER     0.4348    0.2703    0.3333        37
     B-PRICE     0.7803    0.8438    0.8108       160
      B-TYPE     0.6517    0.5829    0.6154       199
      I-AREA     0.7143    0.6061    0.6557       297
      I-ATTR     0.5995    0.5116    0.5520      1296
       I-LOC     0.6077    0.6031    0.6054       393
         I-O     0.2500    0.0806    0.1220        62
       I-ORG     0.6000    0.2609    0.3636        23
       I-PER     0.4375    0.2593    0.3256        27
     I-PRICE     0.7125    0.8711    0.7838       256
      I-TYPE     0.4410    0.4733    0.4566       150

   micro avg     0.6178    0.5560    0.5853      4536
   macro

In [34]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# maxent.save(os.path.join(OUTPUT_MODEL_DIR, 'maxent_model.pkl'))

# TODO: Pyvi tokenization (dataset.py)
# TODO: tune the config parameters
