**Library**

In [191]:
import sys
import os
import yaml
import json
import pandas as pd
from sklearn_crfsuite import metrics

In [192]:
# Make the parent of the current working directory importable so the
# project-local `src` package can be resolved by the imports below.
_parent_dir = os.path.abspath(os.path.join('..'))
# Guard the append: re-running this cell should not keep adding the same
# entry to sys.path.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens
from src.models.machine_learning import CRFModel, FlatModelWrapper
from src.data_loader.dataset import convert_label_studio_to_ner_data

**Load Config**

In [193]:
# Read the experiment configuration. The file is opened explicitly as UTF-8,
# the same way the dataset JSON files are opened in this notebook, so the
# result does not depend on the platform's default text encoding.
with open('../configs/ml_config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; fall back to an empty
    # dict so the later `config.get(...)` lookups keep working.
    config = yaml.safe_load(f) or {}

**Load Data**

In [194]:
def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


print("Loading data files...")
TRAIN_PATH = '../data/04_model_input/train_dataset.json'
TEST_PATH = '../data/04_model_input/test_dataset.json'
DEV_PATH = '../data/04_model_input/dev_dataset.json'

# Files are read in the order train, test, dev.
train_json = _read_json(TRAIN_PATH)
test_json = _read_json(TEST_PATH)
dev_json = _read_json(DEV_PATH)

# Report how many top-level entries each file contains.
print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [195]:
print("\nConverting to BIO format...")
# Apply convert_label_studio_to_ner_data to each split, in the order
# train, test, dev, and bind the three results in one unpacking step.
train_sents, test_sents, dev_sents = [
    convert_label_studio_to_ner_data(raw_docs)
    for raw_docs in (train_json, test_json, dev_json)
]


Converting to BIO format...


Converting with Pyvi: 100%|██████████| 797/797 [00:01<00:00, 694.53it/s]
Converting with Pyvi: 100%|██████████| 114/114 [00:00<00:00, 790.89it/s]
Converting with Pyvi: 100%|██████████| 228/228 [00:00<00:00, 745.98it/s]


In [196]:
# Preview the first 20 (token, tag) pairs of the first training sentence.
train_sents[0][:20]

[('Bán', 'O'),
 ('lô', 'B-TYPE'),
 ('đất', 'I-TYPE'),
 ('100m²', 'B-AREA'),
 ('ngay', 'O'),
 ('cạnh', 'O'),
 ('nhà', 'O'),
 ('thi_đấu', 'B-ATTR'),
 ('Thủ_Thừa', 'I-ATTR'),
 (',', 'O'),
 ('view', 'B-ATTR'),
 ('hồ', 'I-ATTR'),
 ('sinh_thái', 'I-ATTR'),
 ('mát_mẻ', 'I-ATTR'),
 (',', 'O'),
 ('giá', 'O'),
 ('mềm', 'O'),
 (',', 'O'),
 ('xây_dựng', 'O'),
 ('ở', 'O')]

**Extracting features**

In [197]:
print("\nExtracting features")


def _featurize(sents):
    """Return a (features, labels) pair of lists, one entry per sentence."""
    feats = [sent2features(sent) for sent in sents]
    tags = [sent2labels(sent) for sent in sents]
    return feats, tags


# Only the train and test splits are featurized in this cell.
X_train, y_train = _featurize(train_sents)
X_test, y_test = _featurize(test_sents)

print(f"-> X_train shape: {len(X_train)} sentences")
print(f"-> X_test shape: {len(X_test)} sentences")


Extracting features
-> X_train shape: 797 sentences
-> X_test shape: 114 sentences


In [198]:
def evaluate_model(y_true, y_pred, model_name):
    """Print the flat accuracy score and a per-label report for one model.

    Args:
        y_true: Gold tag sequences; an iterable of sentences, each an
            iterable of tag values.
        y_pred: Predicted tag sequences, handed to the ``metrics`` helpers
            together with ``y_true``.
        model_name: Display name used in the banner line.

    Returns:
        None. Everything is written to stdout.
    """
    banner = '=' * 20
    print(f"\n{banner} EVALUATION: {model_name} {banner}")

    # Collect the tags that occur in the gold data, leaving out the literal
    # 'O' tag so the reported scores cover the remaining tags only. Any other
    # tag present in y_true is kept.
    entity_labels = sorted({tag for sentence in y_true for tag in sentence} - {'O'})

    accuracy = metrics.flat_accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    report = metrics.flat_classification_report(
        y_true, y_pred, labels=entity_labels, digits=4
    )
    print(report)

**CRF (Conditional Random Fields)**

In [199]:
print("Training CRF model")
# Settings handed to CRFModel come from the 'crf' section of the config;
# an empty dict is used when that section is absent.
crf_params = config.get('crf', {})
crf = CRFModel(crf_params)
crf.train(X_train, y_train)

Training CRF model


In [200]:
# Predict on the test features, then print the evaluation for the CRF model.
# NOTE(review): the dev split is loaded earlier but is not used in the visible
# cells; scores here are computed on the test split.
y_pred_crf = crf.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_crf, model_name="CRF")


Accuracy: 0.7218
              precision    recall  f1-score   support

      B-AREA     0.6285    0.6136    0.6209       295
      B-ATTR     0.5923    0.4811    0.5310       900
       B-LOC     0.6000    0.6446    0.6215       363
         B-O     0.1515    0.1250    0.1370        40
       B-ORG     0.7500    0.1579    0.2609        38
       B-PER     0.4348    0.2703    0.3333        37
     B-PRICE     0.7803    0.8438    0.8108       160
      B-TYPE     0.6491    0.5578    0.6000       199
      I-AREA     0.7094    0.6330    0.6690       297
      I-ATTR     0.5654    0.5139    0.5384      1296
       I-LOC     0.5877    0.6310    0.6086       393
         I-O     0.1515    0.0806    0.1053        62
       I-ORG     0.7619    0.6957    0.7273        23
       I-PER     0.3750    0.2222    0.2791        27
     I-PRICE     0.7197    0.8828    0.7930       256
      I-TYPE     0.3812    0.4067    0.3935       150

   micro avg     0.5989    0.5580    0.5777      4536
   macro

In [201]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# crf.save(os.path.join(OUTPUT_MODEL_DIR, 'crf_model.pkl'))

**SVM (Support Vector Machine)**

In [202]:
print("Training SVM Model")
# Settings handed to FlatModelWrapper come from the 'svm' section of the
# config; an empty dict is used when that section is absent.
svm_params = config.get('svm', {})
svm = FlatModelWrapper('svm', svm_params)
svm.train(X_train, y_train)

Training SVM Model


In [203]:
# Predict on the test features, then print the evaluation for the SVM model.
y_pred_svm = svm.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_svm, model_name="SVM")


Accuracy: 0.7450
              precision    recall  f1-score   support

      B-AREA     0.6349    0.6542    0.6444       295
      B-ATTR     0.5901    0.5311    0.5591       900
       B-LOC     0.5922    0.7080    0.6449       363
         B-O     0.2000    0.1000    0.1333        40
       B-ORG     0.6667    0.1579    0.2553        38
       B-PER     0.3750    0.2432    0.2951        37
     B-PRICE     0.8155    0.8562    0.8354       160
      B-TYPE     0.6220    0.6533    0.6373       199
      I-AREA     0.7092    0.5993    0.6496       297
      I-ATTR     0.6576    0.4861    0.5590      1296
       I-LOC     0.6217    0.6692    0.6446       393
         I-O     0.1200    0.0484    0.0690        62
       I-ORG     0.6000    0.2609    0.3636        23
       I-PER     0.4706    0.2963    0.3636        27
     I-PRICE     0.7389    0.9062    0.8140       256
      I-TYPE     0.5033    0.5067    0.5050       150

   micro avg     0.6324    0.5754    0.6026      4536
   macro

In [204]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# svm.save(os.path.join(OUTPUT_MODEL_DIR, 'svm_model.pkl'))

**MaxEnt (Maximum Entropy)**

In [205]:
print("Training MaxEnt Model")
# Settings handed to FlatModelWrapper come from the 'maxent' section of the
# config; an empty dict is used when that section is absent.
maxent_params = config.get('maxent', {})
maxent = FlatModelWrapper('maxent', maxent_params)
maxent.train(X_train, y_train)

Training MaxEnt Model




In [206]:
# Predict on the test features, then print the evaluation for the MaxEnt model.
y_pred_maxent = maxent.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_maxent, model_name="MaxEnt")


Accuracy: 0.7309
              precision    recall  f1-score   support

      B-AREA     0.6141    0.6475    0.6304       295
      B-ATTR     0.6230    0.4756    0.5394       900
       B-LOC     0.5966    0.6722    0.6321       363
         B-O     0.1212    0.1000    0.1096        40
       B-ORG     0.6000    0.0789    0.1395        38
       B-PER     0.5000    0.3514    0.4127        37
     B-PRICE     0.7714    0.8438    0.8060       160
      B-TYPE     0.6484    0.5930    0.6194       199
      I-AREA     0.7092    0.5993    0.6496       297
      I-ATTR     0.6058    0.5123    0.5552      1296
       I-LOC     0.6101    0.6132    0.6117       393
         I-O     0.2632    0.0806    0.1235        62
       I-ORG     0.6667    0.2609    0.3750        23
       I-PER     0.4667    0.2593    0.3333        27
     I-PRICE     0.7157    0.8750    0.7873       256
      I-TYPE     0.4383    0.4733    0.4551       150

   micro avg     0.6194    0.5582    0.5872      4536
   macro

In [207]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # Save the model
# maxent.save(os.path.join(OUTPUT_MODEL_DIR, 'maxent_model.pkl'))