**Library**

In [135]:
import sys
import os
import yaml
import json
import pandas as pd
import random
import numpy as np
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report, accuracy_score

In [136]:
sys.path.append(os.path.abspath(os.path.join('..')))
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens, get_relation_features
from src.models.machine_learning import CRFModel, FlatModelWrapper, RelationExtractionModel
from src.data_loader.dataset import convert_label_studio_to_ner_data, prepare_re_data_from_json
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer

**Load Config**

In [137]:
with open('configs/ml_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
for model_name in ['svm', 'maxent', 'random_forest']:
    if model_name in config:
        config[model_name]['vectorizer'] = 'phobert'

print("‚úÖ Config updated for PhoBERT vectors!")

‚úÖ Config updated for PhoBERT vectors!


**Load Vector**

In [138]:
VECTOR_DIR = 'data/vectors/'

print("Loading PhoBERT Vectors...")

# 1. Load NER Data (3D Arrays: N_samples x Seq_Len x 768)
X_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'X_train_ner_phobert.npy'))
y_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'y_train_ner_phobert.npy'))
X_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'X_test_ner_phobert.npy'))
y_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'y_test_ner_phobert.npy'))

# 2. Load RE Data (2D Arrays: N_samples x 768)
X_train_re = np.load(os.path.join(VECTOR_DIR, 'X_train_re_phobert.npy'))
y_train_re = np.load(os.path.join(VECTOR_DIR, 'y_train_re_phobert.npy'))
X_test_re  = np.load(os.path.join(VECTOR_DIR, 'X_test_re_phobert.npy'))
y_test_re  = np.load(os.path.join(VECTOR_DIR, 'y_test_re_phobert.npy'))

print(f"NER Train Shape (3D): {X_train_ner_3d.shape}")
print(f"RE Train Shape (2D): {X_train_re.shape}")

Loading PhoBERT Vectors...
NER Train Shape (3D): (797, 256, 768)
RE Train Shape (2D): (277036, 768)


**Load Data**

In [139]:
print("Loading data files...")
TRAIN_PATH = 'data/04_model_input/train_dataset.json'
TEST_PATH = 'data/04_model_input/test_dataset.json'
DEV_PATH = 'data/04_model_input/dev_dataset.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_json = json.load(f)

with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_json = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_json = json.load(f)

print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [140]:
print("\nConverting to BIO format...")
train_sents = convert_label_studio_to_ner_data(train_json)
test_sents = convert_label_studio_to_ner_data(test_json)
dev_sents = convert_label_studio_to_ner_data(dev_json)


Converting to BIO format...


Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 797/797 [00:01<00:00, 656.26it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 114/114 [00:00<00:00, 702.72it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 228/228 [00:00<00:00, 645.49it/s]


In [141]:
train_sents[0][:20]

[('B√°n', 'O'),
 ('l√¥', 'B-TYPE'),
 ('ƒë·∫•t', 'I-TYPE'),
 ('100m¬≤', 'B-AREA'),
 ('ngay', 'O'),
 ('c·∫°nh', 'O'),
 ('nh√†', 'O'),
 ('thi_ƒë·∫•u', 'B-ATTR'),
 ('Th·ªß_Th·ª´a', 'I-ATTR'),
 (',', 'O'),
 ('view', 'B-ATTR'),
 ('h·ªì', 'I-ATTR'),
 ('sinh_th√°i', 'I-ATTR'),
 ('m√°t_m·∫ª', 'I-ATTR'),
 (',', 'O'),
 ('gi√°', 'O'),
 ('m·ªÅm', 'O'),
 (',', 'O'),
 ('x√¢y_d·ª±ng', 'O'),
 ('·ªü', 'O')]

**Extracting features**

In [142]:
print("\nExtracting features")
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

print(f"-> X_train shape: {len(X_train)} sentences")
print(f"-> X_test shape: {len(X_test)} sentences")


Extracting features
-> X_train shape: 797 sentences
-> X_test shape: 114 sentences


In [143]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
all_labels = sorted(list(set([label for sent in train_sents for token, label in sent])))
id2label = {i: label for i, label in enumerate(all_labels)}
print("Label Map:", id2label)

Label Map: {0: 'B-AREA', 1: 'B-ATTR', 2: 'B-LOC', 3: 'B-O', 4: 'B-ORG', 5: 'B-PER', 6: 'B-PRICE', 7: 'B-TYPE', 8: 'I-AREA', 9: 'I-ATTR', 10: 'I-LOC', 11: 'I-O', 12: 'I-ORG', 13: 'I-PER', 14: 'I-PRICE', 15: 'I-TYPE', 16: 'O'}


In [144]:
def decode_tags(y_ids_list, id2label):
    decoded_list = []
    for sent in y_ids_list:
        sent_decoded = [id2label[int(i)] for i in sent]
        decoded_list.append(sent_decoded)
    return decoded_list

In [145]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{'='*20} EVALUATION: {model_name} {'='*20}")
    
    # L·∫•y danh s√°ch nh√£n th·ª±c t·∫ø (tr·ª´ O ƒë·ªÉ F1-score ph·∫£n √°nh ƒë√∫ng th·ª±c th·ªÉ)
    labels = list(set([l for sent in y_true for l in sent]))
    if 'O' in labels: labels.remove('O')
    labels = sorted(labels)
    
    print(f"Accuracy: {metrics.flat_accuracy_score(y_true, y_pred):.4f}")
    print(metrics.flat_classification_report(
        y_true, y_pred, labels=labels, digits=4
    ))

### **NER**

**CRF (Conditional Random Fields)**

In [13]:
def prepare_data_for_crf_phobert(X_3d, y_3d, id2label):
    X_crf = []
    y_crf = []
    
    print("Converting PhoBERT vectors to CRF features")
    
    for i in range(len(y_3d)):
        vectors = X_3d[i]
        labels = y_3d[i]

        valid_mask = labels != -100
        
        if not np.any(valid_mask): continue
            
        sent_vectors = vectors[valid_mask]
        sent_labels = labels[valid_mask]
        
        sent_features = []
        for vec in sent_vectors:
            feat = {f'd_{idx}': float(val) for idx, val in enumerate(vec)}
            
            feat['bias'] = 1.0
            sent_features.append(feat)
            
        sent_tags = [id2label[int(l)] for l in sent_labels]
        
        X_crf.append(sent_features)
        y_crf.append(sent_tags)
        
    return X_crf, y_crf

In [None]:
print("\nProcessing Data for CRF...")
X_train_crf, y_train_crf = prepare_data_for_crf_phobert(X_train_ner_3d, y_train_ner_3d, id2label)
X_test_crf, y_test_crf   = prepare_data_for_crf_phobert(X_test_ner_3d, y_test_ner_3d, id2label)
print(f"-> Train samples: {len(X_train_crf)}")
print(f"-> Test samples: {len(X_test_crf)}")

In [148]:
print("Training CRF model")
crf = CRFModel(config.get('crf', {}))
crf.train(X_train_crf, y_train_crf)

Training CRF model


In [149]:
# Predict & Evaluate
y_pred_crf = crf.predict(X_test_crf)
evaluate_model(y_test_crf, y_pred_crf, "CRF")


Accuracy: 0.7500
              precision    recall  f1-score   support

      B-AREA     0.6327    0.6237    0.6282       279
      B-ATTR     0.7045    0.5701    0.6302       870
       B-LOC     0.7219    0.6816    0.7011       358
         B-O     0.0556    0.0250    0.0345        40
       B-ORG     0.6471    0.3667    0.4681        30
       B-PER     0.4667    0.4000    0.4308        35
     B-PRICE     0.8540    0.8182    0.8357       143
      B-TYPE     0.6928    0.6150    0.6516       187
      I-AREA     0.7976    0.8100    0.8038       579
      I-ATTR     0.6543    0.5997    0.6259      1594
       I-LOC     0.6713    0.6544    0.6627       596
         I-O     0.1429    0.0405    0.0632        74
       I-ORG     0.5833    0.6087    0.5957        46
       I-PER     0.5385    0.3500    0.4242        40
     I-PRICE     0.9079    0.7962    0.8484       260
      I-TYPE     0.3636    0.4533    0.4036       150

   micro avg     0.6854    0.6262    0.6545      5281
   macro

In [150]:
def process_ner_vectors(X_3d, y_3d):
    X_list = []
    y_list = []
    
    for i in range(len(y_3d)):
        # L·∫•y nh√£n c·ªßa c√¢u th·ª© i
        labels = y_3d[i]
        vectors = X_3d[i]
        
        valid_indices = labels != -100     
        X_sent = vectors[valid_indices]
        y_sent = labels[valid_indices]
        
        if len(y_sent) > 0:
            X_list.append(X_sent)
            y_list.append(y_sent)
            
    return X_list, y_list

print("Processing NER vectors (Removing padding)...")
X_train_ner, y_train_ner = process_ner_vectors(X_train_ner_3d, y_train_ner_3d)
X_test_ner, y_test_ner = process_ner_vectors(X_test_ner_3d, y_test_ner_3d)

print(f"-> NER Train sentences: {len(X_train_ner)}")
print(f"-> Sample sentence length: {len(X_train_ner[0])}")

Processing NER vectors (Removing padding)...
-> NER Train sentences: 797
-> Sample sentence length: 38


**SVM (Support Vector Machine)**

In [151]:
print("Training SVM Model")
svm = FlatModelWrapper('svm', config.get('svm', {}))
svm.train(X_train_ner, y_train_ner)

Training SVM Model


In [152]:
# Predict & Evaluate
y_pred_svm = svm.predict(X_test_ner)
y_test_tags = decode_tags(y_test_ner, id2label)
y_pred_tags = decode_tags(y_pred_svm, id2label)
evaluate_model(y_test_tags, y_pred_tags, "SVM")


Accuracy: 0.7444
              precision    recall  f1-score   support

      B-AREA     0.6455    0.4373    0.5214       279
      B-ATTR     0.7173    0.4287    0.5367       870
       B-LOC     0.6544    0.7458    0.6971       358
         B-O     0.0000    0.0000    0.0000        40
       B-ORG     0.6667    0.1333    0.2222        30
       B-PER     0.6154    0.4571    0.5246        35
     B-PRICE     0.8681    0.8741    0.8711       143
      B-TYPE     0.6244    0.7380    0.6765       187
      I-AREA     0.8045    0.8031    0.8038       579
      I-ATTR     0.6059    0.5885    0.5971      1594
       I-LOC     0.7323    0.6426    0.6845       596
         I-O     0.0000    0.0000    0.0000        74
       I-ORG     0.4737    0.3913    0.4286        46
       I-PER     0.6000    0.3750    0.4615        40
     I-PRICE     0.9437    0.8385    0.8880       260
      I-TYPE     0.5368    0.3400    0.4163       150

   micro avg     0.6842    0.5933    0.6355      5281
   macro

**MaxEnt (Maximum Entropy)**

In [153]:
print("Training MaxEnt Model")
maxent = FlatModelWrapper('maxent', config.get('maxent', {}))
maxent.train(X_train_ner, y_train_ner)

Training MaxEnt Model




In [154]:
y_pred_maxent = maxent.predict(X_test_ner)
y_test_tags = decode_tags(y_test_ner, id2label)
y_pred_tags = decode_tags(y_pred_maxent, id2label)
evaluate_model(y_test_tags, y_pred_tags, "MaxEnt")


Accuracy: 0.7544
              precision    recall  f1-score   support

      B-AREA     0.6512    0.6022    0.6257       279
      B-ATTR     0.7064    0.5862    0.6407       870
       B-LOC     0.6789    0.7207    0.6992       358
         B-O     0.0000    0.0000    0.0000        40
       B-ORG     0.5333    0.2667    0.3556        30
       B-PER     0.5000    0.4571    0.4776        35
     B-PRICE     0.8652    0.8531    0.8592       143
      B-TYPE     0.6875    0.6471    0.6667       187
      I-AREA     0.8229    0.8187    0.8208       579
      I-ATTR     0.6652    0.5571    0.6064      1594
       I-LOC     0.7038    0.6896    0.6966       596
         I-O     0.0938    0.0405    0.0566        74
       I-ORG     0.5000    0.5217    0.5106        46
       I-PER     0.5862    0.4250    0.4928        40
     I-PRICE     0.9106    0.8615    0.8854       260
      I-TYPE     0.4452    0.4600    0.4525       150

   micro avg     0.6972    0.6273    0.6604      5281
   macro

In [None]:
def visualize_ner_predictions_ml(test_json, y_true_list, y_pred_list, id2label, tokenizer, num_samples=2):
    indices = random.sample(range(len(y_pred_list)), min(num_samples, len(y_pred_list)))
    
    for idx in indices:
        print(f"\nSentence #{idx}:")
        
        raw_text = test_json[idx]['data']['text']
        tokens = tokenizer.tokenize(raw_text)
        
        true_seq = y_true_list[idx]
        pred_seq = y_pred_list[idx]

        min_len = min(len(tokens), len(true_seq), len(pred_seq))
        tokens_show = tokens[:min_len]
        
        true_labels = []
        for x in true_seq[:min_len]:
            if isinstance(x, (int, np.integer, float, np.float64)):
                true_labels.append(id2label[int(x)])
            else:
                true_labels.append(str(x))
                
        # X·ª≠ l√Ω Pred Label
        pred_labels = []
        for x in pred_seq[:min_len]:
            if isinstance(x, (int, np.integer, float, np.float64)):
                pred_labels.append(id2label[int(x)])
            else:
                pred_labels.append(str(x))
        results = ['‚úÖ' if t == p else '‚ùå' for t, p in zip(true_labels, pred_labels)]
        
        correct = sum(1 for r in results if r == '‚úÖ')
        print(f"-> Text: {raw_text[:100]}...") 
        print(f"-> Correct: {correct}/{len(results)} ({correct/len(results):.1%})")
        
        # 4. T·∫°o DataFrame
        df = pd.DataFrame({
            'Subword': tokens_show,
            'Th·ª±c t·∫ø (True)': true_labels,
            'D·ª± ƒëo√°n (Pred)': pred_labels,
            'K·∫øt qu·∫£': results
        })
        display(df)

In [157]:
print("\n=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: CRF ===")
y_pred_crf = crf.predict(X_test_crf)
visualize_ner_predictions_ml(
    test_json,
    y_test_tags,
    y_pred_crf,
    id2label,
    tokenizer,
    num_samples=1
)


=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: CRF ===

üìå Sentence #86:
-> Text: L√¥ g√≥c 2 m·∫∑t ti·ªÅn KQH H·∫£i D∆∞∆°ng, TP Hu·∫ø Di·ªán t√≠ch: 175m¬≤ (ngang 11,8m) ƒê∆∞·ªùng quy ho·∫°ch: 10m 5m H∆∞·ªõng...
-> Correct: 58/77 (75.3%)


Unnamed: 0,Subword,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,L√¥,B-TYPE,B-TYPE,‚úÖ
1,g√≥c,O,I-TYPE,‚ùå
2,2,B-ATTR,B-ATTR,‚úÖ
3,m·∫∑t,I-ATTR,I-ATTR,‚úÖ
4,ti·ªÅn,B-LOC,B-LOC,‚úÖ
...,...,...,...,...
72,tho√°ng,B-PRICE,B-PRICE,‚úÖ
73,ƒë·∫π@@,I-PRICE,I-PRICE,‚úÖ
74,p@@,I-PRICE,I-PRICE,‚úÖ
75,",",B-PER,B-LOC,‚ùå


In [159]:
print("\n=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: SVM ===")
y_pred_svm = svm.predict(X_test_ner)
visualize_ner_predictions_ml(
    test_json,
    y_test_tags,
    y_pred_svm,
    id2label,
    tokenizer,
    num_samples=1
)


=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: SVM ===

üìå Sentence #96:
-> Text: C·∫ßn b√°n n·ªÅn MT ƒë∆∞·ªùng Nguy·ªÖn ƒê·ªá, ph∆∞·ªùng C√°i Kh·∫ø Full th·ªï c∆∞ Di·ªán t√≠ch: 5m x 30m H∆∞·ªõng: T√¢y B·∫Øc L·ªô gi·ªõ...
-> Correct: 46/53 (86.8%)


Unnamed: 0,Subword,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,C·∫ßn,O,O,‚úÖ
1,b√°n,O,O,‚úÖ
2,n·ªÅn,B-TYPE,B-TYPE,‚úÖ
3,MT,B-ATTR,B-ATTR,‚úÖ
4,ƒë∆∞·ªùng,B-LOC,B-LOC,‚úÖ
5,Nguy·ªÖn,I-LOC,I-LOC,‚úÖ
6,ƒê@@,I-LOC,I-LOC,‚úÖ
7,·ªá@@,I-LOC,O,‚ùå
8,",",B-LOC,B-LOC,‚úÖ
9,ph∆∞·ªùng,I-LOC,I-LOC,‚úÖ


In [160]:
print("\n=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: MAXENT ===")
y_pred_maxent = maxent.predict(X_test_ner)
visualize_ner_predictions_ml(
    test_json,
    y_test_tags,
    y_pred_maxent,
    id2label,
    tokenizer,
    num_samples=1
)


=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: MAXENT ===

üìå Sentence #6:
-> Text: Ch√≠nh ch·ªß b√°n ƒë·∫•t m·∫∑t ƒë∆∞·ªùng Xu√¢n Th·ªßy ƒëi c·ª≠a kh·∫©u Thanh Th·ªßy, ƒë∆∞·ªùng ƒëang m·ªü r·ªông: M·∫∑t ti·ªÅn 15 m, s√¢u...
-> Correct: 41/52 (78.8%)


Unnamed: 0,Subword,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Ch√≠nh,B-PER,B-PER,‚úÖ
1,ch·ªß,I-PER,I-PER,‚úÖ
2,b√°n,O,O,‚úÖ
3,ƒë·∫•t,B-TYPE,B-TYPE,‚úÖ
4,m·∫∑t,B-LOC,B-LOC,‚úÖ
5,ƒë∆∞·ªùng,B-LOC,I-LOC,‚ùå
6,Xu√¢n,I-LOC,I-LOC,‚úÖ
7,Th·ªß@@,I-LOC,I-ATTR,‚ùå
8,y,I-LOC,I-LOC,‚úÖ
9,ƒëi,B-LOC,B-LOC,‚úÖ


### **RE**

In [161]:
print("Creating pairs from Training Data...")
train_pairs = prepare_re_data_from_json(train_json)
test_pairs = prepare_re_data_from_json(test_json)

print(f"-> Total Train Pairs: {len(train_pairs)}")
print(f"-> Total Test Pairs: {len(test_pairs)}")

all_labels = sorted(list(set([p['label'] for p in train_pairs])))
re_id2label = {i: label for i, label in enumerate(all_labels)}
print("Label Mapping:", re_id2label)

Creating pairs from Training Data...
-> Total Train Pairs: 277036
-> Total Test Pairs: 47950
Label Mapping: {0: 'HAS_AREA', 1: 'HAS_ATTR', 2: 'HAS_PRICE', 3: 'LOCATED_AT', 4: 'NO_RELATION'}


In [162]:
print("Training SVM for RE")
svm = RelationExtractionModel('svm', config.get('svm', {}))
svm.train(X_train_re, y_train_re)

Training SVM for RE
Model SVM: Using Pre-computed PhoBERT Vectors


In [163]:
print("\n--- SVM Evaluation ---")
y_pred_svm = svm.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_svm)
class_ids = list(svm.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- SVM Evaluation ---
Accuracy: 0.9658 (96.58%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [164]:
print("Training Maxent for RE")
maxent = RelationExtractionModel('maxent', config.get('maxent', {}))
maxent.train(X_train_re, y_train_re)

Training Maxent for RE
Model MAXENT: Using Pre-computed PhoBERT Vectors




In [165]:
print("\n--- Maxent Evaluation ---")
y_pred_maxent = maxent.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_maxent)

class_ids = list(maxent.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- Maxent Evaluation ---
Accuracy: 0.9319 (93.19%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [166]:
print("\nTraining Random Forest for RE")
rf = RelationExtractionModel('random_forest', config.get('random_forest', {}))
rf.train(X_train_re, y_train_re)


Training Random Forest for RE
Model RANDOM_FOREST: Using Pre-computed PhoBERT Vectors


In [167]:
print("\n--- Random Forest Evaluation ---")
y_pred_rf = rf.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_rf)

class_ids = list(rf.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- Random Forest Evaluation ---
Accuracy: 0.9659 (96.59%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [168]:
def visualize_re_model_comparison(test_pairs, y_true_ids, preds_dict, id2label, num_samples=10):
    print(f"\n{'='*30} RE MODEL COMPARISON {'='*30}")
    
    # Ch·ªçn index ng·∫´u nhi√™n
    indices = random.sample(range(len(test_pairs)), min(num_samples, len(test_pairs)))
    
    data = []
    for idx in indices:
        pair = test_pairs[idx]
        ent1 = pair['ent1']['text']
        ent2 = pair['ent2']['text']
        
        # L·∫•y nh√£n th·ª±c t·∫ø (Map t·ª´ ID sang Ch·ªØ)
        true_id = y_true_ids[idx]
        true_label = id2label[true_id]
        
        row = {
            'C·∫∑p th·ª±c th·ªÉ': f"{ent1} ‚û° {ent2}",
            'Th·ª±c t·∫ø': true_label
        }
        
        # Duy·ªát qua c√°c model
        for model_name, y_pred in preds_dict.items():
            pred_id = y_pred[idx]
            pred_label = id2label[pred_id]
            
            status = "‚úÖ" if pred_id == true_id else "‚ùå"
            row[model_name] = f"{pred_label} {status}"
            
        data.append(row)
    
    df = pd.DataFrame(data)
    display(df)

In [170]:
preds_dict = {
    'SVM': y_pred_svm,
    'MaxEnt': y_pred_maxent,
    'RandomForest': y_pred_rf
}

visualize_re_model_comparison(test_pairs, y_test_re, preds_dict, re_id2label, num_samples=15)




Unnamed: 0,C·∫∑p th·ª±c th·ªÉ,Th·ª±c t·∫ø,SVM,MaxEnt,RandomForest
0,g·∫ßn b·ªánh vi·ªán ‚û° g·∫ßn ch·ª£,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
1,"94,97m¬≤ ‚û° 6, x t·ª∑",NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
2,l√†m vƒÉn ph√≤ng ‚û° c√¥ng nh·∫≠n 145m2 145m2,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
3,CƒÉn h·ªô ‚û° gym,HAS_ATTR,NO_RELATION ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå
4,"S1416 ‚û° 82,33 m¬≤",NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
5,Ch·ªß nh√† ‚û° an to√†n,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
6,tr·∫°m ‚û° x√¢y nh√† ·ªü,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
7,Vinhomes Central Park ‚û° 188m2,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
8,"S1803 ‚û° 4, x t·ª∑",NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
9,1 tr·ªát 2 l·∫ßu ‚û° ƒê∆∞·ªùng nh·ª±a 8m,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
