**Library**

In [33]:
import sys
import os
import yaml
import json
import pandas as pd
import random
import numpy as np
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report, accuracy_score

In [34]:
sys.path.append(os.path.abspath(os.path.join('..')))
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens, get_relation_features
from src.models.machine_learning import CRFModel, FlatModelWrapper, RelationExtractionModel
from src.data_loader.dataset import convert_label_studio_to_ner_data, prepare_re_data_from_json
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer

**Load Config**

In [35]:
with open('configs/ml_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
for model_name in ['svm', 'maxent', 'random_forest']:
    if model_name in config:
        config[model_name]['vectorizer'] = 'phobert'

print("‚úÖ Config updated for PhoBERT vectors!")

‚úÖ Config updated for PhoBERT vectors!


**Load Vector**

In [36]:
VECTOR_DIR = 'data/vectors/'

print("Loading PhoBERT Vectors...")

# 1. Load NER Data (3D Arrays: N_samples x Seq_Len x 768)
X_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'X_train_ner_phobert.npy'))
y_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'y_train_ner_phobert.npy'))
X_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'X_test_ner_phobert.npy'))
y_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'y_test_ner_phobert.npy'))

# 2. Load RE Data (2D Arrays: N_samples x 768)
X_train_re = np.load(os.path.join(VECTOR_DIR, 'X_train_re_phobert.npy'))
y_train_re = np.load(os.path.join(VECTOR_DIR, 'y_train_re_phobert.npy'))
X_test_re  = np.load(os.path.join(VECTOR_DIR, 'X_test_re_phobert.npy'))
y_test_re  = np.load(os.path.join(VECTOR_DIR, 'y_test_re_phobert.npy'))

print(f"NER Train Shape (3D): {X_train_ner_3d.shape}")
print(f"RE Train Shape (2D): {X_train_re.shape}")

Loading PhoBERT Vectors...
NER Train Shape (3D): (797, 256, 768)
RE Train Shape (2D): (277036, 768)


**Load Data**

In [37]:
print("Loading data files...")
TRAIN_PATH = 'data/04_model_input/train_dataset.json'
TEST_PATH = 'data/04_model_input/test_dataset.json'
DEV_PATH = 'data/04_model_input/dev_dataset.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_json = json.load(f)

with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_json = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_json = json.load(f)

print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [38]:
print("\nConverting to BIO format...")
train_sents = convert_label_studio_to_ner_data(train_json)
test_sents = convert_label_studio_to_ner_data(test_json)
dev_sents = convert_label_studio_to_ner_data(dev_json)


Converting to BIO format...


Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 797/797 [00:01<00:00, 724.35it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 114/114 [00:00<00:00, 681.76it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 228/228 [00:00<00:00, 708.66it/s]


In [39]:
train_sents[0][:20]

[('B√°n', 'O'),
 ('l√¥', 'B-TYPE'),
 ('ƒë·∫•t', 'I-TYPE'),
 ('100m¬≤', 'B-AREA'),
 ('ngay', 'O'),
 ('c·∫°nh', 'O'),
 ('nh√†', 'O'),
 ('thi_ƒë·∫•u', 'B-ATTR'),
 ('Th·ªß_Th·ª´a', 'I-ATTR'),
 (',', 'O'),
 ('view', 'B-ATTR'),
 ('h·ªì', 'I-ATTR'),
 ('sinh_th√°i', 'I-ATTR'),
 ('m√°t_m·∫ª', 'I-ATTR'),
 (',', 'O'),
 ('gi√°', 'O'),
 ('m·ªÅm', 'O'),
 (',', 'O'),
 ('x√¢y_d·ª±ng', 'O'),
 ('·ªü', 'O')]

**Extracting features**

In [40]:
print("\nExtracting features")
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

print(f"-> X_train shape: {len(X_train)} sentences")
print(f"-> X_test shape: {len(X_test)} sentences")


Extracting features
-> X_train shape: 797 sentences
-> X_test shape: 114 sentences


In [41]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
all_labels = sorted(list(set([label for sent in train_sents for token, label in sent])))
id2label = {i: label for i, label in enumerate(all_labels)}
print("Label Map:", id2label)

Label Map: {0: 'B-AREA', 1: 'B-ATTR', 2: 'B-LOC', 3: 'B-O', 4: 'B-ORG', 5: 'B-PER', 6: 'B-PRICE', 7: 'B-TYPE', 8: 'I-AREA', 9: 'I-ATTR', 10: 'I-LOC', 11: 'I-O', 12: 'I-ORG', 13: 'I-PER', 14: 'I-PRICE', 15: 'I-TYPE', 16: 'O'}


In [42]:
def decode_tags(y_ids_list, id2label):
    decoded_list = []
    for sent in y_ids_list:
        sent_decoded = [id2label[int(i)] for i in sent]
        decoded_list.append(sent_decoded)
    return decoded_list

In [43]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{'='*20} EVALUATION: {model_name} {'='*20}")
    
    # L·∫•y danh s√°ch nh√£n th·ª±c t·∫ø (tr·ª´ O ƒë·ªÉ F1-score ph·∫£n √°nh ƒë√∫ng th·ª±c th·ªÉ)
    labels = list(set([l for sent in y_true for l in sent]))
    if 'O' in labels: labels.remove('O')
    labels = sorted(labels)
    
    print(f"Accuracy: {metrics.flat_accuracy_score(y_true, y_pred):.4f}")
    print(metrics.flat_classification_report(
        y_true, y_pred, labels=labels, digits=4
    ))

### **NER**

**CRF (Conditional Random Fields)**

In [44]:
print("Training CRF model")
crf = CRFModel(config.get('crf', {}))
crf.train(X_train, y_train)

Training CRF model


In [45]:
# Predict & Evaluate
y_pred_crf = crf.predict(X_test)
evaluate_model(y_test, y_pred_crf, "CRF")


Accuracy: 0.7206
              precision    recall  f1-score   support

      B-AREA     0.6199    0.6136    0.6167       295
      B-ATTR     0.5854    0.4800    0.5275       900
       B-LOC     0.6005    0.6501    0.6243       363
         B-O     0.1515    0.1250    0.1370        40
       B-ORG     0.7500    0.1579    0.2609        38
       B-PER     0.4348    0.2703    0.3333        37
     B-PRICE     0.7803    0.8438    0.8108       160
      B-TYPE     0.6395    0.5528    0.5930       199
      I-AREA     0.7143    0.6229    0.6655       297
      I-ATTR     0.5596    0.5108    0.5341      1296
       I-LOC     0.5957    0.6336    0.6141       393
         I-O     0.1515    0.0806    0.1053        62
       I-ORG     0.7619    0.6957    0.7273        23
       I-PER     0.3750    0.2222    0.2791        27
     I-PRICE     0.7188    0.8789    0.7909       256
      I-TYPE     0.3734    0.3933    0.3831       150

   micro avg     0.5958    0.5560    0.5752      4536
   macro

In [46]:
def process_ner_vectors(X_3d, y_3d):
    X_list = []
    y_list = []
    
    for i in range(len(y_3d)):
        # L·∫•y nh√£n c·ªßa c√¢u th·ª© i
        labels = y_3d[i]
        vectors = X_3d[i]
        
        valid_indices = labels != -100     
        X_sent = vectors[valid_indices]
        y_sent = labels[valid_indices]
        
        if len(y_sent) > 0:
            X_list.append(X_sent)
            y_list.append(y_sent)
            
    return X_list, y_list

print("Processing NER vectors (Removing padding)...")
X_train_ner, y_train_ner = process_ner_vectors(X_train_ner_3d, y_train_ner_3d)
X_test_ner, y_test_ner = process_ner_vectors(X_test_ner_3d, y_test_ner_3d)

print(f"-> NER Train sentences: {len(X_train_ner)}")
print(f"-> Sample sentence length: {len(X_train_ner[0])}")

Processing NER vectors (Removing padding)...
-> NER Train sentences: 797
-> Sample sentence length: 38


**SVM (Support Vector Machine)**

In [47]:
print("Training SVM Model")
svm = FlatModelWrapper('svm', config.get('svm', {}))
svm.train(X_train_ner, y_train_ner)

Training SVM Model


In [62]:
# Predict & Evaluate
y_pred_svm = svm.predict(X_test_ner)
y_test_tags = decode_tags(y_test_ner, id2label)
y_pred_tags = decode_tags(y_pred_svm, id2label)
evaluate_model(y_test_tags, y_pred_tags, "SVM")


Accuracy: 0.7444
              precision    recall  f1-score   support

      B-AREA     0.6455    0.4373    0.5214       279
      B-ATTR     0.7173    0.4287    0.5367       870
       B-LOC     0.6544    0.7458    0.6971       358
         B-O     0.0000    0.0000    0.0000        40
       B-ORG     0.6667    0.1333    0.2222        30
       B-PER     0.6154    0.4571    0.5246        35
     B-PRICE     0.8681    0.8741    0.8711       143
      B-TYPE     0.6244    0.7380    0.6765       187
      I-AREA     0.8045    0.8031    0.8038       579
      I-ATTR     0.6059    0.5885    0.5971      1594
       I-LOC     0.7323    0.6426    0.6845       596
         I-O     0.0000    0.0000    0.0000        74
       I-ORG     0.4737    0.3913    0.4286        46
       I-PER     0.6000    0.3750    0.4615        40
     I-PRICE     0.9437    0.8385    0.8880       260
      I-TYPE     0.5368    0.3400    0.4163       150

   micro avg     0.6842    0.5933    0.6355      5281
   macro

**MaxEnt (Maximum Entropy)**

In [51]:
print("Training MaxEnt Model")
maxent = FlatModelWrapper('maxent', config.get('maxent', {}))
maxent.train(X_train_ner, y_train_ner)

Training MaxEnt Model




In [52]:
y_pred_maxent = maxent.predict(X_test_ner)
y_test_tags = decode_tags(y_test_ner, id2label)
y_pred_tags = decode_tags(y_pred_maxent, id2label)
evaluate_model(y_test_tags, y_pred_tags, "MaxEnt")


Accuracy: 0.7544
              precision    recall  f1-score   support

      B-AREA     0.6512    0.6022    0.6257       279
      B-ATTR     0.7064    0.5862    0.6407       870
       B-LOC     0.6789    0.7207    0.6992       358
         B-O     0.0000    0.0000    0.0000        40
       B-ORG     0.5333    0.2667    0.3556        30
       B-PER     0.5000    0.4571    0.4776        35
     B-PRICE     0.8652    0.8531    0.8592       143
      B-TYPE     0.6875    0.6471    0.6667       187
      I-AREA     0.8229    0.8187    0.8208       579
      I-ATTR     0.6652    0.5571    0.6064      1594
       I-LOC     0.7038    0.6896    0.6966       596
         I-O     0.0938    0.0405    0.0566        74
       I-ORG     0.5000    0.5217    0.5106        46
       I-PER     0.5862    0.4250    0.4928        40
     I-PRICE     0.9106    0.8615    0.8854       260
      I-TYPE     0.4452    0.4600    0.4525       150

   micro avg     0.6972    0.6273    0.6604      5281
   macro

In [55]:
def visualize_ner_predictions(test_sents, y_true, y_pred, num_samples=2):
    print(f"\n{'='*20} NER PREDICTION SAMPLES {'='*20}")
    
    # Ch·ªçn ng·∫´u nhi√™n m·ªôt v√†i c√¢u ƒë·ªÉ hi·ªÉn th·ªã
    indices = random.sample(range(len(test_sents)), min(num_samples, len(test_sents)))
    count = 0
    for idx in indices:
        print(f"\nüìå Sentence #{idx}:")
        tokens = [t[0] for t in test_sents[idx]] 
        true_labels = y_true[idx]
        pred_labels = y_pred[idx]

        results = ['‚úÖ' if t == p else '‚ùå' for t, p in zip(true_labels, pred_labels)]
        count += sum(1 for r in results if r == '‚úÖ')
        print(f"-> Correct Predictions in this sentence: {sum(1 for r in results if r == '‚úÖ')} out of {len(results)} tokens.")
        # T·∫°o DataFrame
        df = pd.DataFrame({
            'Token': tokens,
            'Th·ª±c t·∫ø (True)': true_labels,
            'D·ª± ƒëo√°n (Pred)': pred_labels,
            'K·∫øt qu·∫£': results
        })
        display(df)

In [66]:
def visualize_ner_predictions_ml(test_json, y_true_list, y_pred_list, id2label, tokenizer, num_samples=2):
    
    # Ch·ªçn ng·∫´u nhi√™n index
    indices = random.sample(range(len(y_pred_list)), min(num_samples, len(y_pred_list)))
    
    for idx in indices:
        print(f"\nüìå Sentence #{idx}:")
        
        # 1. L·∫•y text g·ªëc v√† tokenize
        raw_text = test_json[idx]['data']['text']
        tokens = tokenizer.tokenize(raw_text)
        
        # 2. L·∫•y nh√£n th·ª±c t·∫ø v√† d·ª± ƒëo√°n
        true_seq = y_true_list[idx]
        pred_seq = y_pred_list[idx]
        
        # C·∫Øt ng·∫Øn v·ªÅ ƒë·ªô d√†i chung
        min_len = min(len(tokens), len(true_seq), len(pred_seq))
        tokens_show = tokens[:min_len]
        
        # --- PH·∫¶N S·ª¨A L·ªñI: KI·ªÇM TRA KI·ªÇU D·ªÆ LI·ªÜU ---
        # X·ª≠ l√Ω True Label
        true_labels = []
        for x in true_seq[:min_len]:
            # N·∫øu l√† s·ªë (ID) -> Map qua id2label
            if isinstance(x, (int, np.integer, float, np.float64)):
                true_labels.append(id2label[int(x)])
            # N·∫øu l√† chu·ªói (Tag) -> D√πng lu√¥n
            else:
                true_labels.append(str(x))
                
        # X·ª≠ l√Ω Pred Label
        pred_labels = []
        for x in pred_seq[:min_len]:
            if isinstance(x, (int, np.integer, float, np.float64)):
                pred_labels.append(id2label[int(x)])
            else:
                pred_labels.append(str(x))
        # --------------------------------------------

        # 3. So s√°nh k·∫øt qu·∫£
        results = ['‚úÖ' if t == p else '‚ùå' for t, p in zip(true_labels, pred_labels)]
        
        correct = sum(1 for r in results if r == '‚úÖ')
        print(f"-> Text: {raw_text[:100]}...") 
        print(f"-> Correct: {correct}/{len(results)} ({correct/len(results):.1%})")
        
        # 4. T·∫°o DataFrame
        df = pd.DataFrame({
            'Subword': tokens_show,
            'Th·ª±c t·∫ø (True)': true_labels,
            'D·ª± ƒëo√°n (Pred)': pred_labels,
            'K·∫øt qu·∫£': results
        })
        display(df)

In [60]:
visualize_ner_predictions(test_sents, y_test, y_pred_crf, num_samples=2)



üìå Sentence #72:
-> Correct Predictions in this sentence: 37 out of 64 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Di·ªán_t√≠ch,O,O,‚úÖ
1,ngang,O,B-AREA,‚ùå
2,29mx22m,B-AREA,I-AREA,‚ùå
3,th·ªï_c∆∞,B-AREA,I-AREA,‚ùå
4,400m2,I-AREA,I-AREA,‚úÖ
...,...,...,...,...
59,gi√°,O,O,‚úÖ
60,ch·ªët,O,O,‚úÖ
61,cho,O,O,‚úÖ
62,c·∫£,O,O,‚úÖ



üìå Sentence #60:
-> Correct Predictions in this sentence: 149 out of 205 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Ch√≠nh,O,B-PER,‚ùå
1,ch·ªß,O,I-PER,‚ùå
2,c·∫ßn,O,O,‚úÖ
3,b√°n,O,O,‚úÖ
4,2480m2,B-AREA,B-AREA,‚úÖ
...,...,...,...,...
200,(,O,O,‚úÖ
201,th∆∞∆°ng_l∆∞·ª£ng,B-ATTR,O,‚ùå
202,kh√°ch,O,O,‚úÖ
203,thi·ªán_ch√≠,O,O,‚úÖ


In [73]:
print("\n=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: SVM ===")
y_pred_svm = svm.predict(X_test_ner)
visualize_ner_predictions_ml(
    test_json,
    y_test_tags,
    y_pred_svm,
    id2label,
    tokenizer,
    num_samples=1
)


=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: SVM ===

üìå Sentence #82:
-> Text: V·ªã tr√≠: Chung c∆∞ 1A1B Nguy·ªÖn ƒê√¨nh Chi·ªÉu, ph∆∞·ªùng ƒêa Kao, Qu·∫≠n 1 ngay trung t√¢m, thu·∫≠n ti·ªán di chuy·ªÉn ...
-> Correct: 98/126 (77.8%)


Unnamed: 0,Subword,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,V·ªã,O,O,‚úÖ
1,tr√≠@@,O,O,‚úÖ
2,:,B-TYPE,B-LOC,‚ùå
3,Chung,B-LOC,I-LOC,‚ùå
4,c∆∞,I-LOC,I-LOC,‚úÖ
...,...,...,...,...
121,to√†@@,O,O,‚úÖ
122,n@@,B-PRICE,B-PRICE,‚úÖ
123,",",I-PRICE,I-PRICE,‚úÖ
124,b·ªÅn,O,O,‚úÖ


In [69]:
print("\n=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: MAXENT ===")
y_pred_maxent = maxent.predict(X_test_ner)
visualize_ner_predictions_ml(
    test_json,          # Text g·ªëc
    y_test_tags,    # Nh√£n th·∫≠t (ID)
    y_pred_maxent,         # Nh√£n d·ª± ƒëo√°n (ID)
    id2label,           # B·∫£ng map (0->B-LOC)
    tokenizer,          # Tokenizer PhoBERT
    num_samples=1
)


=== K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN: MAXENT ===

üìå Sentence #79:
-> Text: Di·ªán t√≠ch: 4x17m (CN 68m2) K·∫øt c·∫•u: Nh√† c·∫•p 4 ti·ªán x√¢y m·ªõi Quy ho·∫°ch x√¢y d·ª±ng: H·∫ßm, tr·ªát, l·ª≠ng, 3 t·∫ß...
-> Correct: 35/40 (87.5%)


Unnamed: 0,Subword,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Di·ªán,O,O,‚úÖ
1,t√≠ch@@,O,O,‚úÖ
2,:,B-AREA,I-AREA,‚ùå
3,4x@@,I-AREA,I-AREA,‚úÖ
4,17@@,I-AREA,I-AREA,‚úÖ
5,m,O,O,‚úÖ
6,(@@,B-AREA,B-AREA,‚úÖ
7,CN,I-AREA,I-AREA,‚úÖ
8,68@@,I-AREA,I-AREA,‚úÖ
9,m@@,O,O,‚úÖ


### **RE**

In [82]:
print("Creating pairs from Training Data...")
train_pairs = prepare_re_data_from_json(train_json)
test_pairs = prepare_re_data_from_json(test_json)

print(f"-> Total Train Pairs: {len(train_pairs)}")
print(f"-> Total Test Pairs: {len(test_pairs)}")

all_labels = sorted(list(set([p['label'] for p in train_pairs])))
re_id2label = {i: label for i, label in enumerate(all_labels)}
print("Label Mapping:", re_id2label)

Creating pairs from Training Data...
-> Total Train Pairs: 277036
-> Total Test Pairs: 47950
Label Mapping: {0: 'HAS_AREA', 1: 'HAS_ATTR', 2: 'HAS_PRICE', 3: 'LOCATED_AT', 4: 'NO_RELATION'}


In [74]:
print("Training SVM for RE")
svm = RelationExtractionModel('svm', config.get('svm', {}))
svm.train(X_train_re, y_train_re)

Training SVM for RE
Model SVM: Using Pre-computed PhoBERT Vectors


In [131]:
print("\n--- SVM Evaluation ---")
y_pred_svm = svm.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_svm)
class_ids = list(svm.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- SVM Evaluation ---
Accuracy: 0.9658 (96.58%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [76]:
print("Training Maxent for RE")
maxent = RelationExtractionModel('maxent', config.get('maxent', {}))
maxent.train(X_train_re, y_train_re)

Training Maxent for RE
Model MAXENT: Using Pre-computed PhoBERT Vectors




In [132]:
print("\n--- Maxent Evaluation ---")
y_pred_maxent = maxent.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_maxent)

class_ids = list(maxent.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- Maxent Evaluation ---
Accuracy: 0.9319 (93.19%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [78]:
print("\nTraining Random Forest for RE")
rf = RelationExtractionModel('random_forest', config.get('random_forest', {}))
rf.train(X_train_re, y_train_re)


Training Random Forest for RE
Model RANDOM_FOREST: Using Pre-computed PhoBERT Vectors


In [133]:
print("\n--- Random Forest Evaluation ---")
y_pred_rf = rf.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_rf)

class_ids = list(rf.model.classes_)

no_rel_id = None
for idx, label in re_id2label.items():
    if label == 'NO_RELATION':
        no_rel_id = idx
        break
if no_rel_id is not None and no_rel_id in class_ids:
    report_ids = [i for i in class_ids if i != no_rel_id]
else:
    report_ids = class_ids

target_names = [re_id2label[i] for i in report_ids]

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(classification_report(
    y_test_re, 
    y_pred_svm, 
    labels=report_ids, 
    target_names=target_names, 
    digits=4
))


--- Random Forest Evaluation ---
Accuracy: 0.9659 (96.59%)
              precision    recall  f1-score   support

    HAS_AREA     0.0000    0.0000    0.0000       273
    HAS_ATTR     0.4167    0.0054    0.0107       919
   HAS_PRICE     0.0000    0.0000    0.0000       146
  LOCATED_AT     0.0000    0.0000    0.0000       297

   micro avg     0.3125    0.0031    0.0061      1635
   macro avg     0.1042    0.0014    0.0027      1635
weighted avg     0.2342    0.0031    0.0060      1635



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [134]:
def visualize_re_model_comparison(test_pairs, y_true_ids, preds_dict, id2label, num_samples=10):
    print(f"\n{'='*30} RE MODEL COMPARISON {'='*30}")
    
    # Ch·ªçn index ng·∫´u nhi√™n
    indices = random.sample(range(len(test_pairs)), min(num_samples, len(test_pairs)))
    
    data = []
    for idx in indices:
        pair = test_pairs[idx]
        ent1 = pair['ent1']['text']
        ent2 = pair['ent2']['text']
        
        # L·∫•y nh√£n th·ª±c t·∫ø (Map t·ª´ ID sang Ch·ªØ)
        true_id = y_true_ids[idx]
        true_label = id2label[true_id]
        
        row = {
            'C·∫∑p th·ª±c th·ªÉ': f"{ent1} ‚û° {ent2}",
            'Th·ª±c t·∫ø': true_label
        }
        
        # Duy·ªát qua c√°c model
        for model_name, y_pred in preds_dict.items():
            pred_id = y_pred[idx]
            pred_label = id2label[pred_id]
            
            status = "‚úÖ" if pred_id == true_id else "‚ùå"
            row[model_name] = f"{pred_label} {status}"
            
        data.append(row)
    
    df = pd.DataFrame(data)
    display(df)

In [None]:
preds_dict = {
    'SVM': y_pred_svm,
    'MaxEnt': y_pred_maxent,
    'RandomForest': y_pred_rf
}

visualize_re_model_comparison(test_pairs, y_test_re, preds_dict, re_id2label, num_samples=15)




Unnamed: 0,C·∫∑p th·ª±c th·ªÉ,Th·ª±c t·∫ø,SVM,MaxEnt,RandomForest
0,3 ng·ªß ‚û° Vinh,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
1,Nam H·ªôi An ‚û° Anh Ch·ªã,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
2,l√†m t√†i s·∫£n l√¢u d√†i ‚û° ƒë·∫ßu t∆∞,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
3,khu Nam Long II ‚û° Gi√° b√°n,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
4,"95,39m¬≤ ‚û° 94,97m¬≤",NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
5,bi·ªát th·ª± ‚û° l√¥ g√≥c,HAS_ATTR,NO_RELATION ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå
6,G√≤ D·∫ßu ‚û° T√¢y Ninh,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
7,m·∫∑t ti·ªÅn T·ªânh L·ªô 8 ‚û° Ti·ªán √≠ch xung quanh thu·∫≠n...,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
8,"S1616 ‚û° 4, x t·ª∑",NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
9,ƒê·∫ßu T∆∞ Ngon ‚û° nhu c·∫ßu thu√™ ·ªü cao,NO_RELATION,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ,NO_RELATION ‚úÖ
