**Library**

In [36]:
import sys
import os
import yaml
import json
import pandas as pd
import random
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report, accuracy_score

In [2]:
sys.path.append(os.path.abspath(os.path.join('..')))
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens, get_relation_features
from src.models.machine_learning import CRFModel, FlatModelWrapper
from src.data_loader.dataset import convert_label_studio_to_ner_data, prepare_re_data_from_json
from sklearn.ensemble import RandomForestClassifier

**Load Config**

In [3]:
with open('../configs/ml_config.yaml', 'r') as f:
    config = yaml.safe_load(f)

**Load Data**

In [4]:
print("Loading data files...")
TRAIN_PATH = '../data/04_model_input/train_dataset.json'
TEST_PATH = '../data/04_model_input/test_dataset.json'
DEV_PATH = '../data/04_model_input/dev_dataset.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_json = json.load(f)

with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_json = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_json = json.load(f)

print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [5]:
print("\nConverting to BIO format...")
train_sents = convert_label_studio_to_ner_data(train_json)
test_sents = convert_label_studio_to_ner_data(test_json)
dev_sents = convert_label_studio_to_ner_data(dev_json)


Converting to BIO format...


Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 797/797 [00:01<00:00, 764.37it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 114/114 [00:00<00:00, 865.15it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 228/228 [00:00<00:00, 807.05it/s]


In [6]:
train_sents[0][:20]

[('B√°n', 'O'),
 ('l√¥', 'B-TYPE'),
 ('ƒë·∫•t', 'I-TYPE'),
 ('100m¬≤', 'B-AREA'),
 ('ngay', 'O'),
 ('c·∫°nh', 'O'),
 ('nh√†', 'O'),
 ('thi_ƒë·∫•u', 'B-ATTR'),
 ('Th·ªß_Th·ª´a', 'I-ATTR'),
 (',', 'O'),
 ('view', 'B-ATTR'),
 ('h·ªì', 'I-ATTR'),
 ('sinh_th√°i', 'I-ATTR'),
 ('m√°t_m·∫ª', 'I-ATTR'),
 (',', 'O'),
 ('gi√°', 'O'),
 ('m·ªÅm', 'O'),
 (',', 'O'),
 ('x√¢y_d·ª±ng', 'O'),
 ('·ªü', 'O')]

**Extracting features**

In [7]:
print("\nExtracting features")
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

print(f"-> X_train shape: {len(X_train)} sentences")
print(f"-> X_test shape: {len(X_test)} sentences")


Extracting features
-> X_train shape: 797 sentences
-> X_test shape: 114 sentences


In [8]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{'='*20} EVALUATION: {model_name} {'='*20}")
    
    # L·∫•y danh s√°ch nh√£n th·ª±c t·∫ø (tr·ª´ O ƒë·ªÉ F1-score ph·∫£n √°nh ƒë√∫ng th·ª±c th·ªÉ)
    labels = list(set([l for sent in y_true for l in sent]))
    if 'O' in labels: labels.remove('O')
    labels = sorted(labels)
    
    print(f"Accuracy: {metrics.flat_accuracy_score(y_true, y_pred):.4f}")
    print(metrics.flat_classification_report(
        y_true, y_pred, labels=labels, digits=4
    ))

### **NER**

**CRF (Conditional Random Fields)**

In [9]:
print("Training CRF model")
crf = CRFModel(config.get('crf', {}))
crf.train(X_train, y_train)

Training CRF model


In [10]:
# Predict & Evaluate
y_pred_crf = crf.predict(X_test)
evaluate_model(y_test, y_pred_crf, "CRF")


Accuracy: 0.7206
              precision    recall  f1-score   support

      B-AREA     0.6199    0.6136    0.6167       295
      B-ATTR     0.5854    0.4800    0.5275       900
       B-LOC     0.6005    0.6501    0.6243       363
         B-O     0.1515    0.1250    0.1370        40
       B-ORG     0.7500    0.1579    0.2609        38
       B-PER     0.4348    0.2703    0.3333        37
     B-PRICE     0.7803    0.8438    0.8108       160
      B-TYPE     0.6395    0.5528    0.5930       199
      I-AREA     0.7143    0.6229    0.6655       297
      I-ATTR     0.5596    0.5108    0.5341      1296
       I-LOC     0.5957    0.6336    0.6141       393
         I-O     0.1515    0.0806    0.1053        62
       I-ORG     0.7619    0.6957    0.7273        23
       I-PER     0.3750    0.2222    0.2791        27
     I-PRICE     0.7188    0.8789    0.7909       256
      I-TYPE     0.3734    0.3933    0.3831       150

   micro avg     0.5958    0.5560    0.5752      4536
   macro

In [11]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # L∆∞u model
# crf.save(os.path.join(OUTPUT_MODEL_DIR, 'crf_model.pkl'))

**SVM (Support Vector Machine)**

In [12]:
print("Training SVM Model")
svm = FlatModelWrapper('svm', config.get('svm', {}))
svm.train(X_train, y_train)

Training SVM Model


In [13]:
# Predict & Evaluate
y_pred_svm = svm.predict(X_test)
evaluate_model(y_test, y_pred_svm, "SVM")


Accuracy: 0.7439
              precision    recall  f1-score   support

      B-AREA     0.6278    0.6576    0.6424       295
      B-ATTR     0.5922    0.5422    0.5661       900
       B-LOC     0.6677    0.6198    0.6429       363
         B-O     0.2000    0.1000    0.1333        40
       B-ORG     0.5714    0.1053    0.1778        38
       B-PER     0.4167    0.1351    0.2041        37
     B-PRICE     0.8070    0.8625    0.8338       160
      B-TYPE     0.6465    0.6432    0.6448       199
      I-AREA     0.6704    0.6094    0.6384       297
      I-ATTR     0.6077    0.5355    0.5693      1296
       I-LOC     0.6061    0.6616    0.6326       393
         I-O     0.1176    0.0323    0.0506        62
       I-ORG     0.5714    0.1739    0.2667        23
       I-PER     0.5000    0.3333    0.4000        27
     I-PRICE     0.7419    0.8984    0.8127       256
      I-TYPE     0.4819    0.5333    0.5063       150

   micro avg     0.6245    0.5833    0.6032      4536
   macro

In [14]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # L∆∞u model
# svm.save(os.path.join(OUTPUT_MODEL_DIR, 'svm_model.pkl'))

**MaxEnt (Maximum Entropy)**

In [15]:
print("Training MaxEnt Model")
maxent = FlatModelWrapper('maxent', config.get('maxent', {}))
maxent.train(X_train, y_train)

Training MaxEnt Model




In [16]:
y_pred_maxent = maxent.predict(X_test)
evaluate_model(y_test, y_pred_maxent, "MaxEnt")


Accuracy: 0.7291
              precision    recall  f1-score   support

      B-AREA     0.6242    0.6475    0.6356       295
      B-ATTR     0.6197    0.4744    0.5374       900
       B-LOC     0.5878    0.6639    0.6235       363
         B-O     0.1143    0.1000    0.1067        40
       B-ORG     0.6000    0.0789    0.1395        38
       B-PER     0.5000    0.3514    0.4127        37
     B-PRICE     0.7500    0.8438    0.7941       160
      B-TYPE     0.6519    0.5930    0.6211       199
      I-AREA     0.7160    0.6027    0.6545       297
      I-ATTR     0.6024    0.5085    0.5515      1296
       I-LOC     0.6081    0.6081    0.6081       393
         I-O     0.2273    0.0806    0.1190        62
       I-ORG     0.6667    0.2609    0.3750        23
       I-PER     0.4375    0.2593    0.3256        27
     I-PRICE     0.7324    0.8555    0.7892       256
      I-TYPE     0.4313    0.4600    0.4452       150

   micro avg     0.6172    0.5545    0.5841      4536
   macro

In [17]:
# OUTPUT_MODEL_DIR = '../outputs/models' 
# os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# # L∆∞u model
# maxent.save(os.path.join(OUTPUT_MODEL_DIR, 'maxent_model.pkl'))

In [67]:
def visualize_ner_predictions(test_sents, y_true, y_pred, num_samples=2):
    print(f"\n{'='*20} NER PREDICTION SAMPLES {'='*20}")
    
    # Ch·ªçn ng·∫´u nhi√™n m·ªôt v√†i c√¢u ƒë·ªÉ hi·ªÉn th·ªã
    indices = random.sample(range(len(test_sents)), min(num_samples, len(test_sents)))
    count = 0
    for idx in indices:
        print(f"\nüìå Sentence #{idx}:")
        tokens = [t[0] for t in test_sents[idx]] 
        true_labels = y_true[idx]
        pred_labels = y_pred[idx]

        results = ['‚úÖ' if t == p else '‚ùå' for t, p in zip(true_labels, pred_labels)]
        count += sum(1 for r in results if r == '‚úÖ')
        print(f"-> Correct Predictions in this sentence: {sum(1 for r in results if r == '‚úÖ')} out of {len(results)} tokens.")
        # T·∫°o DataFrame
        df = pd.DataFrame({
            'Token': tokens,
            'Th·ª±c t·∫ø (True)': true_labels,
            'D·ª± ƒëo√°n (Pred)': pred_labels,
            'K·∫øt qu·∫£': results
        })
        display(df)

In [66]:
print("\nVisualizing NER Predictions for CRF Model")
visualize_ner_predictions(test_sents, y_test, y_pred_crf)


Visualizing NER Predictions for CRF Model


üìå Sentence #65:
-> Correct Predictions in this sentence: 35 out of 58 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,ƒê·∫•t,B-TYPE,B-TYPE,‚úÖ
1,ng·ªôp,O,O,‚úÖ
2,ch·ªß,B-PER,O,‚ùå
3,c·∫ßn,O,O,‚úÖ
4,ti·ªÅn,O,O,‚úÖ
5,g·∫•p,O,O,‚úÖ
6,Di·ªán_t√≠ch,O,O,‚úÖ
7,2084m2,B-AREA,B-AREA,‚úÖ
8,ngang,B-AREA,I-AREA,‚ùå
9,20m,I-AREA,I-AREA,‚úÖ



üìå Sentence #37:
-> Correct Predictions in this sentence: 47 out of 60 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,N·∫±m,O,O,‚úÖ
1,ngay,B-ATTR,O,‚ùå
2,g·∫ßn,I-ATTR,B-ATTR,‚ùå
3,ch·ª£,I-ATTR,I-ATTR,‚úÖ
4,B√†,I-ATTR,I-ATTR,‚úÖ
5,Chi·ªÉu,I-ATTR,O,‚ùå
6,",",O,O,‚úÖ
7,h∆∞·ªõng,B-ATTR,O,‚ùå
8,ra,I-ATTR,O,‚ùå
9,C·∫ßu,I-ATTR,B-LOC,‚ùå


In [69]:
print("\nVisualizing NER Predictions for SVM Model")
visualize_ner_predictions(test_sents, y_test, y_pred_svm)


Visualizing NER Predictions for SVM Model


üìå Sentence #45:
-> Correct Predictions in this sentence: 80 out of 94 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Nguy·ªÖn_Th∆∞·ª£ng_Hi·ªÅn,B-LOC,B-LOC,‚úÖ
1,",",O,O,‚úÖ
2,ph∆∞·ªùng,B-LOC,B-LOC,‚úÖ
3,B√¨nh_L·ª£i_Trung,I-LOC,I-LOC,‚úÖ
4,",",O,O,‚úÖ
...,...,...,...,...
89,c√≥,O,O,‚úÖ
90,t√¢m,O,O,‚úÖ
91,v√†,O,O,‚úÖ
92,c√≥,O,O,‚úÖ



üìå Sentence #77:
-> Correct Predictions in this sentence: 68 out of 82 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,M·ªü,O,O,‚úÖ
1,b√°n,O,O,‚úÖ
2,ƒë·∫•t,B-TYPE,B-TYPE,‚úÖ
3,n·ªÅn,I-TYPE,I-TYPE,‚úÖ
4,khu,B-LOC,B-LOC,‚úÖ
...,...,...,...,...
77,trung_t√¢m,B-ATTR,B-ATTR,‚úÖ
78,th∆∞∆°ng_m·∫°i,I-ATTR,I-ATTR,‚úÖ
79,(,O,O,‚úÖ
80,Minh_Thi·ªán,B-PER,B-PER,‚úÖ


In [68]:
print("\nVisualizing NER Predictions for Maxent Model")
visualize_ner_predictions(test_sents, y_test, y_pred_maxent)


Visualizing NER Predictions for Maxent Model


üìå Sentence #42:
-> Correct Predictions in this sentence: 49 out of 62 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,25,B-PRICE,B-PRICE,‚úÖ
1,t·ª∑,I-PRICE,I-PRICE,‚úÖ
2,Ngay,B-ATTR,O,‚ùå
3,b√™n,I-ATTR,I-ATTR,‚úÖ
4,KCN,I-ATTR,I-ATTR,‚úÖ
...,...,...,...,...
57,l√†m,I-ATTR,O,‚ùå
58,nh√†_v∆∞·ªùn,I-ATTR,I-ATTR,‚úÖ
59,",",O,O,‚úÖ
60,x√¢y,B-ATTR,B-ATTR,‚úÖ



üìå Sentence #74:
-> Correct Predictions in this sentence: 39 out of 50 tokens.


Unnamed: 0,Token,Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Gia_ƒë√¨nh,O,O,‚úÖ
1,c·∫ßn,O,O,‚úÖ
2,b√°n,O,O,‚úÖ
3,g·∫•p,O,O,‚úÖ
4,2,O,O,‚úÖ
5,mi·∫øng,O,O,‚úÖ
6,ƒë·∫•t,B-TYPE,B-TYPE,‚úÖ
7,(,O,O,‚úÖ
8,5x27m,B-AREA,B-AREA,‚úÖ
9,),O,O,‚úÖ


### **RE**

In [18]:
print("Creating pairs from Training Data...")
# L∆∞u √Ω: train_json, test_json ƒë√£ ƒë∆∞·ª£c load ·ªü ph·∫ßn NER b√™n tr√™n
train_pairs = prepare_re_data_from_json(train_json)
test_pairs = prepare_re_data_from_json(test_json)

print(f"-> Total Train Pairs: {len(train_pairs)}")
print(f"-> Total Test Pairs: {len(test_pairs)}")

# 3. Tr√≠ch xu·∫•t ƒë·∫∑c tr∆∞ng & Vector h√≥a
print("\nExtracting features for Pairs...")

Creating pairs from Training Data...
-> Total Train Pairs: 277036
-> Total Test Pairs: 47950

Extracting features for Pairs...


In [19]:
def extract_re_features(pairs):
    X = []
    y = []
    for p in pairs:
        feats = get_relation_features(p['text'], p['ent1'], p['ent2'])
        X.append(feats)
        y.append(p['label'])
    return X, y

In [20]:
X_train_re_dict, y_train_re = extract_re_features(train_pairs)
X_test_re_dict, y_test_re = extract_re_features(test_pairs)

In [22]:
from sklearn.feature_extraction import DictVectorizer

In [23]:
re_vectorizer = DictVectorizer(sparse=True)
X_train_re = re_vectorizer.fit_transform(X_train_re_dict)
X_test_re = re_vectorizer.transform(X_test_re_dict)

print(f"RE Feature Matrix Shape: {X_train_re.shape}")

RE Feature Matrix Shape: (277036, 92)


In [24]:
# 4. Train Model RE (Random Forest)
print("\nTraining Random Forest for RE")
re_model = RandomForestClassifier(n_estimators=100, random_state=42)
re_model.fit(X_train_re, y_train_re)


Training Random Forest for RE


In [None]:
print("\n--- RE EVALUATION ---")
y_pred_re = re_model.predict(X_test_re)
acc = accuracy_score(y_test_re, y_pred_re)

labels_re = list(re_model.classes_)
if 'NO_RELATION' in labels_re:
    labels_re.remove('NO_RELATION')

print(f"Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print("-" * 30)
print("Relation Extraction Performance:")
print(classification_report(y_test_re, y_pred_re, labels=labels_re))


--- RE EVALUATION ---
-> Overall Accuracy: 0.9760 (97.60%)
------------------------------
Relation Extraction Performance:
              precision    recall  f1-score   support

    HAS_AREA       0.59      0.59      0.59       273
    HAS_ATTR       0.64      0.68      0.66       919
   HAS_PRICE       0.53      0.55      0.54       146
  LOCATED_AT       0.68      0.74      0.71       297

   micro avg       0.63      0.66      0.65      1635
   macro avg       0.61      0.64      0.62      1635
weighted avg       0.63      0.66      0.65      1635



In [71]:
def visualize_re_predictions(test_pairs, y_true, y_pred, num_samples=10):
    print(f"\n{'='*20} RE PREDICTION SAMPLES {'='*20}")
    
    indices = random.sample(range(len(test_pairs)), min(num_samples, len(test_pairs)))
    
    data = []
    for idx in indices:
        pair = test_pairs[idx]
        ent1_text = pair['ent1']['text']
        ent2_text = pair['ent2']['text']
        
        # Format c·ªôt ƒë·∫ßu ti√™n: "S·ªï ƒë·ªè -> Ch√≠nh ch·ªß"
        pair_str = f"{ent1_text} -> {ent2_text}"
        
        data.append({
            'C·∫∑p th·ª±c th·ªÉ (Entity Pair)': pair_str,
            'Th·ª±c t·∫ø (True)': y_true[idx],
            'D·ª± ƒëo√°n (Pred)': y_pred[idx],
            'K·∫øt qu·∫£': '‚úÖ' if y_true[idx] == y_pred[idx] else '‚ùå'
        })
    
    df = pd.DataFrame(data)
    
    try:
        from IPython.display import display
        display(df)
    except:
        print(df)

In [94]:
visualize_re_predictions(test_pairs, y_test_re, y_pred_re)




Unnamed: 0,C·∫∑p th·ª±c th·ªÉ (Entity Pair),Th·ª±c t·∫ø (True),D·ª± ƒëo√°n (Pred),K·∫øt qu·∫£
0,Khu v∆∞·ªùn th·∫ßn ti√™n -> Nh·∫≠n di·ªán khu√¢n m·∫∑t AI,NO_RELATION,NO_RELATION,‚úÖ
1,ƒë·∫•t -> C√°ch cao t·ªëc Th√†nh ph·ªë H·ªì Ch√≠ Minh,HAS_ATTR,HAS_ATTR,‚úÖ
2,"S1411 -> 4, x t·ª∑",NO_RELATION,NO_RELATION,‚úÖ
3,huy·ªán B·∫øn C·∫ßu -> Huy·ªán B·∫øn C·∫ßu,NO_RELATION,NO_RELATION,‚úÖ
4,ƒê√¥n Thu·∫≠n -> ƒê∆∞·ªùng nh·ª±a tr∆∞·ªõc ƒë·∫•t 6m,NO_RELATION,NO_RELATION,‚úÖ
5,cƒÉn h·ªô -> b·ªánh vi·ªán,HAS_ATTR,HAS_ATTR,‚úÖ
6,23 t·ª∑ -> CƒÉn Thi√™n Nga,NO_RELATION,NO_RELATION,‚úÖ
7,100m¬≤ -> Shophouse ƒë·∫°i l·ªô 51m,NO_RELATION,NO_RELATION,‚úÖ
8,Maxhomes -> 98m¬≤,NO_RELATION,NO_RELATION,‚úÖ
9,"S√†n giao d·ªãch b·∫•t ƒë·ªông s·∫£n Maxhomes SH4 -> 4,5m",NO_RELATION,NO_RELATION,‚úÖ
