**Library**

In [27]:
import sys
import os
import yaml
import json
import pandas as pd
import random
import numpy as np
import gc
from sklearn_crfsuite import metrics

In [28]:
sys.path.append(os.path.abspath(os.path.join('..')))
from src.features.hand_crafted import sent2features, sent2labels, sent2tokens, get_relation_features
from src.models.machine_learning import CRFModel, FlatModelWrapper, RelationExtractionModel
from src.data_loader.dataset import convert_label_studio_to_ner_data, prepare_re_data_from_json
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer
from sklearn_crfsuite import metrics
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

Load Config

In [29]:
with open('../configs/ml_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
for model_name in ['svm', 'maxent', 'random_forest']:
    if model_name in config:
        config[model_name]['vectorizer'] = 'phobert'

print("Config updated for PhoBERT vectors!")

Config updated for PhoBERT vectors!


Load Vector

In [30]:
VECTOR_DIR = '../data/vectors/' 
RAW_DATA_DIR = '../data/04_model_input/'

print(f"Loading Data from {VECTOR_DIR}...")

# --- 1. LOAD VECTOR FILES ---
# NER Data
X_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'X_train_ner_phobert.npy'))
y_train_ner_3d = np.load(os.path.join(VECTOR_DIR, 'y_train_ner_phobert.npy'))
X_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'X_test_ner_phobert.npy'))
y_test_ner_3d  = np.load(os.path.join(VECTOR_DIR, 'y_test_ner_phobert.npy'))

# RE Data
X_train_re = np.load(os.path.join(VECTOR_DIR, 'X_train_re_phobert.npy'))
y_train_re = np.load(os.path.join(VECTOR_DIR, 'y_train_re_phobert.npy'))
X_test_re  = np.load(os.path.join(VECTOR_DIR, 'X_test_re_phobert.npy'))
y_test_re  = np.load(os.path.join(VECTOR_DIR, 'y_test_re_phobert.npy'))

print("Vectors Loaded Successfully!")

print("Re-creating Label Mappings from Raw Data...")

with open(os.path.join(RAW_DATA_DIR, 'train_dataset.json'), 'r', encoding='utf-8') as f:
    train_json = json.load(f)

ner_data_raw = convert_label_studio_to_ner_data(train_json)
ner_labels_set = set()
for sent in ner_data_raw:
    for token, label in sent:
        ner_labels_set.add(label)


ner_labels = sorted(list(ner_labels_set))
ner_id2label = {i: label for i, label in enumerate(ner_labels)}
print(f"NER Mapping ({len(ner_id2label)} tags): {ner_id2label}")

re_data_raw = prepare_re_data_from_json(train_json)

re_labels_set = set(item['label'] for item in re_data_raw)
re_labels = sorted(list(re_labels_set))
re_id2label = {i: label for i, label in enumerate(re_labels)}

print(f"RE Mapping ({len(re_id2label)} classes): {re_id2label}")

Loading Data from ../data/vectors/...
Vectors Loaded Successfully!
Re-creating Label Mappings from Raw Data...


Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 797/797 [00:01<00:00, 726.68it/s]


NER Mapping (17 tags): {0: 'B-AREA', 1: 'B-ATTR', 2: 'B-LOC', 3: 'B-O', 4: 'B-ORG', 5: 'B-PER', 6: 'B-PRICE', 7: 'B-TYPE', 8: 'I-AREA', 9: 'I-ATTR', 10: 'I-LOC', 11: 'I-O', 12: 'I-ORG', 13: 'I-PER', 14: 'I-PRICE', 15: 'I-TYPE', 16: 'O'}
RE Mapping (5 classes): {0: 'HAS_AREA', 1: 'HAS_ATTR', 2: 'HAS_PRICE', 3: 'LOCATED_AT', 4: 'NO_RELATION'}


Load Data

In [31]:
print("Loading data files...")
TRAIN_PATH = '../data/04_model_input/train_dataset.json'
TEST_PATH = '../data/04_model_input/test_dataset.json'
DEV_PATH = '../data/04_model_input/dev_dataset.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_json = json.load(f)

with open(TEST_PATH, 'r', encoding='utf-8') as f:
    test_json = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_json = json.load(f)

print(f"-> Train raw docs: {len(train_json)}")
print(f"-> Test raw docs: {len(test_json)}")
print(f"-> Dev raw docs: {len(dev_json)}")

Loading data files...
-> Train raw docs: 797
-> Test raw docs: 114
-> Dev raw docs: 228


In [32]:
print("\nConverting to BIO format...")
train_sents = convert_label_studio_to_ner_data(train_json)
test_sents = convert_label_studio_to_ner_data(test_json)
dev_sents = convert_label_studio_to_ner_data(dev_json)


Converting to BIO format...


Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 797/797 [00:01<00:00, 711.12it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 114/114 [00:00<00:00, 772.99it/s]
Converting with Pyvi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 228/228 [00:00<00:00, 707.57it/s]


In [33]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
all_labels = sorted(list(set([label for sent in train_sents for token, label in sent])))
id2label = {i: label for i, label in enumerate(all_labels)}
print("Label Map:", id2label)

Label Map: {0: 'B-AREA', 1: 'B-ATTR', 2: 'B-LOC', 3: 'B-O', 4: 'B-ORG', 5: 'B-PER', 6: 'B-PRICE', 7: 'B-TYPE', 8: 'I-AREA', 9: 'I-ATTR', 10: 'I-LOC', 11: 'I-O', 12: 'I-ORG', 13: 'I-PER', 14: 'I-PRICE', 15: 'I-TYPE', 16: 'O'}


In [34]:
def decode_tags(y_ids_list, id2label):
    decoded_list = []
    for sent in y_ids_list:
        sent_decoded = [id2label[int(i)] for i in sent]
        decoded_list.append(sent_decoded)
    return decoded_list

RE

In [35]:
print("Creating pairs from Training Data...")
train_pairs = prepare_re_data_from_json(train_json)
test_pairs = prepare_re_data_from_json(test_json)

print(f"-> Total Train Pairs: {len(train_pairs)}")
print(f"-> Total Test Pairs: {len(test_pairs)}")

Creating pairs from Training Data...
-> Total Train Pairs: 277036
-> Total Test Pairs: 47950


In [None]:
TARGET_MINORITY = 10000
TARGET_MAJORITY = 35000
print(f"Target Configuration:")
print(f"   - Minority Classes (Real Relations): {TARGET_MINORITY} samples each")
print(f"   - Majority Class (NO_RELATION):      {TARGET_MAJORITY} samples")
print(f"   - Estimated Total Training Data:     ~{(4 * TARGET_MINORITY) + TARGET_MAJORITY} samples")

üéØ Target Configuration:
   - Minority Classes (Real Relations): 10000 samples each
   - Majority Class (NO_RELATION):      35000 samples
   - Estimated Total Training Data:     ~75000 samples


In [37]:
train_labels_list = y_train_re.tolist()
no_rel_id = max(set(train_labels_list), key=train_labels_list.count)
print(f"   -> NO_RELATION ID: {no_rel_id}")

   -> NO_RELATION ID: 4


In [38]:
rus_strategy = {no_rel_id: TARGET_MAJORITY} 
rus = RandomUnderSampler(sampling_strategy=rus_strategy, random_state=42)
X_under, y_under = rus.fit_resample(X_train_re, y_train_re)

In [39]:
scaler = StandardScaler()
X_under_scaled = scaler.fit_transform(X_under.astype('float32'))
X_test_scaled = scaler.transform(X_test_re.astype('float32'))

In [40]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_re.astype('float32')) # Chuy·ªÉn float32 cho nh·∫π
# X_test_scaled = scaler.transform(X_test_re.astype('float32'))

In [41]:
# pca = PCA(n_components=0.95, random_state=42)
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)

In [42]:
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_under_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [43]:
# print(f"   -> Original shape: {X_train_re.shape}")
# print(f"   -> Reduced shape:  {X_train_pca.shape} (Dimensions: {pca.n_components_})")

In [44]:
print(f"   -> After PCA: {X_pca.shape}")

   -> After PCA: (45171, 221)


In [45]:
# del X_train_scaled, X_test_scaled
# gc.collect()
del X_under, X_under_scaled, X_train_re
gc.collect()

0

In [46]:
# print("\nStep 2: Applying SMOTE on reduced data...")
# smote = SMOTE(random_state=42, k_neighbors=5)
# X_resampled, y_resampled = smote.fit_resample(X_train_pca, y_train_re)
# print(f"Resampled shape: {X_resampled.shape}")

In [47]:
unique_classes = set(y_under)
smote_strategy = {
    cls: TARGET_MINORITY for cls in unique_classes if cls != no_rel_id
}
smote = SMOTE(sampling_strategy=smote_strategy, random_state=42, k_neighbors=5)
X_resampled, y_resampled = smote.fit_resample(X_pca, y_under)

print(f"   -> Final Training Data: {X_resampled.shape}")
print(f"   (M·ªói l·ªõp ƒë·ªÅu c√≥ {TARGET_MINORITY} m·∫´u. T·ªïng c·ªông ~{len(set(y_train_re)) * TARGET_MINORITY} m·∫´u)")

   -> Final Training Data: (75000, 221)
   (M·ªói l·ªõp ƒë·ªÅu c√≥ 10000 m·∫´u. T·ªïng c·ªông ~50000 m·∫´u)


In [48]:
# models_optimized = {
#     # SVM
#     'SVM': SVC(kernel='rbf', C=10.0, probability=True, random_state=42),
    
#     # MaxEnt
#     'MaxEnt': LogisticRegression(solver='lbfgs', max_iter=1000, C=10.0, random_state=42),
    
#     # RandomForest
#     'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42)
# }

In [49]:
models_hybrid = {
    'SVM': SVC(kernel='rbf', C=10.0, probability=True, random_state=42),
    
    'MaxEnt': LogisticRegression(solver='lbfgs', max_iter=1000, C=10.0, random_state=42),
    
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42)
}

In [50]:
print(f"{'='*30} START TRAINING {'='*30}")

trained_re_models = {}

for name, model in models_hybrid.items():
    print(f"\nTraining {name}...")
    
    model.fit(X_resampled, y_resampled)
    trained_re_models[name] = model
    
print("\nAll models trained")


Training SVM...

Training MaxEnt...

Training RandomForest...

All models trained


In [51]:
print(f"{'='*30} EVALUATION REPORT {'='*30}")

re_predictions = {}
X_test_scaled = scaler.transform(X_test_re.astype('float32'))
X_test_pca = pca.transform(X_test_scaled)
for name, model in trained_re_models.items():
    print(f"\n Evaluating {name}...")
    
    y_pred = model.predict(X_test_pca)
    re_predictions[name] = y_pred
    
    if 'no_rel_id' not in globals():
        train_labels_list = y_train_re.tolist()
        no_rel_id = max(set(train_labels_list), key=train_labels_list.count)
    
    acc = accuracy_score(y_test_re, y_pred)
    print(f"-> Overall Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    labels_to_report = [i for i in set(y_test_re) if i != no_rel_id]
    target_names = [re_id2label[i] for i in labels_to_report]
    
    print(f"--- {name} ---")
    print(classification_report(
        y_test_re, y_pred, 
        labels=labels_to_report, 
        target_names=target_names, 
        digits=4,
        zero_division=0
    ))
    
    f1 = f1_score(y_test_re, y_pred, labels=labels_to_report, average='macro', zero_division=0)
    print(f"Macro F1: {f1:.4f}")


 Evaluating SVM...
-> Overall Accuracy: 0.6349 (63.49%)
--- SVM ---
              precision    recall  f1-score   support

    HAS_AREA     0.0149    0.4579    0.0289       273
    HAS_ATTR     0.1049    0.3700    0.1634       919
   HAS_PRICE     0.0173    0.4726    0.0335       146
  LOCATED_AT     0.0809    0.5320    0.1405       297

   micro avg     0.0394    0.4232    0.0721      1635
   macro avg     0.0545    0.4581    0.0916      1635
weighted avg     0.0777    0.4232    0.1252      1635

Macro F1: 0.0916

 Evaluating MaxEnt...
-> Overall Accuracy: 0.6299 (62.99%)
--- MaxEnt ---
              precision    recall  f1-score   support

    HAS_AREA     0.0331    0.4322    0.0615       273
    HAS_ATTR     0.0956    0.3232    0.1476       919
   HAS_PRICE     0.0131    0.6849    0.0257       146
  LOCATED_AT     0.0458    0.5488    0.0846       297

   micro avg     0.0380    0.4147    0.0695      1635
   macro avg     0.0469    0.4973    0.0798      1635
weighted avg     0.0688 

In [62]:
if 'RandomForest' in re_predictions:
    print("\n=== RANDOM FOREST DETAILED REPORT ===")
    
    # L·∫•y l·∫°i y_pred c·ªßa RF
    y_pred_rf = re_predictions['RandomForest']
    labels_to_report = [i for i in set(y_test_re) if i != no_rel_id]
    target_names = [re_id2label[i] for i in labels_to_report]
    
    print(classification_report(
        y_test_re, y_pred_rf, 
        labels=labels_to_report, 
        target_names=target_names, 
        digits=4, 
        zero_division=0
    ))
    f1 = f1_score(y_test_re, y_pred_rf, labels=labels_to_report, average='macro', zero_division=0)
    print(f"Macro F1: {f1:.4f}")


=== RANDOM FOREST DETAILED REPORT ===
              precision    recall  f1-score   support

    HAS_AREA     0.4000    0.0147    0.0283       273
    HAS_ATTR     0.4444    0.0087    0.0171       919
   HAS_PRICE     0.2000    0.0548    0.0860       146
  LOCATED_AT     0.3889    0.0471    0.0841       297

   micro avg     0.3269    0.0208    0.0391      1635
   macro avg     0.3583    0.0313    0.0539      1635
weighted avg     0.4051    0.0208    0.0373      1635

Macro F1: 0.0539


In [53]:
def visualize_re_results(test_pairs, y_true_ids, preds_dict, id2label, num_samples=10):
    print(f"\n{'='*30} RE MODEL COMPARISON SAMPLES {'='*30}")
    
    interesting_indices = [i for i, label_id in enumerate(y_true_ids) if label_id != no_rel_id]
    
    if len(interesting_indices) < num_samples:
        indices = random.sample(range(len(test_pairs)), num_samples)
    else:
        indices = random.sample(interesting_indices, num_samples)
    
    data = []
    for idx in indices:
        pair = test_pairs[idx]
        ent1 = pair['ent1']['text']
        ent2 = pair['ent2']['text']
        
        true_label = id2label[y_true_ids[idx]]
        
        row = {
            'C·∫∑p th·ª±c th·ªÉ': f"{ent1} ‚û° {ent2}",
            'Th·ª±c t·∫ø': true_label
        }
        
        for model_name, y_pred in preds_dict.items():
            pred_val = y_pred[idx]
            pred_label = id2label[pred_val]
            status = "‚úÖ" if pred_val == y_true_ids[idx] else "‚ùå"
            row[model_name] = f"{pred_label} {status}"
            
        data.append(row)
    
    df = pd.DataFrame(data)
    
    display(df)

visualize_re_results(test_pairs, y_test_re, re_predictions, re_id2label, num_samples=15)




Unnamed: 0,C·∫∑p th·ª±c th·ªÉ,Th·ª±c t·∫ø,SVM,MaxEnt,RandomForest
0,"ƒë·∫•t ‚û° 1,5 t·ª∑ VND",HAS_PRICE,HAS_PRICE ‚úÖ,HAS_PRICE ‚úÖ,NO_RELATION ‚ùå
1,bi·ªát th·ª± ‚û° Tr·∫ßn,LOCATED_AT,HAS_ATTR ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå
2,Nh√† ‚û° giao th√¥ng thu·∫≠n ti·ªán,HAS_ATTR,NO_RELATION ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå
3,ƒë·∫•t ‚û° Kh√°nh H√≤a,LOCATED_AT,LOCATED_AT ‚úÖ,LOCATED_AT ‚úÖ,NO_RELATION ‚ùå
4,ƒë·∫•t ‚û° s·ªï h·ªìng,HAS_ATTR,NO_RELATION ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå
5,ƒê·∫•t ‚û° x√£ Kim Long,LOCATED_AT,LOCATED_AT ‚úÖ,LOCATED_AT ‚úÖ,NO_RELATION ‚ùå
6,ƒê·∫•t n·ªÅn ‚û° T√¢y Ninh,LOCATED_AT,NO_RELATION ‚ùå,LOCATED_AT ‚úÖ,NO_RELATION ‚ùå
7,n·ªÅn ‚û° tr·ª•c ƒê∆∞·ªùng T·ªânh 818,LOCATED_AT,LOCATED_AT ‚úÖ,HAS_ATTR ‚ùå,NO_RELATION ‚ùå
8,studio ‚û° 1t·ªâ850 1t·ªâ970,HAS_PRICE,NO_RELATION ‚ùå,HAS_PRICE ‚úÖ,NO_RELATION ‚ùå
9,Nh√† ‚û° 2pn,HAS_ATTR,HAS_AREA ‚ùå,NO_RELATION ‚ùå,NO_RELATION ‚ùå


In [54]:
# import joblib
# import json
# import os

# MODEL_DIR = '../models/'
# METADATA_DIR = '../models/metadata/'

# os.makedirs(os.path.join(MODEL_DIR, 'ner'), exist_ok=True)
# os.makedirs(os.path.join(MODEL_DIR, 're'), exist_ok=True)
# os.makedirs(METADATA_DIR, exist_ok=True)

# print(f"Saving artifacts to {MODEL_DIR}...")

# with open(os.path.join(METADATA_DIR, 'ner_id2label.json'), 'w', encoding='utf-8') as f:
#     json.dump(ner_id2label, f, ensure_ascii=False, indent=2)

# with open(os.path.join(METADATA_DIR, 're_id2label.json'), 'w', encoding='utf-8') as f:
#     json.dump(re_id2label, f, ensure_ascii=False, indent=2)

# print("Metadata (JSON) saved.")

# if 'trained_re_models' in globals():
#     for name, model in trained_re_models.items():
#         save_path = os.path.join(MODEL_DIR, 're', f're_{name.lower()}.joblib')
#         joblib.dump(model, save_path)
#         print(f"  -> Saved RE Model: {save_path}")
# else:
#     print("Warning: 'trained_re_models' not found. Did you run Block 6?")

# print("\nSaving NER Models (Retraining & Saving)...")
# ner_models_to_save = ['svm', 'maxent', 'random_forest']

# for model_name in ner_models_to_save:
#     model = FlatModelWrapper(model_name, config.get(model_name, {}))
#     model.train(X_resampled, y_resampled)
    
#     save_path = os.path.join(MODEL_DIR, 'ner', f'ner_{model_name.lower()}.joblib')
#     model.save(save_path)
#     print(f"  -> Saved NER Model: {save_path}")

# print(f"\nALL DONE! Models are ready in '{MODEL_DIR}'")

In [55]:
# import joblib
# import json

# # 1. Load Mapping
# with open('../models/metadata/re_id2label.json', 'r') as f:
#     re_id2label = json.load(f)
# re_id2label = {int(k): v for k, v in re_id2label.items()}

# # 2. Load Model
# loaded_svm_re = joblib.load('../models/re/re_svm.joblib')

# # 3. D·ª± ƒëo√°n (Input ph·∫£i l√† Vector PhoBERT 768 chi·ªÅu)
# # y_pred = loaded_svm_re.predict(X_new_vector)
# # label_name = re_id2label[y_pred[0]]