### PhoBERT

In [1]:
# Cell 1: Import & cấu hình thiết bị (GPU/FP16)

import os, sys, gc, math, pathlib, shutil
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModel

# Report interpreter / framework versions and whether a CUDA GPU is visible.
print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)
use_fp16 = torch.cuda.is_available()
print("CUDA available:", use_fp16)
if use_fp16:
    print("GPU:", torch.cuda.get_device_name(0))

# Device and dtype: half precision is selected only when a GPU is present.
if use_fp16:
    device, dtype = torch.device("cuda"), torch.float16
else:
    device, dtype = torch.device("cpu"), torch.float32
print("Dtype dùng:", dtype)

# Output directory, resolved against the current working directory.
OUT_DIR = "outputs_phobert"
os.makedirs(OUT_DIR, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


Python: 3.12.2
PyTorch: 2.8.0+cu128
CUDA available: True
GPU: NVIDIA L4
Dtype dùng: torch.float16


In [2]:
import pandas as pd

CSV_PATH = "/home/thinhbq/huy/social_networking/pho_bert/description_phobert(1).csv"

df = pd.read_csv(CSV_PATH)

print("Cột:", df.columns.tolist())
print("Tổng dòng gốc:", len(df))

# Count missing / empty descriptions on the RAW column. These counts must be
# taken before fillna(""): once NaN has been replaced, isna() is always 0.
n_na    = int(df["description"].isna().sum())
n_empty = int(df["description"].eq("").sum())

# Normalise both columns to plain strings (missing category -> "unknown",
# missing description -> "").
df["category_name"] = df["category_name"].fillna("unknown").astype(str)
df["description"]   = df["description"].fillna("").astype(str)

# Rows whose description is blank after stripping. This mask also covers the
# NaN / empty rows counted above, and is exactly the set of rows dropped below.
is_blank  = df["description"].str.strip().eq("")
n_ws_only = int(is_blank.sum())

print(f"NaN: {n_na} | empty '': {n_empty} | whitespace-only: {n_ws_only}")

# Duplicates are counted on the description text only; the first occurrence
# of each text is not counted.
n_dups_desc = df.duplicated(subset=["description"]).sum()
print(f"Số dòng trùng mô tả (không tính dòng đầu tiên): {n_dups_desc}")

ex_ws = df[is_blank].head(5)
if len(ex_ws):
    print("\nVí dụ mô tả trắng/rỗng:")
    display(ex_ws)

# Keep only rows with a non-blank description; reindex from 0.
df_keep = df[~is_blank].reset_index(drop=True)
print("\nSố dòng sau khi loại trắng:", len(df_keep))

Cột: ['category_name', 'description']
Tổng dòng gốc: 36644
NaN: 0 | empty '': 0 | whitespace-only: 0
Số dòng trùng mô tả (không tính dòng đầu tiên): 3429

Số dòng sau khi loại trắng: 36644


In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "vinai/phobert-large"

# GPU + half precision when CUDA is present; otherwise CPU + float32.
if torch.cuda.is_available():
    device, dtype = torch.device("cuda"), torch.float16
else:
    device, dtype = torch.device("cpu"), torch.float32
print("device:", device, "| dtype:", dtype)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load the encoder weights, then move and cast them in a single .to() call.
model = AutoModel.from_pretrained(MODEL_NAME).to(device=device, dtype=dtype)
# Inference mode for the module (dropout off); as the last expression of the
# cell this also displays the model repr.
model.eval()


device: cuda | dtype: torch.float16


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 1024, padding_idx=1)
    (position_embeddings): Embedding(258, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

### word-segmentation

In [4]:
import os
import py_vncorenlp

VNCORE_DIR = "/home/thinhbq/huy/social_networking/pho_bert/vncorenlp"
os.makedirs(VNCORE_DIR, exist_ok=True)
# One-time model download, kept disabled:
# py_vncorenlp.download_model(save_dir=VNCORE_DIR)

# Build the segmenter once per kernel; re-running this cell reuses the
# instance that already lives in the notebook namespace.
if "rdrsegmenter" in globals():
    print("Reusing existing rdrsegmenter (already loaded).")
else:
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VNCORE_DIR)
    print("VnCoreNLP loaded.")

def vn_word_segment(text: str) -> str:
    """Word-segment one Vietnamese text with the global ``rdrsegmenter``.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or a float NaN coming
            from a DataFrame) and blank strings are treated as "no text".

    Returns:
        The pieces returned by ``rdrsegmenter.word_segment`` joined with single
        spaces (presumably one piece per sentence — TODO confirm), or ``""``
        when there is nothing to segment.
    """
    # Guard before .strip(): a missing value is not a str and would raise
    # AttributeError; the segmenter is never called for empty input.
    if not isinstance(text, str) or not text.strip():
        return ""
    sents = rdrsegmenter.word_segment(text)
    return " ".join(sents)

# Segment every description and store the result next to the raw text.
df_keep["description_wseg"] = df_keep["description"].map(vn_word_segment).tolist()

print("Ví dụ sau khi word-seg:")
print(df_keep.loc[:, ["description", "description_wseg"]].head(3))

# Relative path: the file is written to the current working directory.
df_keep.to_parquet("description_phobert_wseg.parquet", index=False)

2025-10-11 17:55:12 INFO  WordSegmenter:24 - Loading Word Segmentation model
VnCoreNLP loaded.
Ví dụ sau khi word-seg:
                                         description  \
0  loa loa sản phẩm nguyên sản phẩm nguyên cam kế...   
1  đặc điểm nổi bật đặc điểm nổi bật công nghệ ch...   
2  thông số kỹ thuật loa 3 loa di động thương hiệ...   

                                    description_wseg  
0  loa loa sản_phẩm nguyên sản_phẩm nguyên cam_kế...  
1  đặc_điểm nổi_bật đặc_điểm nổi_bật công_nghệ ch...  
2  thông_số kỹ_thuật loa 3 loa di_động thương_hiệ...  


### Embedding

In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from contextlib import nullcontext

# Tokenizer sequence length and encoder batch size.
MAX_LEN = 256
BATCH_SIZE = 64

# Word-segmented corpus written by the segmentation cell (relative path).
DATA_PATH = "description_phobert_wseg.parquet"
df_wseg = pd.read_parquet(DATA_PATH)
texts = list(df_wseg["description_wseg"])

class TextDataset(Dataset):
    """Map-style dataset exposing a sequence of strings by integer index."""

    def __init__(self, texts):
        # Keep a reference to the caller's sequence; no copy is made.
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

dataset = TextDataset(texts)

def collate_batch(batch_texts):
    """Tokenize a list of strings into PyTorch tensors of width MAX_LEN.

    Uses the global ``tokenizer``; every sequence is truncated to and padded
    up to exactly MAX_LEN tokens.
    """
    tokenizer_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    return tokenizer(batch_texts, **tokenizer_kwargs)

# Worker processes and pinned host memory are only enabled for a CUDA device.
_on_gpu = device.type == "cuda"
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_batch,
    num_workers=2 if _on_gpu else 0,
    pin_memory=_on_gpu,
)

def mean_pooling(model_output, attention_mask):
    """Average the last-layer token vectors over the positions kept by the mask.

    Args:
        model_output: Object exposing ``last_hidden_state``.
        attention_mask: Mask broadcast over the hidden dimension; positions
            with 0 contribute nothing to the average.

    Returns:
        Masked sum over dim 1 divided by the mask sum over dim 1. The divisor
        is clamped to at least 1e-9 so an all-zero mask yields zeros, not NaN.
    """
    hidden = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand_as(hidden).float()
    summed = torch.sum(hidden * weights, dim=1)
    counts = torch.clamp(weights.sum(dim=1), min=1e-9)
    return summed / counts

import os  # local import: this cell does not import `os` itself

# Output artifacts. The folder is created up front so a missing directory is
# reported before the long encoding pass instead of after it.
_emb_out  = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/phobert_embeddings.npy"
_meta_out = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/metadata_phobert.parquet"
os.makedirs(os.path.dirname(_emb_out), exist_ok=True)

all_embeddings = []
model.eval()

# Autocast is only entered on CUDA; on CPU a no-op context is used.
amp_ctx = torch.autocast(device_type="cuda", dtype=dtype) if device.type=="cuda" else nullcontext()

with torch.inference_mode():
    for batch in tqdm(loader, desc="Encoding"):
        # non_blocking pairs with the loader's pin_memory setting.
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
        with amp_ctx:
            outputs = model(**batch)
            emb = mean_pooling(outputs, batch["attention_mask"])
            # L2-normalise each row so every sentence vector has unit length.
            emb = torch.nn.functional.normalize(emb, p=2, dim=1)
        all_embeddings.append(emb.cpu().numpy())

# Stack per-batch arrays; row i corresponds to row i of df_wseg because the
# loader is built without shuffling.
embeddings = np.vstack(all_embeddings)
assert embeddings.shape[0] == len(df_wseg), "Số embedding không khớp số dòng metadata!"

np.save(_emb_out, embeddings)
df_wseg.to_parquet(_meta_out, index=False)

Encoding: 100%|██████████| 573/573 [04:55<00:00,  1.94it/s]


In [9]:
import numpy as np, pandas as pd
from pathlib import Path

# Absolute artifact locations; this rebinds OUT_DIR for the rest of the notebook.
OUT_DIR   = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert"
EMB_PATH  = OUT_DIR + "/phobert_embeddings.npy"
META_PATH = OUT_DIR + "/metadata_phobert.parquet"

# Fail early, naming the missing file, if either artifact is absent.
for _required in (EMB_PATH, META_PATH):
    assert Path(_required).exists(), _required

embeddings = np.load(EMB_PATH)
meta       = pd.read_parquet(META_PATH)
# One embedding row per metadata row.
assert len(meta) == embeddings.shape[0]

print("Embeddings:", embeddings.shape)
print("Meta:", meta.shape)

Embeddings: (36644, 1024)
Meta: (36644, 3)


### KMeans

#### Grid search tìm K

In [10]:
import numpy as np, pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

EMB_PATH = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/phobert_embeddings.npy"
Ks = [4, 8, 12, 16, 20, 24, 28, 30, 40, 50]

# Load the saved embeddings and L2-normalise each row.
X  = np.load(EMB_PATH)
Xn = normalize(X, norm="l2", axis=1)

# One KMeans fit per candidate K, scored by cosine silhouette.
rows = []
for K in Ks:
    _km = KMeans(n_clusters=K, n_init="auto", random_state=42)
    labels = _km.fit_predict(Xn)
    sil = float(silhouette_score(Xn, labels, metric="cosine"))
    rows.append((K, sil))

# Rank candidates best-first; the top row defines best_K.
df_k = pd.DataFrame(rows, columns=["K","silhouette_cosine"])
df_k = df_k.sort_values(by="silhouette_cosine", ascending=False)
best_K = int(df_k["K"].iloc[0])
print(df_k.to_string(index=False))
print("best_K:", best_K)

 K  silhouette_cosine
12           0.204663
16           0.194674
20           0.192583
 4           0.190894
 8           0.177111
30           0.175496
28           0.175351
24           0.173827
40           0.166742
50           0.160448
best_K: 12


#### Áp dụng KMeans với K tìm được

In [12]:
from sklearn.cluster import KMeans
import numpy as np

# Final KMeans fit with the K selected by the grid search.
km = KMeans(n_clusters=best_K, n_init="auto", random_state=42)
km.fit(Xn)
labels = km.labels_

# Cluster sizes.
uniq, cnt = np.unique(labels, return_counts=True)
print("best_K:", best_K, "| n_clusters:", len(uniq))
for _i in range(len(uniq)):
    k, c = uniq[_i], cnt[_i]
    print(f"cluster {k}: {c}")

best_K: 12 | n_clusters: 12
cluster 0: 1757
cluster 1: 1418
cluster 2: 2986
cluster 3: 2504
cluster 4: 3488
cluster 5: 2251
cluster 6: 4104
cluster 7: 3327
cluster 8: 4298
cluster 9: 1842
cluster 10: 2939
cluster 11: 5730


#### đánh giá trong trường hợp không có nhãn

In [None]:
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import os

EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Label-free quality metrics for the current global `labels` on `Xn`.
# Only the silhouette is given metric="cosine"; DBI and CH use their defaults.
sil = float(silhouette_score(Xn, labels, metric="cosine"))
dbi = float(davies_bouldin_score(Xn, labels))
ch  = float(calinski_harabasz_score(Xn, labels))

print(f"[Unsupervised] Silhouette(cosine)={sil:.4f} | DBI={dbi:.4f} | CH={ch:.2f}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "KMeans",
    "metric": "cosine",
    "K": best_K,
    "type": "unsupervised",
    "silhouette_cosine": sil,
    "DBI": dbi,
    "CH": ch,
    "NMI": float("nan"),
    "ARI": float("nan"),
    "Purity": float("nan")
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)

[Unsupervised] Silhouette(cosine)=0.2047 | DBI=2.2994 | CH=2418.46


#### đánh giá trong trường hợp có nhãn

In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
import os

META_PATH = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/metadata_phobert.parquet"
EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Ground truth: category names encoded as integer class ids.
df_meta = pd.read_parquet(META_PATH)
y_true = LabelEncoder().fit_transform(df_meta["category_name"].astype(str).values)

def purity_score(y_true, y_pred):
    """Return clustering purity.

    For every predicted cluster (a column of the contingency matrix built from
    ``(y_true, y_pred)``) take the count of its most frequent true class, sum
    those counts, and divide by the total number of samples.
    """
    cm = contingency_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()

# External metrics for the current global `labels` against the true categories.
nmi = float(normalized_mutual_info_score(y_true, labels, average_method="arithmetic"))
ari = float(adjusted_rand_score(y_true, labels))
pur = float(purity_score(y_true, labels))

print(f"[Supervised] NMI={nmi:.4f} | ARI={ari:.4f} | Purity={pur:.4f}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "KMeans",
    "metric": "cosine",
    "K": best_K,
    "type": "supervised",
    "silhouette_cosine": float("nan"),
    "DBI": float("nan"),
    "CH": float("nan"),
    "NMI": nmi,
    "ARI": ari,
    "Purity": pur
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)

[Supervised] NMI=0.5477 | ARI=0.3781 | Purity=0.5329


  df_out = pd.concat([old, record], ignore_index=True)


### DBSCAN

#### Grid search

In [5]:
import numpy as np, pandas as pd
from sklearn.preprocessing import normalize
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

EMB_PATH = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/phobert_embeddings.npy"

# Load the saved embeddings and L2-normalise each row.
X  = np.load(EMB_PATH)
Xn = normalize(X, norm="l2", axis=1)

# Coarse search grid: eps candidates and min_samples candidates.
MS_LIST = [5, 8, 10, 12, 15]
EPS_COARSE = [0.003, 0.005, 0.007, 0.009, 0.011, 0.013]

def eval_dbscan(eps_list, ms_list):
    """Run cosine DBSCAN on the global ``Xn`` for every (eps, min_samples) pair.

    Args:
        eps_list: Iterable of eps values.
        ms_list: Iterable of min_samples values.

    Returns:
        DataFrame with one row per pair (eps-major order) holding the number of
        clusters, the noise fraction, and silhouette / DBI / CH computed on the
        non-noise points only. The three scores are NaN when fewer than two
        clusters are found.
    """
    n_total = Xn.shape[0]

    def _score(eps, ms):
        # Fit one configuration and summarise it as a result row.
        assigned = DBSCAN(eps=eps, min_samples=ms, metric="cosine", n_jobs=-1).fit_predict(Xn)
        keep = assigned != -1          # -1 marks noise points
        n_keep = int(keep.sum())
        n_noise = int((~keep).sum())
        n_clu = len(np.unique(assigned[keep])) if n_keep > 0 else 0

        sil = dbi = ch = np.nan
        if n_clu >= 2:
            sil = float(silhouette_score(Xn[keep], assigned[keep], metric="cosine"))
            dbi = float(davies_bouldin_score(Xn[keep], assigned[keep]))
            ch  = float(calinski_harabasz_score(Xn[keep], assigned[keep]))

        return {
            "eps": float(eps), "min_samples": int(ms),
            "n_clusters": int(n_clu),
            "noise_frac": n_noise / n_total,
            "silhouette_cosine": sil, "DBI": dbi, "CH": ch
        }

    return pd.DataFrame([_score(eps, ms) for eps in eps_list for ms in ms_list])

def _rank_configs(results):
    """Return a copy of `results` ordered best-first.

    Order: silhouette (desc), DBI (asc, NaN treated as +inf), CH (desc),
    noise fraction (asc). Adds the helper column ``_DBI_sort``.
    """
    ranked = results.copy()
    ranked["_DBI_sort"] = ranked["DBI"].fillna(np.inf)
    return ranked.sort_values(
        by=["silhouette_cosine","_DBI_sort","CH","noise_frac"],
        ascending=[False, True, False, True]
    )

# Pass 1: coarse eps grid.
df1 = eval_dbscan(EPS_COARSE, MS_LIST)
_df1 = _rank_configs(df1)
top1 = _df1.iloc[0]
best_eps_coarse = float(top1["eps"])

# Pass 2: 0.001-step grid within +/- 0.002 of the coarse winner (floor 0.001).
ref_lo = max(0.001, best_eps_coarse - 0.002)
ref_hi = best_eps_coarse + 0.002
EPS_FINE = np.round(np.arange(ref_lo, ref_hi + 1e-9, 0.001), 3)

df2 = eval_dbscan(EPS_FINE, MS_LIST)
# Merge both passes, keeping the first result for any repeated (eps, min_samples).
df_all = pd.concat([df1, df2], ignore_index=True).drop_duplicates(subset=["eps","min_samples"])

_df = _rank_configs(df_all)

best = _df.iloc[0]
best_eps = float(best["eps"])
best_ms  = int(best["min_samples"])

print(_df.drop(columns=["_DBI_sort"]).head(15).to_string(index=False))
print(f"\n Chọn: eps={best_eps}, min_samples={best_ms}")

  eps  min_samples  n_clusters  noise_frac  silhouette_cosine      DBI        CH
0.007           15           2    0.020358           0.555812 0.837081 48.361832
0.007           12           2    0.019348           0.555514 0.837299 48.325653
0.007           10           2    0.018912           0.555455 0.837356 48.317505
0.007            8           2    0.018339           0.555242 0.837513 48.293240
0.013            5           3    0.000655           0.526894 1.023394 38.960388
0.013            8           2    0.000791           0.526874 1.194343 55.853313
0.013           10           2    0.000791           0.526874 1.194343 55.853313
0.013           12           2    0.000928           0.514477 1.179943 50.203739
0.013           15           2    0.000928           0.514477 1.179943 50.203739
0.009           10           2    0.004557           0.470796 1.123968 33.887119
0.009            8           2    0.004475           0.470770 1.123992 33.884640
0.011            8          

In [4]:
# Quick eps sweep at min_samples=10: cluster count and noise share per eps.
# Note: this rebinds the global `labels` on every iteration.
for eps in np.arange(0.005, 0.05, 0.005):
    labels = DBSCAN(eps=eps, min_samples=10, metric='cosine', n_jobs=-1).fit_predict(Xn)
    _is_noise = labels == -1
    n_clusters = np.unique(labels[~_is_noise]).size   # noise label excluded
    noise_frac = _is_noise.mean()
    print(f"eps={eps:.3f} → n_clusters={n_clusters}, noise={noise_frac:.1%}")


eps=0.005 → n_clusters=17, noise=11.2%
eps=0.010 → n_clusters=2, noise=0.3%
eps=0.015 → n_clusters=1, noise=0.1%
eps=0.020 → n_clusters=1, noise=0.0%
eps=0.025 → n_clusters=1, noise=0.0%
eps=0.030 → n_clusters=1, noise=0.0%
eps=0.035 → n_clusters=1, noise=0.0%
eps=0.040 → n_clusters=1, noise=0.0%
eps=0.045 → n_clusters=1, noise=0.0%


#### Áp dụng với DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
import numpy as np


# Final DBSCAN fit with the parameters chosen by the grid search.
db = DBSCAN(eps=best_eps, min_samples=best_ms, metric="cosine", n_jobs=-1)
db.fit(Xn)
labels = db.labels_

# Noise points carry label -1 and are left out of the per-cluster counts.
noise = int(np.count_nonzero(labels == -1))
uniq, cnt = np.unique(labels[labels != -1], return_counts=True)
print(f"eps={best_eps}, min_samples={best_ms}")
print(f"clusters={len(uniq)} | noise={noise} / {len(labels)} ({noise/len(labels):.2%})")
for _i in range(len(uniq)):
    k, c = uniq[_i], cnt[_i]
    print(f"cluster {k}: {c}")

#### đánh giá ( trong trường hợp không biết nhãn)

In [None]:
# DBSCAN (COSINE) — Cell 3: unsupervised eval + append CSV
import pandas as pd, os
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Score only the non-noise points (label != -1). The metrics need at least two
# clusters; otherwise all three are reported as NaN.
mask = (labels != -1)
if mask.sum() >= 2 and len(np.unique(labels[mask])) >= 2:
    sil = float(silhouette_score(Xn[mask], labels[mask], metric="cosine"))
    dbi = float(davies_bouldin_score(Xn[mask], labels[mask]))
    ch  = float(calinski_harabasz_score(Xn[mask], labels[mask]))
else:
    sil = dbi = ch = float("nan")

print(f"[DBSCAN-cosine | unsupervised] sil={sil:.4f} | DBI={dbi} | CH={ch}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "DBSCAN",
    "metric": "cosine",
    "K": "",
    "type": "unsupervised",
    "silhouette_cosine": sil,
    "DBI": dbi,
    "CH": ch,
    "NMI": float("nan"),
    "ARI": float("nan"),
    "Purity": float("nan")
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)


#### đánh giá ( trong trường hợp biết nhãn)

In [None]:
import pandas as pd, os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix

META_PATH = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/metadata_phobert.parquet"
EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Ground truth: category names encoded as integer class ids.
df_meta = pd.read_parquet(META_PATH)
y_true = LabelEncoder().fit_transform(df_meta["category_name"].astype(str).values)

def purity_score(y_true, y_pred):
    """Return clustering purity.

    For every predicted cluster (a column of the contingency matrix built from
    ``(y_true, y_pred)``) take the count of its most frequent true class, sum
    those counts, and divide by the total number of samples.
    """
    cm = contingency_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()

# NOTE(review): `labels` is passed unfiltered, so DBSCAN noise points (-1) are
# scored as if they formed one more cluster — confirm this is intended.
nmi = float(normalized_mutual_info_score(y_true, labels, average_method="arithmetic"))
ari = float(adjusted_rand_score(y_true, labels))
pur = float(purity_score(y_true, labels))

print(f"[DBSCAN-cosine | supervised] NMI={nmi:.4f} | ARI={ari:.4f} | Purity={pur:.4f}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "DBSCAN",
    "metric": "cosine",
    "K": "",
    "type": "supervised",
    "silhouette_cosine": float("nan"),
    "DBI": float("nan"),
    "CH": float("nan"),
    "NMI": nmi,
    "ARI": ari,
    "Purity": pur
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)

### GMM

#### Grid search

In [1]:
import numpy as np, pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

# NOTE(review): this path is under /Users/huy/..., while the evaluation cells of
# this section read and write under /home/thinhbq/... — confirm which machine
# layout is intended.
EMB_PATH = "/Users/huy/Documents/Hutech/HK1A 2025-2026/social-networking/social_networking/pho_bert/outputs_phobert/phobert_embeddings.npy"
Ks = [4, 8, 12, 14, 16, 20, 24, 28, 30]

# Load the embeddings as float64 and L2-normalise each row.
X = np.load(EMB_PATH).astype(np.float64)
Xn = normalize(X, norm="l2", axis=1)

# Settings shared by every candidate mixture.
_gmm_kwargs = dict(covariance_type="diag", reg_covar=1e-4, random_state=42)

# One GMM fit per candidate K, scored by cosine silhouette of the hard labels.
rows = []
for K in Ks:
    gmm = GaussianMixture(n_components=K, **_gmm_kwargs)
    labels = gmm.fit_predict(Xn)
    sil = float(silhouette_score(Xn, labels, metric="cosine"))
    rows.append((K, sil))

# Rank candidates best-first; the top row defines best_K.
df_k = pd.DataFrame(rows, columns=["K", "silhouette_cosine"])
df_k = df_k.sort_values(by="silhouette_cosine", ascending=False)
best_K = int(df_k["K"].iloc[0])
print(df_k.to_string(index=False))
print("best_K:", best_K)


 K  silhouette_cosine
 4           0.263736
12           0.205098
28           0.199713
20           0.196263
16           0.195344
30           0.193094
14           0.183494
 8           0.179954
24           0.179773
best_K: 4


#### Áp dụng với K tìm được 

In [2]:
from sklearn.mixture import GaussianMixture
import numpy as np

# Final GMM fit with the K selected by the grid search.
gmm = GaussianMixture(
    n_components=best_K,
    covariance_type="diag",
    reg_covar=1e-4,
    random_state=42,
)
labels = gmm.fit_predict(Xn)

# Cluster sizes.
uniq, cnt = np.unique(labels, return_counts=True)
print("best_K:", best_K, "| n_clusters:", len(uniq))
for _i in range(len(uniq)):
    k, c = uniq[_i], cnt[_i]
    print(f"cluster {k}: {c}")


best_K: 4 | n_clusters: 4
cluster 0: 13975
cluster 1: 14175
cluster 2: 4945
cluster 3: 3549


#### đánh giá không nhãn

In [None]:
import pandas as pd, os
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Label-free quality metrics for the current global `labels` on `Xn`.
# Only the silhouette is given metric="cosine"; DBI and CH use their defaults.
sil = float(silhouette_score(Xn, labels, metric="cosine"))
dbi = float(davies_bouldin_score(Xn, labels))
ch = float(calinski_harabasz_score(Xn, labels))

print(f"[Unsupervised] Silhouette(cosine)={sil:.4f} | DBI={dbi:.4f} | CH={ch:.2f}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "GMM",
    "metric": "cosine",
    "K": best_K,
    "type": "unsupervised",
    "silhouette_cosine": sil,
    "DBI": dbi,
    "CH": ch,
    "NMI": float("nan"),
    "ARI": float("nan"),
    "Purity": float("nan")
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)


#### đánh giá khi biết nhãn

In [None]:
import pandas as pd, os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix

META_PATH = "/home/thinhbq/huy/social_networking/pho_bert/outputs_phobert/metadata_phobert.parquet"
EVAL_PATH = "/home/thinhbq/huy/social_networking/pho_bert/clustering/evaluation_phobert.csv"

# Ground truth: category names encoded as integer class ids.
df_meta = pd.read_parquet(META_PATH)
y_true = LabelEncoder().fit_transform(df_meta["category_name"].astype(str).values)

def purity_score(y_true, y_pred):
    """Return clustering purity.

    For every predicted cluster (a column of the contingency matrix built from
    ``(y_true, y_pred)``) take the count of its most frequent true class, sum
    those counts, and divide by the total number of samples.
    """
    cm = contingency_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()

# External metrics for the current global `labels` against the true categories.
nmi = float(normalized_mutual_info_score(y_true, labels, average_method="arithmetic"))
ari = float(adjusted_rand_score(y_true, labels))
pur = float(purity_score(y_true, labels))

print(f"[Supervised] NMI={nmi:.4f} | ARI={ari:.4f} | Purity={pur:.4f}")

# Metrics that do not apply to this row are float NaN instead of None. None
# produces all-NA object columns, and concatenating those with the numeric
# columns of the existing file raises pandas' FutureWarning about all-NA
# entries. Either value is written to the CSV as an empty field.
record = pd.DataFrame([{
    "model": "GMM",
    "metric": "cosine",
    "K": best_K,
    "type": "supervised",
    "silhouette_cosine": float("nan"),
    "DBI": float("nan"),
    "CH": float("nan"),
    "NMI": nmi,
    "ARI": ari,
    "Purity": pur
}])

# to_csv does not create parent directories; make sure the folder exists.
os.makedirs(os.path.dirname(EVAL_PATH), exist_ok=True)

# Append to the running results table, creating it on first use.
if os.path.exists(EVAL_PATH):
    old = pd.read_csv(EVAL_PATH)
    df_out = pd.concat([old, record], ignore_index=True)
else:
    df_out = record

df_out.to_csv(EVAL_PATH, index=False)


## multilingual

### KMeans

#### grid search