In [None]:
# get pretrained FastText embeddings using own courpus
from gensim.models import FastText

def load_conll_tokens(file_path):
    sentences = []
    current_sentence = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                token = line.strip().split()[0]  # Take token only
                current_sentence.append(token)
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

# Load tokenized sentences from your .conll
tokenized_sentences = load_conll_tokens("../datasets/3entity_annotated_ner_cleaned.conll")



# Train FastText model
# ft_model = FastText(
#     sentences=tokenized_sentences,
#     vector_size=300,
#     window=5,
#     min_count=1,
#     workers=4
# )

# # Save the model
# ft_model.save("fasttext_gensim.model")

# # Example: Get vector for a token
# print(ft_model.wv["မြန်မာ"])


In [10]:
# Flatten the list of sentences into a single list of words
all_tokens = [word for sent in tokenized_sentences for word in sent]
print(f"Total sentences: {len(tokenized_sentences)}")

print(f"Total tokens in corpus: {len(all_tokens)}")
# Count unique words
unique_words = set(all_tokens)

print(f"Unique words in corpus: {len(unique_words)}")
#print(f"Vocab size in FastText: {len(ft_model.wv)}")


Total sentences: 71711
Total tokens in corpus: 2620226
Unique words in corpus: 25505


In [7]:
# ---------- Load FastText model ----------
ft_model = FastText.load("fasttext_gensim.model")
print(f"Vocab size: {len(ft_model.wv)}")
print(f"Vector dimension: {ft_model.wv.vector_size}")


Vocab size: 25505
Vector dimension: 300


In [None]:
import re
import numpy as np
from gensim.models import FastText
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
import time

def load_conll(file_path):
    """
    Load a CoNLL file and return a list of sentences for NER.

    Args:
        file_path (str): Path to the CoNLL file.

    Returns:
        List[List[Tuple[str, str, str]]]: List of sentences, each sentence is a list of (token, pos, ner) tuples.
    """
    sentences, sentence = [], []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) >= 3:
                    token, pos, ner = parts[0], parts[1], parts[2]
                else:
                    token, pos, ner = parts[0], "X", "O"
                sentence.append((token, pos, ner))
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    if sentence:
        sentences.append(sentence)
    return sentences

def is_numeric(token):
    """
    Check if a token is numeric (supports Burmese and Western digits).

    Args:
        token (str): The token to check.

    Returns:
        bool: True if the token is numeric, False otherwise.
    """
    burmese_digits = "၁၂၃၄၅၆၇၈၉၀"
    return token.isdigit() or all(c in burmese_digits for c in token)

def is_english(token):
    """
    Check if a token consists only of English letters.

    Args:
        token (str): The token to check.

    Returns:
        bool: True if the token is English, False otherwise.
    """
    return bool(re.match(r"^[A-Za-z]+$", token))

def word2features(sentence, index, ft_model):
    """
    Extract features for a token in a sentence, including FastText embedding average.

    Args:
        sentence (List[Tuple[str, str, str]]): List of (token, pos, ner) tuples.
        index (int): Index of the token in the sentence.
        ft_model (FastText): Pretrained FastText model.

    Returns:
        dict: Dictionary of extracted features for the token.
    """
    token = sentence[index][0]
    features = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': token[0],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'prefix-4': token[:4],
        'prefix-5': token[:5],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'suffix-4': token[-4:],
        'suffix-5': token[-5:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),
        'is_english': is_english(token),
    }
    # Add FastText average embedding value
    if token in ft_model.wv:
        vec = ft_model.wv[token]  # vector for this token, to save time and to convert 1D scaler 
        features['ft_avg'] = float(np.mean(vec)) # average value of its dimensions
    else:
        features['ft_avg'] = 0.0
    return features

def sent2features(sentence, ft_model):
    """
    Convert a sentence to a list of feature dicts for each token.

    Args:
        sentence (List[Tuple[str, str, str]]): List of (token, pos, ner) tuples.
        ft_model (FastText): Pretrained FastText model.

    Returns:
        List[dict]: List of feature dicts for each token.
        [ {feature_dict_of_word1}, {feature_dict_of_word2}, ... ]
    """
    return [word2features(sentence, i, ft_model) for i in range(len(sentence))]

def sent2labels(sentence):
    """
    Extract NER labels from a sentence.

    Args:
        sentence (List[Tuple[str, str, str]]): List of (token, pos, ner) tuples.

    Returns:
        List[str]: List of NER labels for the sentence.
    """
    return [ner for _, _, ner in sentence]

# ---------- Load FastText model ----------
ft_model = FastText.load("fasttext_gensim.model")

# ---------- Load data ----------
file_path = "../datasets/3entity_annotated_ner_cleaned.conll"
data = load_conll(file_path)

# ---------- Convert data to features & labels ----------
X = [sent2features(s, ft_model) for s in data]
y = [sent2labels(s) for s in data]

# ---------- Split train/test ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------- Train CRF ----------
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
start_time = time.time()
crf.fit(X_train, y_train)
print(f"Training completed in {time.time() - start_time:.2f} seconds")

# ---------- Predict & report ----------
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))


Training completed in 700.60 seconds
              precision    recall  f1-score   support

      B-DATE      0.946     0.948     0.947      2609
       B-LOC      0.918     0.862     0.889     11068
      B-TIME      0.952     0.924     0.938       595
      I-DATE      0.957     0.951     0.954      4205
       I-LOC      0.838     0.764     0.800      8097
      I-TIME      0.945     0.925     0.935       729
           O      0.993     0.996     0.994    499009

    accuracy                          0.989    526312
   macro avg      0.936     0.910     0.922    526312
weighted avg      0.988     0.989     0.988    526312



In [5]:
import pickle

# Save the CRF model
with open("fasttext_crf_ner_model.pkl", "wb") as f:
    pickle.dump(crf, f)

print("CRF model saved successfully!")

CRF model saved successfully!


In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# ---------- Base CRF ----------
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# ---------- Hyperparameter tuning ----------
params_space = {
    'c1': np.logspace(-3, 0, 10),  # L1
    'c2': np.logspace(-3, 0, 10),  # L2
}

# Use weighted F1 score
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_iter=10,
                        scoring='f1_weighted',
                        n_jobs=-1,
                        random_state=42)
rs.fit(X_train, y_train)

print("Best parameters:", rs.best_params_)
best_crf = rs.best_estimator_

# ---------- Predict & evaluate ----------
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

# ---------- Save model ----------
import pickle
with open("crf_fasttext_tuned.pkl", "wb") as f:
    pickle.dump(best_crf, f)
print("Best CRF model saved to crf_fasttext_tuned.pkl")

Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best parameters: {'c2': 0.01, 'c1': 0.46415888336127775}
              precision    recall  f1-score   support

      B-DATE      0.949     0.946     0.947      5410
       B-LOC      0.915     0.856     0.884     22067
      B-TIME      0.943     0.913     0.928      1151
      I-DATE      0.959     0.950     0.954      8733
       I-LOC      0.830     0.764     0.796     16252
      I-TIME      0.937     0.910     0.924      1406
           O      0.993     0.996     0.994    998005

    accuracy                          0.988   1053024
   macro avg      0.932     0.905     0.918   1053024
weighted avg      0.988     0.988     0.988   1053024

Best CRF model saved to crf_fasttext_tuned.pkl
