In [2]:
import re

def is_numeric(token):
    """
    Check if a token is numeric (supports Burmese and Western digits).

    Args:
        token (str): The token to check.

    Returns:
        bool: True if the token is numeric, False otherwise.
    """
    burmese_digits = set("၁၂၃၄၅၆၇၈၉၀")
    return token.isdigit() or all(char in burmese_digits for char in token)

def is_english(token):
    """
    Check if a token consists only of English letters.

    Args:
        token (str): The token to check.

    Returns:
        bool: True if the token is English, False otherwise.
    """
    return bool(re.match(r"^[A-Za-z]+$", token))

def extract_features(sentence, index):
    """
    Extract features for a token in a sentence for NER classification.

    Args:
        sentence (List[Tuple[str, str, str]]): List of (token, pos, ner) tuples.
        index (int): Index of the token in the sentence.

    Returns:
        dict: Dictionary of extracted features for the token.
    """
    token = sentence[index][0]
    features = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': token[0],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'prefix-4': token[:4],
        'prefix-5': token[:5],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'suffix-4': token[-4:],
        'suffix-5': token[-5:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),
        'is_english': is_english(token)
    }
    return features

# ---------- Convert token features to single string ----------
def token_to_string(features):
    """
    Combine token and features into a single string for TF-IDF
    """
    s = features['word']
    s += ' pre1_' + features['prefix-1']
    s += ' pre2_' + features['prefix-2']
    s += ' pre3_' + features['prefix-3']
    s += ' suf1_' + features['suffix-1']
    s += ' suf2_' + features['suffix-2']
    s += ' suf3_' + features['suffix-3']
    s += ' prev_' + features['prev_word']
    s += ' next_' + features['next_word']
    s += ' hyphen_' + str(features['has_hyphen'])
    s += ' numeric_' + str(features['is_numeric'])
    s += ' english_' + str(features['is_english'])
    return s

# ---------- Prepare data for TF-IDF + NB ----------
def prepare_data(conll_data):
    """
    conll_data: list of sentences, each sentence = list of tuples (token, pos, ner)
    Returns:
        X_strings: list of token feature strings
        y_labels: list of NER labels
    """
    X_strings = []
    y_labels = []

    for sentence in conll_data:
        for i in range(len(sentence)):
            feats = extract_features(sentence, i)
            X_strings.append(token_to_string(feats))
            y_labels.append(sentence[i][2])  # NER label

    return X_strings, y_labels


In [3]:
def load_conll(file_path):
    """
    Load a .conll file and return a list of sentences.
    Each sentence is a list of tuples: (token, pos, ner)
    
    Args:
        file_path (str): path to .conll file
    
    Returns:
        List[List[Tuple[str, str, str]]]: list of sentences
    """
    sentences = []
    sentence = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.split('\t')
                if len(parts) >= 3:
                    token, pos, ner = parts[0], parts[1], parts[2]
                    sentence.append((token, pos, ner))
                else:
                    # Handle lines with missing columns
                    token = parts[0]
                    pos = parts[1] if len(parts) > 1 else 'X'
                    ner = parts[2] if len(parts) > 2 else 'O'
                    sentence.append((token, pos, ner))
    # Append last sentence if file does not end with blank line
    if sentence:
        sentences.append(sentence)

    return sentences


In [4]:

input_dir = "../datasets/3entity_annotated_ner_cleaned.conll"
sentences = load_conll(input_dir)  # list of sentences, each sentence = [(token,pos,ner), ...]

# Prepare data
X_strings, y_labels = prepare_data(sentences)

# TF-IDF + Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time

X_train, X_test, y_train, y_test = train_test_split(X_strings, y_labels, test_size=0.4, random_state=42,stratify=y_labels)

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), lowercase=False)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

nb = MultinomialNB()
start_time = time.time()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_test_tfidf)
end_time = time.time()

print(classification_report(y_test, y_pred, zero_division=0))
print(f"Training and prediction took {end_time - start_time:.4f} seconds")


              precision    recall  f1-score   support

      B-DATE       0.72      0.55      0.62      5317
       B-LOC       0.63      0.33      0.43     21967
      B-TIME       0.96      0.26      0.41      1159
      I-DATE       0.51      0.63      0.56      8583
       I-LOC       0.43      0.17      0.24     16053
      I-TIME       0.94      0.03      0.06      1472
           O       0.97      0.99      0.98    993540

    accuracy                           0.95   1048091
   macro avg       0.74      0.42      0.47   1048091
weighted avg       0.95      0.95      0.95   1048091

Training and prediction took 5.1861 seconds
