In [7]:
# #!pip uninstall numpy gensim -y
# !pip install numpy gensim --force-reinstall --no-cache-dir

In [3]:
from gensim.models import Word2Vec, FastText
import pandas as pd
import data_preprocessing as dp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nuwai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nuwai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nuwai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
MAX_LEN = 40

In [5]:
df = pd.read_csv('./data/MMNames_clean.csv')
df = dp.clean_name_column(df, 'name')

In [6]:
# Step 1: Identify ambiguous names (appear in multiple regions)
dupes = df.groupby("name")["SR_Name"].nunique()
ambiguous_names = dupes[dupes > 1].index

# Step 2: For ambiguous names, keep only the first occurrence
ambiguous_df = df[df["name"].isin(ambiguous_names)]
ambiguous_deduped = ambiguous_df.drop_duplicates(subset="name", keep="first")

# Step 3: For non-ambiguous names, just keep them as-is
non_ambiguous_df = df[~df["name"].isin(ambiguous_names)]

# Step 4: Combine them back together
df = pd.concat([non_ambiguous_df, ambiguous_deduped], ignore_index=True)

# Optional: Check final size
print(f"Original: {len(df)}, After deduplication: {len(df)}")

Original: 15087, After deduplication: 15087


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(df['SR_Name'])

In [8]:

# Assume df['name'] contains romanized text
df['name'].dropna().astype(str).to_csv("names.txt", index=False, header=False)

In [9]:
# Tokenize each name into a list of characters or subwords (or words, depending on data)
df['tokens'] = df['name'].astype(str).apply(lambda x: x.split())  # word-level
# OR for character-level: list(x)
tokenized_data = df['tokens'].tolist()

In [10]:
import re

def normalize_burmese_phonics(text):
    text = text.lower().strip()

    # Mapping of Romanized Burmese phonics to standard forms
    phonics_map = {
        # Aspirated consonants → base form
        'ph': 'p',
        'hp': 'p',
        'hpy': 'py',
        'hs': 's',
        'th': 't',
        'ht': 't',
        'kh': 'k',
        'hk': 'k',
        'ng': 'n',
        'ny': 'n',
        'my': 'm',

        # Diphthongs and vowels
        'oo': 'u',
        'ou': 'u',
        'au': 'o',
        'aw': 'o',
        'ae': 'e',
        'ay': 'e',
        'ei': 'e',
        'ia': 'ya',   # ex: "Pyi A" or "Pya"
        'ua': 'wa',

        # Word endings or tones
        'aung': 'ong',
        'auk': 'ok',
        'ein': 'en',
        'yin': 'in',
        'yan': 'an',

        # Optional tone reduction
        'ya': 'a',
        'wa': 'a',
        'ra': 'a',

        # Silent or redundant
        'rr': 'r',
        'll': 'l',
        'pp': 'p',
        'tt': 't',
        'kk': 'k',
        'mm': 'm',
        'nn': 'n',
        'gg': 'g',
        'ss': 's',
    }

    # Apply rules based on length, avoiding overlap
    for k in sorted(phonics_map, key=lambda x: -len(x)):
        text = re.sub(k, phonics_map[k], text)

    # Remove unwanted characters
    text = re.sub(r'[^a-z\s]', '', text)

    # Normalize whitespace and repeated letters
    text = re.sub(r'(.)\1+', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [11]:
df['name'] = df['name'].apply(normalize_burmese_phonics)

In [12]:
df['name']

0              bogale
1             danubyu
2                dede
3                enme
4             hintada
             ...     
15082          nar ku
15083     tone bo gyi
15084    pan kar kone
15085         par kar
15086          an mai
Name: name, Length: 15087, dtype: object

In [13]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# === PARAMETERS ===
MAX_LEN = 30
EMBEDDING_DIM = 100
BATCH_SIZE = 32
EPOCHS = 20
TEST_SIZE = 0.3
RANDOM_STATE = 42

# === 1. Tokenize names into char lists for FastText ===
df['tokens'] = df['name'].astype(str).apply(lambda x: list(x.strip().lower()))
tokenized_data = df['tokens'].tolist()

# === 2. Train FastText model on all data (unsupervised embeddings) ===
fasttext_model = FastText(
    sentences=tokenized_data,
    vector_size=EMBEDDING_DIM,
    window=3,
    min_count=1,
    workers=4,
    sg=1  # skip-gram
)
fasttext_model.save("fasttext_gensim.model")

# === 3. Prepare texts for Keras Tokenizer (space-separated chars) ===
texts_for_keras = [' '.join(tokens) for tokens in tokenized_data]

# === 4. Split data into train and test BEFORE fitting tokenizer and label encoder to avoid leakage ===
X_train_texts, X_test_texts, y_train_raw, y_test_raw = train_test_split(
    texts_for_keras, df['SR_Name'], test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df['SR_Name']
)

# === 5. Fit Keras Tokenizer ONLY on training data ===
tokenizer = Tokenizer(char_level=False, lower=True)  # words = chars separated by space
tokenizer.fit_on_texts(X_train_texts)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# === 6. Encode labels with LabelEncoder fitted on train labels ONLY ===
label_encoder = LabelEncoder()
label_encoder.fit(y_train_raw)

y_train = label_encoder.transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

# === 7. Create embedding matrix from FastText for tokenizer vocabulary ===
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in fasttext_model.wv:
        embedding_matrix[i] = fasttext_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(size=(EMBEDDING_DIM,))

# === 8. Convert texts to padded sequences ===
X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')

X_test_seq = tokenizer.texts_to_sequences(X_test_texts)
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')


In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

{0: 0.4572616263964666, 1: 1.178045515394913, 2: 1.4593698175787728, 3: 1.3008130081300813, 4: 1.0628019323671498, 5: 3.963963963963964, 6: 1.7617617617617618, 7: 0.5155243116578794, 8: 0.64968623108158, 9: 1.8863879957127545, 10: 6.111111111111111, 11: 0.7770419426048565, 12: 0.3763096001710498, 13: 3.4108527131782944, 14: 0.6411657559198543, 15: 1.7777777777777777, 16: 1.8863879957127545, 17: 1.455748552522746}


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, SpatialDropout1D


def create_conv_lstm_model(vocab_size, max_len, num_classes,embedding_matrix):
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False  # freeze embeddings, set True if you want to fine-tune
        ),
        SpatialDropout1D(0.2),
        Conv1D(64, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(2),
        Bidirectional(LSTM(64, return_sequences=True)),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

def create_wide_conv_model(vocab_size, max_len, num_classes, embedding_matrix): 
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False  # freeze embeddings, set True if you want to fine-tune
        ),
        Conv1D(128, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(1024, activation='relu'),
        Dropout(0.3 ),
        Dense(512, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model  

num_classes = len(label_encoder.classes_)
model = create_conv_lstm_model(vocab_size, MAX_LEN, num_classes, embedding_matrix)
#model = create_wide_conv_model(vocab_size, MAX_LEN, num_classes, embedding_matrix)

# === 10. Setup callbacks ===
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# === 11. Train model with validation on test set ===
history = model.fit(
    X_train_padded, y_train,
    epochs=100,
    batch_size=BATCH_SIZE,
    validation_data=(X_test_padded, y_test),  class_weight=class_weights_dict,
    #callbacks=[early_stop, checkpoint],
    verbose=1
)

# === 12. Evaluate on test set ===
loss, accuracy = model.evaluate(X_test_padded, y_test, verbose=1)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

Epoch 1/100




[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.0622 - loss: 2.8828 - val_accuracy: 0.0269 - val_loss: 2.8675
Epoch 2/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.0732 - loss: 2.8294 - val_accuracy: 0.0870 - val_loss: 2.7647
Epoch 3/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.0860 - loss: 2.7441 - val_accuracy: 0.0568 - val_loss: 2.7394
Epoch 4/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.0988 - loss: 2.6838 - val_accuracy: 0.1582 - val_loss: 2.6516
Epoch 5/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.1175 - loss: 2.6141 - val_accuracy: 0.1456 - val_loss: 2.6490
Epoch 6/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.1181 - loss: 2.5988 - val_accuracy: 0.1127 - val_loss: 2.6309
Epoch 7/100
[1m330/330[0m 

In [48]:
# === 12. Evaluate on test set ===
loss, accuracy = model.evaluate(X_train_padded, y_train, verbose=1)
print(f"Test loss: {loss:.4f}, Train accuracy: {accuracy:.4f}")

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9257 - loss: 0.2559
Test loss: 0.2578, Train accuracy: 0.9255


In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    top_k_accuracy_score
)
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_classification_model(model, X, y_true, output_path, prefix="test", batch_size=32, top_k=3, label_encoder=None):
    # Predict probabilities
    y_probs = model.predict(X, batch_size=batch_size, verbose=0)
    y_pred = np.argmax(y_probs, axis=1)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    
    # Top-k Accuracy (optional)
    top_k_acc = top_k_accuracy_score(y_true, y_probs, k=top_k)

    # Classification Report
    report = classification_report(y_true, y_pred, output_dict=True,target_names=le.classes_)
    report_df = pd.DataFrame(report).round(2).transpose()
    report_df.loc["accuracy"] = acc
    report_df.loc[f"top_{top_k}_accuracy"] = top_k_acc

    # Save Report
    report_df.to_csv(f"{output_path}/cls_report_{prefix}.csv", index=True)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    
    if label_encoder is not None:
        xticks = yticks = label_encoder.classes_
        label_map = dict(enumerate(label_encoder.classes_))
    else:
        xticks = yticks = np.arange(len(np.unique(y_true)))
        label_map = None
    print(f"label map: {label_map}")
    
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=xticks, yticklabels=yticks)
    plt.title(f"Confusion Matrix - {prefix}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(f"{output_path}/confusion_matrix_{prefix}.png")
    plt.close()

    return {
        "accuracy": acc,
        f"top_{top_k}_accuracy": top_k_acc,
        "classification_report": report_df,
        "confusion_matrix": cm
    }

In [17]:
test_results = evaluate_classification_model(model, X_test_padded, y_test, './data', prefix="_norm_test_wideconv_fastext_epoch80")
train_results = evaluate_classification_model(model, X_train_padded, y_train, './data', prefix="norm_train_convlstm_fasttext_epoch80")
print(test_results['accuracy'])
print(train_results['accuracy'])

label map: None
label map: None
0.3565275016567263
0.9204545454545454
