# 1. Data Understanding

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame, using the first row as column names
dataset = pd.read_csv('/kaggle/input/burmese-name-with-gender/Burmese Name with Gender.csv')

# Display the DataFramedataset
dataset

Unnamed: 0,Name,Gender
0,ကျင်ဒါမဏီ,1
1,ကျော့ကျော့စံ,1
2,ကျော့ကေခိုင်,1
3,ကျော့ကေခိုင်စု,1
4,ကျော့ကေသရီ,1
...,...,...
4326,ဥမ္မာခိုင်,1
4327,ဥမ္မာနွေး,1
4328,ဥသာဖြိုး,0
4329,ဧကရီထွန်း,1


## 2. Data Preprocessing

In [2]:
dataset = dataset.replace({r'\n': ' ', r'\r': ' '}, regex=True)

In [3]:
dataset = dataset.sample(frac=1)
dataset

Unnamed: 0,Name,Gender
1843,နှင်းနှင်းဦး,1
3764,ဟိန်းစည်,0
1933,နိုင်ဝင်းအေး,0
652,ခိုင်မူယာလင်း,1
2832,ရတနာချို,1
...,...,...
1103,ဆုလဲ့ဖြူ,1
273,ကောင်းနိုင်ခ,0
2485,မြတ်သုခ,0
1850,နှင်းပွင့်ဖြူ,1


In [4]:
import re

def segment(text):
    # Use regular expressions to segment the input text
    text = re.sub(r'(?:(?<!္)([က-ဪဿ၊-၏]|[၀-၉]+|[^က-၏]+)(?![ှျ]?[့္်]))', r' \1', text)
    return text

# Apply segmentation to each name in the DataFrame
dataset['Name'] = dataset['Name'].apply(segment)

# Display the DataFrame
dataset

Unnamed: 0,Name,Gender
1843,နှင်း နှင်း ဦး,1
3764,ဟိန်း စည်,0
1933,နိုင် ဝင်း အေး,0
652,ခိုင် မူ ယာ လင်း,1
2832,ရ တ နာ ချို,1
...,...,...
1103,ဆု လဲ့ ဖြူ,1
273,ကောင်း နိုင် ခ,0
2485,မြတ် သု ခ,0
1850,နှင်း ပွင့် ဖြူ,1


In [5]:
train_size = int(dataset.shape[0]*0.7)

train= dataset.head(train_size)

test= dataset.tail(dataset.shape[0]-train_size)

In [6]:
train

Unnamed: 0,Name,Gender
1843,နှင်း နှင်း ဦး,1
3764,ဟိန်း စည်,0
1933,နိုင် ဝင်း အေး,0
652,ခိုင် မူ ယာ လင်း,1
2832,ရ တ နာ ချို,1
...,...,...
1147,ဇင် ထူး ပိုင်,0
3437,သန္တာ ဇင်,1
2097,ပိုင် သူ ကျော်,0
97,ကျော် မျိုး လွင်,0


In [7]:
test

Unnamed: 0,Name,Gender
3255,ဝေ စိုး စံ,0
2816,ယု ယု ဇင်,1
1929,နိုင် လင်း ဦး,0
2924,ရွှေ သ ဇင် ဦး,1
3135,လေး အောင်,0
...,...,...
1103,ဆု လဲ့ ဖြူ,1
273,ကောင်း နိုင် ခ,0
2485,မြတ် သု ခ,0
1850,နှင်း ပွင့် ဖြူ,1


In [8]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 9
vocab_size = 413
trunc_type='post'        #put needed '0's for max length
oov_tok = "<OOV>"        #out of vocab


tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(train['Name'])

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train['Name'])
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type,padding='post')

In [9]:
padded

array([[ 28,  28,  14, ...,   0,   0,   0],
       [ 37, 109,   0, ...,   0,   0,   0],
       [ 18,   4,  23, ...,   0,   0,   0],
       ...,
       [ 45,  12,   3, ...,   0,   0,   0],
       [  3,  22,  24, ...,   0,   0,   0],
       [110, 110,  38, ...,   0,   0,   0]], dtype=int32)

In [10]:
word_index

{'<OOV>': 1,
 'အောင်': 2,
 'ကျော်': 3,
 'ဝင်း': 4,
 'ထက်': 5,
 'မြတ်': 6,
 'ထွန်း': 7,
 'ဇင်': 8,
 'မင်း': 9,
 'ခင်': 10,
 'လင်း': 11,
 'သူ': 12,
 'ဇော်': 13,
 'ဦး': 14,
 'စိုး': 15,
 'ဝေ': 16,
 'ခိုင်': 17,
 'နိုင်': 18,
 'မိုး': 19,
 'ဖြိုး': 20,
 'အိ': 21,
 'မျိုး': 22,
 'အေး': 23,
 'လွင်': 24,
 'မြင့်': 25,
 'မေ': 26,
 'သီ': 27,
 'နှင်း': 28,
 'ဆု': 29,
 'သက်': 30,
 'ကို': 31,
 'စု': 32,
 'ဦး': 33,
 'ရ': 34,
 'မွန်': 35,
 'ပြည့်': 36,
 'ဟိန်း': 37,
 'လှိုင်': 38,
 'ရွှေ': 39,
 'ဇာ': 40,
 'ရည်': 41,
 'စံ': 42,
 'ဟန်': 43,
 'ဖြူ': 44,
 'ပိုင်': 45,
 'ရဲ': 46,
 'ခန့်': 47,
 'တင်': 48,
 'သ': 49,
 'နွယ်': 50,
 'ကောင်း': 51,
 'ညီ': 52,
 'ဖူး': 53,
 'သိန်း': 54,
 'ငြိမ်း': 55,
 'ထူး': 56,
 'မောင်': 57,
 'မာ': 58,
 'လဲ့': 59,
 'တ': 60,
 'ကေ': 61,
 'နန်း': 62,
 'ယု': 63,
 'သင်း': 64,
 'နီ': 65,
 'ရိ': 66,
 'မြ': 67,
 'နာ': 68,
 'စင်': 69,
 'ကြည်': 70,
 'သော်': 71,
 'ထွေး': 72,
 'မီ': 73,
 'သန်း': 74,
 'နေ': 75,
 'ချမ်း': 76,
 'ဆွေ': 77,
 'ဝါ': 78,
 'တာ': 79,
 'သန့်': 80,
 'သန္တာ': 81,
 'ပ'

# 3. Modeling

In [11]:
import tensorflow as tf

embedding_dim = 64

model = tf.keras.Sequential([

    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(1000, activation='relu'),

    tf.keras.layers.Dense(500, activation='relu'),

    tf.keras.layers.Dense(1, activation='sigmoid')

])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])



# 4. Model Training

In [12]:
import numpy as np



num_epochs = 20

history = model.fit(padded, np.array(train['Gender']), epochs=num_epochs,verbose = 2)

Epoch 1/20


I0000 00:00:1730785754.295587      67 service.cc:145] XLA service 0x7d24ac008370 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730785754.295643      67 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1730785754.295648      67 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1730785755.497583      67 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


95/95 - 4s - 39ms/step - accuracy: 0.8542 - loss: 0.3135
Epoch 2/20
95/95 - 0s - 2ms/step - accuracy: 0.9578 - loss: 0.1099
Epoch 3/20
95/95 - 0s - 2ms/step - accuracy: 0.9733 - loss: 0.0747
Epoch 4/20
95/95 - 0s - 2ms/step - accuracy: 0.9802 - loss: 0.0591
Epoch 5/20
95/95 - 0s - 2ms/step - accuracy: 0.9848 - loss: 0.0416
Epoch 6/20
95/95 - 0s - 2ms/step - accuracy: 0.9878 - loss: 0.0326
Epoch 7/20
95/95 - 0s - 2ms/step - accuracy: 0.9898 - loss: 0.0288
Epoch 8/20
95/95 - 0s - 2ms/step - accuracy: 0.9904 - loss: 0.0223
Epoch 9/20
95/95 - 0s - 2ms/step - accuracy: 0.9924 - loss: 0.0190
Epoch 10/20
95/95 - 0s - 2ms/step - accuracy: 0.9957 - loss: 0.0134
Epoch 11/20
95/95 - 0s - 2ms/step - accuracy: 0.9967 - loss: 0.0094
Epoch 12/20
95/95 - 0s - 2ms/step - accuracy: 0.9984 - loss: 0.0038
Epoch 13/20
95/95 - 0s - 2ms/step - accuracy: 0.9980 - loss: 0.0084
Epoch 14/20
95/95 - 0s - 2ms/step - accuracy: 0.9997 - loss: 0.0017
Epoch 15/20
95/95 - 0s - 2ms/step - accuracy: 0.9970 - loss: 0.0114

# 5. Evaluation

In [13]:
testing_sequences = tokenizer.texts_to_sequences(test['Name'])

testing_padded = pad_sequences(testing_sequences,maxlen=max_length, truncating=trunc_type,padding='post')

In [14]:
predicted_results = model.predict(testing_padded)

y_pred = []

threshold = 0.5

for  i in predicted_results:

  current_val = i[0]

  if (current_val>=threshold):

    y_pred.append(1)

  else:

    y_pred.append(0)


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [15]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(test['Gender'],y_pred)

print('confusion_matrix')

print(confusion_matrix)

confusion_matrix
[[528  31]
 [ 36 705]]


In [16]:
# Evaluation Metrics





from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

ans = y_pred

print('Accuracy Score',accuracy_score(test['Gender'],ans)*100,'%')

print('Precision Macro Score',precision_score(test['Gender'],ans,average = 'macro')*100,'%')

print('Recall_Score',recall_score(test['Gender'],ans, average = 'macro')*100,'%')

print('F1_Score',f1_score(test['Gender'],ans, average = 'macro')*100,'%')

Accuracy Score 94.84615384615384 %
Precision Macro Score 94.70253237742831 %
Recall_Score 94.79804161566707 %
F1_Score 94.74880793116898 %


# 6. Saving

In [17]:
model.save('name_gender.keras')

# 7. Integration

In [18]:
inputs = ['ယွန်းထက်ဝင်း']

inputs = [re.sub(r'(?:(?<!္)([က-ဪဿ၊-၏]|[၀-၉]+|[^က-၏]+)(?![ှျ]?[့္်]))', r' \1', i) for i in inputs]
print(inputs)

testing_sequences = tokenizer.texts_to_sequences(inputs)
testing_padded = pad_sequences(testing_sequences,maxlen=9, truncating=trunc_type,padding='post')
gender = tf.keras.models.load_model('/kaggle/working/name_gender.keras')

ans = gender.predict(testing_padded)

# Apply threshold
binary_result = (ans >= 0.5).astype(int)
if binary_result == 0:
    print('Male')
else:
    print('Female')

[' ယွန်း ထက် ဝင်း']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step
Female
