<a href="https://colab.research.google.com/github/SaranAI/thaichar2vec/blob/main/notebooks/research/train_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About this notebook
- This notebook implements skip-gram and CBOW models on a Thai corpus to obtain Thai character embeddings
- Although in most cases it is better to use (sub)word embeddings than character embeddings, character embeddings can still be useful for tasks such as
1. name generation
2. Thai word segmentation

## Goal
- To maximize the entropy of the words in the corpus

## Method
- Obtain Corpora
  - name-surname corpus
  - Thai vocabulary from the Royal Institute Dictionary (ฉบับราชบัณฑิตยสถาน)
- Train
  - skipgram
  - cbow
  - negative samples
  - glove
- Visualize

In [None]:
# Import Data
%%capture

# Name-surname corpus
!wget -O male_name.txt https://raw.githubusercontent.com/SaranAI/thaichar2vec/main/artifacts/dataset/male_names_th.txt
!wget -O female_name.txt https://raw.githubusercontent.com/SaranAI/thaichar2vec/main/artifacts/dataset/female_names_th.txt
!wget -O surname.txt https://raw.githubusercontent.com/SaranAI/thaichar2vec/main/artifacts/dataset/surnames_th.txt

# Thai Lexicon Vocabulary
!wget -O lexicon.txt https://raw.githubusercontent.com/SaranAI/thaichar2vec/main/artifacts/dataset/lexicon_th.txt

In [None]:
from typing import List, Set, Union, Dict, Tuple
import os

# Define functions
def open_txt(txt_file:str)->List[str]:
  """
  Read a text file and return its lines as a list of strings.

  Every line is stripped of leading/trailing whitespace (including the
  trailing newline), so a blank line becomes an empty string.

  Args:
    txt_file: path of the text file to read.

  Returns:
    One list element per line of the file, in file order.
  """
  # The corpora are Thai text: decode explicitly as UTF-8 instead of relying
  # on the platform's default encoding.
  with open(txt_file, "r", encoding="utf-8") as f:
    return [line.strip() for line in f]

def get_corpora(corpus_file_names:List[str],
                root_dir:str="/content",
                verbose=True)->Set[str]:
  """
  Load several corpus files and merge them into one set of unique words.

  Args:
    corpus_file_names: file names of the corpora, relative to root_dir.
    root_dir: directory that contains the corpus files.
    verbose: when truthy, print the line count of every file and the size
      of the merged set.

  Output:
    - set of all words (set)
  """
  corpora = set()
  for corpus_file_name in corpus_file_names:
    corpus_path = os.path.join(root_dir, corpus_file_name)
    corpus = open_txt(corpus_path)
    if verbose:
      print(f"len({corpus_file_name})={len(corpus)}")
    # Set union removes words that appear in more than one file.
    corpora.update(corpus)

  if verbose:
    print(f"{len(corpora)=}")
  return corpora

In [None]:
# File names of the corpora to load; get_corpora joins each of them with its
# default root_dir.
corpus_file_name = ["male_name.txt",
                    "female_name.txt",
                    "surname.txt",
                    "lexicon.txt"]

# Merge all corpus files into a single set of unique words.
corpora = get_corpora(corpus_file_name,verbose=True)

len(male_name.txt)=7124
len(female_name.txt)=5098
len(surname.txt)=9836
len(lexicon.txt)=37675
len(corpora)=56983


- We found that some vocabulary entries contain characters that are not in the Thai whitelist (e.g. spaces or hyphens), so we remove every such word from the corpus.

In [None]:
# REF: character list obtained from Pythainlp.thai_characters, with some
# characters removed.
WHITELIST = list("กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮฤฦะัาำิีึืุูเแโใไๅํ็่้๊๋ฯฺๆ์ํ")
# Pop duplicated
WHITELIST = sorted(list(set(WHITELIST)))

def preprocess(corpus:Set[str],
               whitelist:Union[List,str]=WHITELIST,
               verbose:bool=True)->Set[str]:
  """
  Remove every word that contains a character outside the whitelist.

  Args:
    corpus: set of words to filter.
    whitelist: allowed characters, as a list of characters or a string.
    verbose: when True, print each removed word together with the first
      offending character, then the size of the filtered corpus.
      When False, nothing is printed.

  Returns:
    The same set object that was passed in, after filtering.

  Note:
    This function mutates the corpus in-place
  """
  # Set membership is O(1); a list/str whitelist would be scanned per character.
  allowed_chars = set(whitelist)

  # Iterate over a copy because words are removed from `corpus` while looping.
  for word in corpus.copy():
    # Inspect in character level
    for char in word:
      if char not in allowed_chars:
        corpus.remove(word)
        if verbose:
          print(f'Remove word "{word}" containing "{char}"')
        # One offending character is enough to reject the word.
        break

  if verbose:
    print(f"{len(corpus)=}")
  return corpus

In [None]:
# Drop every word that contains a non-whitelisted character. preprocess mutates
# the set it receives, so `corpora` is filtered in place as well as re-bound.
corpora = preprocess(corpora, WHITELIST)

Remove word "ศาป-" containing "-"
Remove word "สุ ๆ" containing " "
Remove word "หลัด ๆ" containing " "
Remove word "มาตร-" containing "-"
Remove word "หย่ง ๆ" containing " "
Remove word "นักขัต-" containing "-"
Remove word "ปริกรรม-" containing "-"
Remove word "จิต-" containing "-"
Remove word "ทาส-" containing "-"
Remove word "อปาน-" containing "-"
Remove word "จริม-" containing "-"
Remove word "ฉันท-" containing "-"
Remove word "ชย-" containing "-"
Remove word "กิเลส-" containing "-"
Remove word "อดิเรก-" containing "-"
Remove word "พะเยิบ ๆ" containing " "
Remove word "คัพภ-" containing "-"
Remove word "จตุตถ-" containing "-"
Remove word "เบญจางค-" containing "-"
Remove word "หุต-" containing "-"
Remove word "ปัด ๆ เป๋ ๆ" containing " "
Remove word "ศิร-" containing "-"
Remove word "ติ๋ง ๆ" containing " "
Remove word "สามัญ-" containing "-"
Remove word "แอบ ๆ" containing " "
Remove word "อเสกข-" containing "-"
Remove word "รู้หลบเป็นปีก รู้หลีกเป็นหาง" containing " "
Remove word "น

- Check which whitelist characters never occur in the corpus, so that the final character list only contains characters that are actually present

In [None]:
def check_diff_charset(corpus:Set[str],
                       whitelist:Union[List[str],str])->Set[str]:
  """
  Find the whitelist characters that never occur in the corpus.

  Args:
    corpus (Set[str]): words whose characters are collected.
    whitelist (Union[List[str], str]): allowed characters, given as a list of
      characters or as a string.

  Returns:
    Set[str]: characters present in the whitelist but absent from every word
    of the corpus.
  """
  # Every distinct character that appears in at least one word.
  corpus_charset = {char for word in corpus for char in word}
  return set(whitelist) - corpus_charset


In [None]:
# Whitelisted characters that never occur in the (filtered) corpus.
diff_charset = check_diff_charset(corpora,WHITELIST)

# Since our corpus does not contain the characters in diff_charset,
# remove them from a copy of the whitelist to get the corpus character list.
CHARLIST = WHITELIST.copy()
for diff_char in diff_charset:
  CHARLIST.remove(diff_char)

In [None]:
# Initiate some type hint
from typing import TypedDict
from collections import OrderedDict

def make_vocabulary(charlist:List[str],
                    add_special_token:bool=True)->Dict[str,int]:
  """
  Build an ordered token -> id mapping from a list of character tokens.

  Args:
  1. charlist
    A list of character tokens derived from the corpus
  2. add_special_token :
    when True, the special tokens "<pad>" and "<unk>" (which do not come from
    the input) take ids 0 and 1, and the characters are numbered after them

  Returns:
    An OrderedDict whose keys are tokens and whose values are consecutive
    integer ids, in insertion order.

  Example:
    make_vocabulary(["ก", "ข"]) ->
      {"<pad>" : 0, "<unk>" : 1, "ก" : 2, "ข" : 3}
  """
  special_tokens = ["<pad>","<unk>"] if add_special_token else []

  vocabulary = OrderedDict()
  # Special tokens come first so that their ids are fixed at 0 and 1.
  for token_id, token in enumerate([*special_tokens, *charlist]):
    vocabulary[token] = token_id

  return vocabulary

In [None]:
# Token -> id mapping over the corpus characters (special tokens are added by
# default).
vocabulary = make_vocabulary(CHARLIST)
# Inverse mapping: id -> token.
reverse_vocabulary = {v: k for k, v in vocabulary.items()}
print(f"{len(vocabulary)=}")
# vocab

len(vocabulary)=71


In [None]:
# Count the total number of characters over all words in the corpus.
from functools import reduce
# Fold over the words, adding each word's length to a running total that starts at 0.
total_char_sum = reduce(lambda x, y: x + len(y), corpora, 0)
print(f"Total characters in corpus : {total_char_sum}")

Total characters in corpus : 416715


## Make Dataset for each architecture
- Word2vec Skipgram
- Word2vec CBOW



### 1. Word2vec ▶



#### Make DataFrame

In [None]:
import pandas as pd

# Now we make a function which turn corpus into datas
def make_word2vec_dataset(corpus:Union[Set[str],List[str]],
                          window_size:int=2,
                          method="skipgram",
                          add_pad_token:bool=True)->pd.DataFrame:
  """
  Turn a collection of words into a DataFrame of character pairs.

  Every word is expanded into (target, context) character pairs by
  word_to_skipgram_row; the pairs of all words are stacked into one frame.

  Args:
    corpus: collection of words (NOT a single string - a string would be
      iterated character by character).
    window_size: number of neighbouring characters taken on each side.
    method: "skipgram" or "cbow". The rows are the same for both; "cbow" only
      puts the "context" column before the "target" column.
    add_pad_token: forwarded to word_to_skipgram_row.

  Returns:
    DataFrame with the columns ['target', 'context'] for "skipgram" and
    ['context', 'target'] for "cbow".

  Raises:
    NotImplementedError: if method is neither "skipgram" nor "cbow".
  """
  if method not in ("skipgram","cbow"):
    raise NotImplementedError(
        f"Make sure method is skipgram or cbow (got {method!r})")

  dataset = []
  for word in corpus:
    dataset.extend(word_to_skipgram_row(word,window_size,add_pad_token))

  # Convert the list of tuples to pandas
  dataset = pd.DataFrame(dataset, columns=['target','context'])

  if method=="cbow":
    # CBOW reads the pair the other way round: context first, then target.
    dataset = dataset.reindex(columns=["context","target"])

  return dataset

def word_to_skipgram_row(word:str,
                         window_size:int=2,
                         add_pad_token:bool=True)->List[tuple]:
  """
  Expand one word into (character, context character) pairs.

  For every character the context positions are the window_size positions
  before it followed by the window_size positions after it. A context
  position outside the word yields "<pad>" when add_pad_token is True and is
  skipped otherwise.

  Args:
    word: the word to expand.
    window_size: number of context positions on each side.
    add_pad_token [bool] : add padding token

  Returns:
    List of (char, context) tuples, ordered by character position, with the
    preceding contexts listed before the following ones.
  """
  pairs = []
  word_length = len(word)

  for center, char in enumerate(word):
    before = range(center - window_size, center)
    after = range(center + 1, center + window_size + 1)

    for position in (*before, *after):
      if 0 <= position < word_length:
        pairs.append((char, word[position]))
      elif add_pad_token:
        # Out-of-border context is represented by the padding token.
        pairs.append((char, "<pad>"))

  return pairs

def token_to_id_dataset(dataset:pd.DataFrame,
                        vocabulary:Dict[str,int])->pd.DataFrame:
  """
  Replace the tokens of the 'target' and 'context' columns by their ids.

  Args:
  - dataset (pd.DataFrame): frame with 'target' and 'context' columns that
    hold tokens.
  - vocabulary (Dict[str, int]): mapping from token (str) to integer id (int).

  Returns:
  - pd.DataFrame: the very same frame that was passed in; its 'target' and
    'context' columns are overwritten with the mapped ids.
  """
  for column in ("target", "context"):
    dataset[column] = dataset[column].map(vocabulary)
  return dataset

In [None]:
# Build the token-level training frames. Both calls use the same settings apart
# from `method`, which only decides the column order of the result.
cbow_df = make_word2vec_dataset(corpora,
                                window_size=2,
                                method="cbow",
                                add_pad_token=False)

skipgram_df = make_word2vec_dataset(corpora,
                                     window_size=2,
                                     method="skipgram",
                                     add_pad_token=False)
print("CBOW Dataset :")
display(cbow_df)

print("Skipgram Dataset :")
display(skipgram_df)

CBOW Dataset :


Unnamed: 0,context,target
0,ก,ไ
1,ร,ไ
2,ไ,ก
3,ร,ก
4,ว,ก
...,...,...
1334217,ก,ร
1334218,ร,ร
1334219,ม,ร
1334220,ร,ม


Skipgram Dataset :


Unnamed: 0,target,context
0,ไ,ก
1,ไ,ร
2,ก,ไ
3,ก,ร
4,ก,ว
...,...,...
1334217,ร,ก
1334218,ร,ร
1334219,ร,ม
1334220,ม,ร


In [None]:
# Convert them to integer
# NOTE: token_to_id_dataset overwrites the columns of the frame it receives, so
# after this cell both frames hold ids instead of characters. Re-running the
# cell would presumably map the ids a second time - rebuild the frames first.
cbow_df = token_to_id_dataset(cbow_df,vocabulary)
skipgram_df = token_to_id_dataset(skipgram_df,vocabulary)

In [None]:
# Display the id-encoded CBOW frame.
cbow_df

Unnamed: 0,context,target
0,2,62
1,36,62
2,62,2
3,36,2
4,40,2
...,...,...
1334217,2,36
1334218,36,36
1334219,34,36
1334220,36,34


#### Make Torch Dataset & Loader

In [None]:
# Define a custom dataset
from torch.utils.data import DataLoader, Dataset
class Word2VecDataset(Dataset):
    """
    Map-style dataset that pairs the i-th input with the i-th label.

    Args:
        data: inputs; a pandas Series or any sequence supporting len() and
            integer indexing.
        labels: labels aligned position by position with `data`.
        transform: optional callable applied to the input (not to the label)
            each time an item is fetched.
    """
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _item_at(values, position):
        """Return the element at 0-based `position` of a pandas object or a plain sequence."""
        # On pandas objects `values[position]` is a lookup by index label, which
        # fails (or returns the wrong row) unless the index is 0..n-1.
        # `.iloc` is always positional.
        if hasattr(values, "iloc"):
            return values.iloc[position]
        return values[position]

    def __getitem__(self, idx):
        sample = self._item_at(self.data, idx)
        label = self._item_at(self.labels, idx)

        if self.transform:
            sample = self.transform(sample)

        return sample, label

In [None]:
# Create Dataset for both Skipgram and CBOW
# Both frames hold the same (target, context) pairs - make_word2vec_dataset only
# swaps the column order for "cbow" - so only the roles of the columns differ:
#   skip-gram: input = target character,  label = context character
#   CBOW     : input = context character, label = target character
# Each dataset reads from the frame that carries its own name.
skipgram_dataset = Word2VecDataset(data=skipgram_df["target"],
                                   labels=skipgram_df["context"],
                                   transform=None)

cbow_dataset = Word2VecDataset(data=cbow_df["context"],
                              labels=cbow_df["target"],
                              transform=None)

In [None]:
# Mini-batch size for the two exploratory loaders.
# NOTE: `batch_size` and `cbow_loader` are re-assigned later by the CBOW
# training cell.
batch_size = 128
# shuffle=True: the samples are re-shuffled on every pass over the loader.
skipgram_loader = DataLoader(skipgram_dataset, batch_size, shuffle=True)
cbow_loader = DataLoader(cbow_dataset, batch_size, shuffle=True)

In [None]:
# To easily instantiate dataloader by setting, use later on hyperparameter fine tuning
def make_word2vec_dataloader(corpora,
                             vocabulary,
                             window_size=2,
                             method="cbow",
                             add_pad_token:bool=False,
                             batch_size:int=16
                             )->DataLoader:
  """
  Build a shuffled DataLoader of id-encoded character pairs in one call.

  Args:
    corpora: collection of words, forwarded to make_word2vec_dataset.
    vocabulary: token -> id mapping used to encode the characters.
    window_size: context window size, forwarded to make_word2vec_dataset.
    method: "cbow" feeds the context and predicts the target; any other
      accepted method feeds the target and predicts the context.
    add_pad_token: forwarded to make_word2vec_dataset.
    batch_size: number of pairs per batch.

  Returns:
    DataLoader over a Word2VecDataset, with shuffle=True.
  """
  # Pick which column is the model input and which one is the label.
  if method=="cbow":
    input_column, label_column = "context", "target"
  else:
    input_column, label_column = "target", "context"

  # Characters -> pairs -> ids
  token_df = make_word2vec_dataset(corpora,
                                   window_size,
                                   method,
                                   add_pad_token)
  id_df = token_to_id_dataset(token_df,vocabulary)

  # Wrap the two id columns into a Pytorch Dataset
  pair_dataset = Word2VecDataset(data=id_df[input_column],
                                 labels=id_df[label_column],
                                 transform=None)

  return DataLoader(pair_dataset, batch_size, shuffle=True)

#### Make Architecture

- Our dataset is split into individual (target, context) pairs
  - Instead of one instance (1 -> (0,3)),
    we emit two instances: (1,0) and (1,3)
  - So the architecture is the same for both methods, `skipgram` and `cbow`

  ![skipgram](https://www.researchgate.net/publication/322905432/figure/fig1/AS:614314310373461@1523475353979/The-architecture-of-Skip-gram-model-20.png)

  - We treat each pair of a target and one of its contexts as a separate instance
  - Hence, we simply call it the word2vec architecture

In [None]:
# Define a simple neural network model
import torch.nn as nn
import torch.nn.functional as F
import torch

class Word2VecModel(nn.Module):
    """
    Embedding lookup followed by one linear layer that scores every token.

    Args:
        vocab_size: number of tokens; size of the embedding table and of the
            output layer.
        emb_size: dimension of the embedding vectors.
    """
    def __init__(self,vocab_size,emb_size):
        super().__init__()
        # Typically, emb_size is lower than vocab_size
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.fc1 = nn.Linear(emb_size, vocab_size)

    def forward(self, x):
        """Map integer token ids to one score per vocabulary entry."""
        embedded = self.embedding(x)
        scores = self.fc1(embedded)
        return scores

- ⚠ Note: unlike a linear layer, the embedding layer takes integer token ids as input, not one-hot encodings
- This is why our dataset does not convert the token ids into one-hot vectors

In [None]:
def test_forward_word2vec():
  """
  Smoke-test the forward pass of Word2VecModel on two token ids.

  Returns:
    The model output; one row per input id and one column per vocabulary
    entry.

  Raises:
    AssertionError: if the output shape is not (number of ids, vocab_size).
  """
  vocab_size = 71
  emb_size = 15
  # The embedding layer consumes integer token ids, not one-hot vectors.
  # 0 is <pad>; 3 is presumably ข (depends on the vocabulary that was built).
  input_vector = torch.tensor([0,3])
  # Initiate model
  word2vec = Word2VecModel(vocab_size,emb_size)
  # Test Forward
  print(f"{input_vector.shape=}")
  out = word2vec(input_vector)
  assert out.shape == (len(input_vector), vocab_size), out.shape
  return out

out = test_forward_word2vec()
out.shape

input_vector.shape=torch.Size([2])


torch.Size([2, 71])

#### Make Trainer
- Used as the glue between the data and the architecture

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device=}")

device=device(type='cuda')


In [None]:
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import os

def train_step(model, inputs, labels, criterion, optimizer, device):
  """
  Run one optimisation step on a single batch.

  Args:
    model: module called on the inputs.
    inputs: batch of inputs; moved to `device`.
    labels: batch of labels; moved to `device`.
    criterion: loss called as criterion(outputs, labels).
    optimizer: optimizer that holds the model parameters.
    device: device the batch is moved to.

  Returns:
    The loss of this batch as a Python float.
  """
  # Clear the gradients left over from the previous step.
  optimizer.zero_grad()

  batch_inputs = inputs.to(device)
  batch_labels = labels.to(device)

  # Forward pass, then back-propagation and the parameter update.
  predictions = model(batch_inputs)
  batch_loss = criterion(predictions, batch_labels)
  batch_loss.backward()
  optimizer.step()

  return batch_loss.item()

def train_epoch(model, train_loader, criterion, optimizer, device):
  """
  Train the model for one pass over train_loader.

  A tqdm progress bar is shown; its postfix displays the loss of the most
  recent batch.

  Returns:
    The sum (not the mean) of the batch losses of this pass.
  """
  progress = tqdm(train_loader,desc="Training")
  running_loss = 0.0

  for batch_inputs, batch_labels in progress:
    batch_loss = train_step(model, batch_inputs, batch_labels,
                            criterion, optimizer, device)
    running_loss += batch_loss
    progress.set_postfix(loss=batch_loss)

  return running_loss

def train(model,
          train_loader,
          criterion,
          optimizer,
          model_path:str,
          num_epochs=5,
          device='cuda'):
  """
  Train the model for num_epochs epochs and checkpoint it.

  Arg:
    model - module to train; it is moved to `device`
    train_loader - loader that yields (inputs, labels) batches
    criterion - loss function
    optimizer - optimizer that holds the model parameters
    model_path - the destination directory (created if missing) that stores
      the best model as best.pth and the final model as final.pth
    num_epochs - number of passes over train_loader, at least 1
    device - device used for training

  Returns:
    (model, best_loss, final_loss), where the losses are mean batch losses:
    best_loss is the lowest one over all epochs, final_loss the last one.

  Raises:
    ValueError - if num_epochs is smaller than 1 or train_loader is empty
  """
  # Without this check `epoch` and `mean_loss` would be unbound after the loop.
  if num_epochs < 1:
    raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")
  # The mean loss divides by the number of batches.
  num_batches = len(train_loader)
  if num_batches == 0:
    raise ValueError("train_loader must contain at least one batch")

  os.makedirs(model_path, exist_ok=True)

  best_loss = np.inf
  model.to(device)
  for epoch in range(num_epochs):
    total_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    mean_loss = total_loss / num_batches
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {mean_loss}")
    # Save best model
    if best_loss > mean_loss:
      best_loss = mean_loss
      print(f"Best model saved at epoch {epoch + 1}")
      torch.save(model.state_dict(), os.path.join(model_path,'best.pth'))

  # Save Final model
  print(f"Final model saved at epoch {epoch + 1}")
  torch.save(model.state_dict(), os.path.join(model_path,'final.pth'))
  final_loss = mean_loss
  return model, best_loss, final_loss

#### Start training CBOW

In [None]:
# Mount Google Drive at /content/drive so that the trained models can be stored there.
# NOTE(review): google.colab is presumably only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Timestamp string (YYYY-MM-DD HH:MM:SS) of the current local time.
# NOTE(review): the CBOW training cell recomputes `now` the same way before
# using it, so this cell looks redundant - confirm before removing.
from datetime import datetime
now = datetime.now()
now = now.strftime("%Y-%m-%d %H:%M:%S")

In [None]:
import os
import json
from datetime import datetime

# Initiate meta
vocab_size = len(vocabulary)
# Every run gets its own directory named after the start time
# (the name contains a space and colons).
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
model_path = f'/content/drive/MyDrive/thaichar2vec/cbow/{now}'
os.makedirs(model_path, exist_ok=True) # Make directory if not exist

# Define Hyperparameter
method="cbow"
add_pad_token = False
batch_size = 512
emb_size = 32
num_epochs = 2
learning_rate = 1e-3
window_size = 2

# Create Dataloader
cbow_loader = make_word2vec_dataloader(corpora,
                                       vocabulary,
                                       window_size,
                                       method,
                                       add_pad_token,
                                       batch_size)

# Define model, loss and optimizer
model = Word2VecModel(vocab_size,emb_size).to(device)
# Cross-entropy over the whole vocabulary.
# (Original note: use a sigmoid-based criterion if negative sampling is added.)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Write model metadata
with open(os.path.join(model_path,"hyperparameter.json"),"w") as f:
  dictionary = dict(method=method,
                    add_pad_token=add_pad_token,
                    batch_size=batch_size,
                    emb_size=emb_size,
                    num_epochs=num_epochs,
                    learning_rate=learning_rate,
                    window_size=window_size)

  json_object = json.dumps(dictionary, indent=4)
  f.write(json_object)

# Start training CBOW
# Pass the detected device explicitly: train() defaults to 'cuda', which fails
# on a CPU-only runtime.
model, best_loss, final_loss = train(model,
                                    cbow_loader,
                                    criterion,
                                    optimizer,
                                    model_path,
                                    num_epochs,
                                    device)

# Submit Result
with open(os.path.join(model_path,"result.json"),"w") as f:
  dictionary = dict(best_loss=best_loss,
                    final_loss=final_loss)

  json_object = json.dumps(dictionary, indent=4)
  f.write(json_object)

Training: 100%|██████████| 2606/2606 [00:25<00:00, 103.14it/s, loss=3.49]


Epoch [1/2] Loss: 3.605174011938954
Best model saved at epoch 1


Training: 100%|██████████| 2606/2606 [00:26<00:00, 98.00it/s, loss=3.53] 


Epoch [2/2] Loss: 3.5243789613567125
Best model saved at epoch 2
Final model saved at epoch 2


In [None]:
import numpy as np
from scipy.spatial import distance

# Copy the learned embedding table to the CPU as a NumPy array
# (one row per vocabulary id, one column per embedding dimension).
emb_matrix = model.embedding.weight.data.cpu().numpy()
print(f"{emb_matrix.shape = }")

def check_similarity(char:str,
                     vocabulary:Dict[str,int],
                     top_k:int=5,
                     metric="euclidean",
                     emb_matrix=emb_matrix):
  """
  Find the tokens whose embeddings are closest to the embedding of `char`.

  Args:
    char: query token; must be a key of `vocabulary`.
    vocabulary: token -> id mapping; the ids are row numbers of `emb_matrix`.
    top_k: number of neighbours to return in addition to the closest row
      (normally the query itself, at distance 0).
    metric: distance metric name, passed to scipy.spatial.distance.cdist.
    emb_matrix: embedding matrix with one row per id. NOTE: the default is
      bound when this function is defined, so it does not follow a later
      re-assignment of the global `emb_matrix`.

  Returns:
    dict of token -> distance with top_k + 1 entries, ordered from the
    closest to the farthest.

  Raises:
    KeyError: if `char` is not in `vocabulary`.
  """
  # Initiate reverse vocabulary (id -> token)
  reverse_vocabulary = {v: k for k, v in vocabulary.items()}

  char_id = vocabulary[char]
  # Get the row vector of the particular char_id; the list index keeps it 2-D.
  char_vector = emb_matrix[[char_id],:]

  # Distance from every embedding row to the query row, using `metric`.
  distances = distance.cdist(emb_matrix,
                             char_vector,
                             metric=metric).flatten()

  # Row indices ordered from the smallest to the largest distance.
  sorted_indices = np.argsort(distances)

  # top_k + 1 rows, because the closest row is normally the query itself.
  top_k_indices = sorted_indices[:top_k+1]

  result = dict()
  for index in top_k_indices:
    result[reverse_vocabulary[index]] = distances[index]
  return result

# Inspect the 10 nearest neighbours of a few sample characters.
print(check_similarity("ิ",vocabulary,10))
print(check_similarity("ก",vocabulary,10))
print(check_similarity("เ",vocabulary,10))
print(check_similarity("ส",vocabulary,10))

emb_matrix.shape = (71, 32)
{'ิ': 0.0, 'โ': 5.696779550942949, 'ย': 6.572780448736194, 'ฆ': 6.585936260976982, 'แ': 6.62388078066824, 'ฐ': 7.011308207575245, 'บ': 7.033556413519026, '๋': 7.069697899191908, 'ั': 7.071214786024204, 'ธ': 7.082123593448585, 'ฟ': 7.102788998005578}
{'ก': 0.0, 'ฮ': 6.076605752142675, 'ถ': 6.3973260298222225, 'ใ': 6.66823269562889, 'ฉ': 6.733938555616427, '๊': 6.827010247384142, 'โ': 6.854686485547427, 'ส': 7.043783525847933, 'ฐ': 7.116185832777028, 'ฟ': 7.16147937231453, 'ผ': 7.173123767587195}
{'เ': 0.0, 'ึ': 7.265027150940945, 'ถ': 7.266010190469351, '๊': 7.425246426111248, 'ต': 7.505804697737943, 'พ': 7.5988173198640885, 'ฝ': 7.631039109073963, 'ฌ': 7.672995848528979, 'ย': 7.684784462953036, 'ม': 7.8900829639690455, 'แ': 7.9086600727078595}
{'ส': 0.0, 'ถ': 5.938120327488623, 'ฮ': 5.983036903509235, 'ฝ': 6.452440255163176, 'ช': 6.726245419054221, 'ฉ': 6.824619034895185, 'ฆ': 6.897707068248036, 'ก': 7.043783525847933, 'ล': 7.0542631933428135, 'ู': 7.1151999