#### Imports

In [3]:
import re
#import os
from dotenv import load_dotenv
load_dotenv()
import fasttext
from datasets import load_dataset
#from huggingface_hub import login
from tqdm.auto import tqdm
from gensim.models import Word2Vec, FastText
import pickle

#### Downloads

Comparison Vectors

In [2]:
# Pre-trained fastText vectors used as the comparison baseline. The binary is
# read from the working directory, so it must already be present there (the
# programmatic download below is left disabled; the file was fetched manually
# through the browser).
# NOTE(review): 'cc.hi.300.bin' looks like the Hindi model, whereas the corpus
# built later in this notebook is loaded from data_dir="verified/ben" and is
# filtered to the Bengali Unicode block — confirm the comparison language.
#fasttext.util.download_model('hi', if_exists='ignore')
comparison_model = fasttext.load_model('cc.hi.300.bin')

AI4Bharat Sangraha (Bengali subset)

In [4]:
def clean_bangla_text(text):
    """Reduce ``text`` to Bengali-script tokens.

    Every character that is neither whitespace nor inside the Bengali
    Unicode block (U+0980-U+09FF) is deleted, and what is left is split on
    runs of whitespace. Deleted characters are removed outright rather than
    replaced by a space, so two words separated only by punctuation end up
    joined into a single token.

    Args:
        text: Raw string to clean.

    Returns:
        list[str]: The whitespace-delimited tokens that survive the filter;
        empty when no Bengali characters remain.
    """
    non_bengali = r'[^\u0980-\u09FF\s]'
    bengali_only = re.sub(non_bengali, '', text)
    # Plain whitespace splitting: a fast, tokenizer-free approximation.
    return bengali_only.split()

In [5]:
# Hugging Face login is currently disabled.
#login(token=os.environ.get('HF_TOKEN'))

# Open the "verified/ben" train split of Sangraha in streaming mode.
dataset = load_dataset(
    "ai4bharat/sangraha",
    data_dir="verified/ben",
    split="train",
    streaming=True,
)

# Upper bound on how many dataset samples the extraction loop reads
# (samples that are later filtered out still count towards it).
SENTENCE_COUNT = 1_000_000

In [None]:
# Build the training corpus: tokenize the first SENTENCE_COUNT streamed
# samples and keep only those with more than three Bengali tokens.
corpus = []
print("[Extracting sentences...]")

# tqdm wraps the dataset itself (not enumerate) and is told the total, so the
# bar can show percentage and ETA instead of a bare iteration counter.
for i, sample in enumerate(tqdm(dataset, total=SENTENCE_COUNT, desc="samples")):
    if i >= SENTENCE_COUNT:
        break

    tokens = clean_bangla_text(sample['text'])
    if len(tokens) > 3:  # substantial sentences only
        corpus.append(tokens)

# Persist the token lists so later cells can reload them without re-streaming.
with open('corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)

print('[Finished.]')

[Extracting sentences...]


1000000it [38:49, 429.22it/s]


In [None]:
# Reload the token lists from corpus.pkl in the working directory.
# pickle.load should only ever be pointed at files this notebook wrote itself.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Quick sanity check: show the first document that has at most ten tokens.
short_doc = next((tokens for tokens in corpus if len(tokens) <= 10), None)
if short_doc is not None:
    print("Sample:", short_doc)

Train Word Vector Models

In [None]:
import logging
from importlib import reload  # no longer called here; kept so the name stays defined

# Route INFO-level log records to the notebook output with timestamps.
# `force=True` already removes any handlers previously attached to the root
# logger, so re-running this cell is safe. The former `reload(logging)` call
# is dropped on purpose: reloading the module builds a brand-new root logger,
# which leaves loggers that libraries created at import time attached to the
# old WARNING-level root and therefore silences their INFO messages.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

In [None]:
# FastText model whose hyperparameters are chosen to mirror the pre-trained
# fastText comparison vectors as closely as possible.
ai4b_subset_fair_model = FastText(
    sentences=corpus,
    vector_size=300,  # 300-dimensional vectors, same as the comparison model
    window=5,         # context window size, same as the comparison model
    sg=0,             # CBOW rather than skip-gram
    negative=10,      # number of negative samples
    min_n=5,          # character n-grams restricted to length 5 ...
    max_n=5,          # ... by pinning both bounds to the same value
    workers=8,        # training threads
)

ai4b_subset_fair_model.save("ai4b_subset_fair.model")

In [None]:
# Word2Vec counterpart: same word-level hyperparameters as the FastText run,
# but without any character n-gram (subword) information.
ai4b_subset_w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=300,  # 300-dimensional vectors, same as the comparison model
    window=5,         # context window size, same as the comparison model
    sg=0,             # CBOW rather than skip-gram
    negative=10,      # number of negative samples
    workers=8,        # training threads
)

ai4b_subset_w2v_model.save("ai4b_subset_w2v.model")

Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
def plot_words(model, words=('king', 'queen', 'man', 'woman')):
    """Scatter-plot the 2-D PCA projection of the vectors for ``words``.

    Args:
        model: Either a model that exposes its vectors through a ``wv``
            attribute (looked up as ``model.wv[word]``), or an object with a
            ``get_word_vector`` method (such as the native fastText model
            loaded in this notebook).
        words: Words to look up and plot. The default is an English set; the
            models trained in this notebook only ever see Bengali-script
            tokens, so pass Bengali words when plotting them.

    Raises:
        ValueError: If fewer than two of ``words`` can be resolved to a
            vector, since a 2-component PCA needs at least two samples.
    """
    # Pick the lookup that matches the model flavour.
    if hasattr(model, 'wv'):
        get_vec = model.wv.__getitem__
    else:
        get_vec = model.get_word_vector

    # Collect vectors, skipping words the model cannot resolve instead of
    # letting a single out-of-vocabulary word abort the whole plot.
    labels, vecs = [], []
    for word in words:
        try:
            vec = get_vec(word)
        except KeyError:
            print(f"[Skipping {word!r}: not in the model vocabulary]")
            continue
        labels.append(word)
        vecs.append(vec)

    if len(vecs) < 2:
        raise ValueError(
            "plot_words needs at least two words with vectors; "
            f"only found {labels} among {list(words)}"
        )

    coords = PCA(n_components=2).fit_transform(vecs)

    # Explicit figure/axes objects keep the plot independent of pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(coords[:, 0], coords[:, 1], c='royalblue')

    for (x, y), label in zip(coords, labels):
        ax.annotate(label, (x, y),
                    xytext=(5, 5), textcoords='offset points', fontsize=12)

    ax.set_title('Word Vectors (PCA)')
    ax.axis('equal')
    fig.tight_layout()
    plt.show()

In [None]:
# Baseline plot: project plot_words' default word set using the pre-trained
# fastText comparison vectors.
# NOTE(review): the default words are English — confirm they are the intended
# probe for this model.
plot_words(comparison_model)

In [None]:
# Probe the FastText model trained above with Bengali words ("king", "queen",
# "man", "woman"). Its corpus keeps only Bengali-script tokens, so the English
# defaults of plot_words would all be out-of-vocabulary and the plot would be
# built from untrained n-gram buckets only.
# NOTE(review): assumes these four words are frequent enough in the corpus to
# be in the vocabulary — adjust the list if not.
plot_words(ai4b_subset_fair_model, words=['রাজা', 'রানী', 'পুরুষ', 'নারী'])

In [None]:
# Probe the Word2Vec model trained above with Bengali words ("king", "queen",
# "man", "woman"). Its vocabulary is built from Bengali-script tokens only and
# it has no subword fallback, so the English defaults of plot_words would
# raise KeyError here.
# NOTE(review): assumes these four words are frequent enough in the corpus to
# be in the vocabulary — adjust the list if not.
plot_words(ai4b_subset_w2v_model, words=['রাজা', 'রানী', 'পুরুষ', 'নারী'])