#### Imports

In [2]:
import os
import re

import pandas as pd
from dotenv import load_dotenv
load_dotenv()
from datasets import load_dataset
from tqdm.auto import tqdm
from gensim.models.fasttext import load_facebook_model

#### Downloads

Comparison Vectors - via internet (`.bin` file)

In [3]:
# Read the downloaded comparison vectors from Facebook's native fastText `.bin` file.
commoncrawl = load_facebook_model("embeddings/cc.bn.300.bin")
# Re-save them as a gensim `.model` file; a later cell reloads this path with FastText.load.
commoncrawl.save("embeddings/cc.bn.300.model")

AI4Bharat Sangraha (Bengali subset, `verified/ben`)

In [3]:
def clean_bangla_text(text):
    """Tokenise ``text`` into Bangla words, digit runs and single-symbol tokens.

    Args:
        text: Any object; it is converted with ``str()`` before processing.

    Returns:
        list[str]: Whitespace-separated tokens. ASCII Latin letters are deleted
        (nothing is inserted in their place), and every remaining symbol that
        is not Bangla, a joiner, an ASCII digit or whitespace becomes its own
        token.
    """
    # Step 1: strip ASCII Latin letters entirely.
    without_latin = re.sub(r'[a-zA-Z]', '', str(text))

    # Step 2: surround with spaces every character that is NOT
    #   - in the Bangla Unicode block (U+0980-U+09FF),
    #   - a zero-width non-joiner / joiner (U+200C, U+200D),
    #   - an ASCII digit,
    #   - whitespace,
    # so that each such symbol is split off as a separate token.
    padded = re.sub(
        r'([^\u0980-\u09FF\u200C\u200D0-9\s])',
        r' \1 ',
        without_latin,
    )

    # Step 3: split on whitespace; the padded symbols fall out as their own tokens.
    return padded.split()


sample_text = "আধুনিক ইংরেজি বর্ণমালা এবং আইএসও মৌলিক লাতিন বর্ণমালায় টি অক্ষরের অবস্থান কোথায়?"
# English gloss: "Where is the letter T positioned in the modern English alphabet
# and the ISO basic Latin alphabet?"
print(clean_bangla_text(sample_text))

['আধুনিক', 'ইংরেজি', 'বর্ণমালা', 'এবং', 'আইএসও', 'মৌলিক', 'লাতিন', 'বর্ণমালায়', 'টি', 'অক্ষরের', 'অবস্থান', 'কোথায়', '?']


In [4]:
# Extraction settings.
SENTENCE_COUNT = 1_000_000  # how many dataset samples to take from the stream
CHUNK_SIZE = 100_000        # how many samples are buffered before a chunk is written

# Buffer state used by the extraction loop in the next cell.
chunk_index = 0
corpus_chunk = []

# Hugging Face login is currently disabled:
#login(token=os.environ.get('HF_TOKEN'))

# streaming=True hands back an iterable of samples instead of a fully materialised dataset.
dataset = load_dataset(
    "ai4bharat/sangraha",
    data_dir="verified/ben",
    split="train",
    streaming=True,
)

In [None]:
def _write_chunk(token_lists, index):
    """Persist one buffer of tokenised samples as ``data/sangraha/chunk_<index>.parquet``."""
    df_chunk = pd.DataFrame({'tokens': token_lists})
    df_chunk.to_parquet(f'data/sangraha/chunk_{index}.parquet', engine='pyarrow')


# Make sure the target directory exists before any time is spent streaming.
os.makedirs('data/sangraha', exist_ok=True)

# Reset the buffer state here so that re-running this cell starts from chunk 0
# instead of continuing from whatever a previous (possibly interrupted) run left behind.
corpus_chunk = []
chunk_index = 0

print('[Extracting sentences...]')
# zip() against range() stops after SENTENCE_COUNT samples without pulling an
# extra sample from the stream, and also stops cleanly if the stream is shorter.
for _, sample in tqdm(zip(range(SENTENCE_COUNT), dataset), total=SENTENCE_COUNT):
    corpus_chunk.append(clean_bangla_text(sample['text']))

    if len(corpus_chunk) >= CHUNK_SIZE:
        _write_chunk(corpus_chunk, chunk_index)
        corpus_chunk = []
        chunk_index += 1

# Flush the tail: without this, samples left over when SENTENCE_COUNT is not a
# multiple of CHUNK_SIZE (or when the stream ends early) would be silently dropped.
if corpus_chunk:
    _write_chunk(corpus_chunk, chunk_index)
    corpus_chunk = []
    chunk_index += 1

print('[Finished.]')

[Extracting sentences...]


100%|██████████| 1000000/1000000 [24:14<00:00, 687.57it/s] 


[Finished.]


Restart the notebook after this.

In [1]:
import pandas as pd
from gensim.models import Word2Vec, FastText
import logging
import sys
from importlib import reload

Load corpus

In [2]:
# Passing the directory reads the parquet chunk files stored under it into one frame.
df = pd.read_parquet('data/sangraha', engine='pyarrow')

# Turn every stored token sequence into a plain Python list, one list per document.
corpus = list(map(list, df['tokens']))
print(f"[Loaded {len(corpus)} documents.]")

[Loaded 1000000 documents.]


Configure logging

In [5]:
# Route INFO-level log records (e.g. gensim training progress) to stdout so they
# show up in the cell output.
# force=True makes basicConfig remove *and close* any handlers already attached
# to the root logger, so no manual removal loop is needed and re-running this
# cell never stacks duplicate handlers.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

logging.info("[Logs ready.]")

2026-02-23 04:10:32,968 : INFO : [Logs ready.]


Train Word Vector Models

In [4]:
# CBOW model whose hyperparameters mirror the comparison fastText model as closely as possible.
ai4b_subset_fair_model = FastText(
    sentences=corpus,
    sg=0,             # CBOW, for a fair comparison with fastText
    vector_size=300,  # same dimensionality (300) as the fastText vectors
    window=5,         # same context window as fastText
    negative=10,      # same negative-sampling setting as fastText
    min_n=5,          # character n-grams of exactly length 5,
    max_n=5,          # the closest match to fastText
    workers=8,
)

ai4b_subset_fair_model.save("embeddings/ai4b_subset_fair.model")

2026-02-23 02:13:17,031 : INFO : collecting all words and their counts
2026-02-23 02:13:17,031 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-02-23 02:13:17,794 : INFO : PROGRESS: at sentence #10000, processed 4615151 words, keeping 263507 word types
2026-02-23 02:13:18,556 : INFO : PROGRESS: at sentence #20000, processed 9415503 words, keeping 421830 word types
2026-02-23 02:13:19,263 : INFO : PROGRESS: at sentence #30000, processed 14020958 words, keeping 545987 word types
2026-02-23 02:13:20,077 : INFO : PROGRESS: at sentence #40000, processed 18530511 words, keeping 655022 word types
2026-02-23 02:13:20,878 : INFO : PROGRESS: at sentence #50000, processed 23178940 words, keeping 762970 word types
2026-02-23 02:13:21,619 : INFO : PROGRESS: at sentence #60000, processed 27879473 words, keeping 863341 word types
2026-02-23 02:13:22,456 : INFO : PROGRESS: at sentence #70000, processed 32527610 words, keeping 955692 word types
2026-02-23 02:13:23,312 : I

In [6]:
# Skip-gram variant: identical to the CBOW configuration except for `sg`.
ai4b_subset_sg_model = FastText(
    sentences=corpus,
    sg=1,             # skip-gram instead of CBOW
    vector_size=300,  # same dimensionality (300) as the fastText vectors
    window=5,         # same context window as fastText
    negative=10,      # same negative-sampling setting as fastText
    min_n=5,          # character n-grams of exactly length 5,
    max_n=5,          # the closest match to fastText
    workers=8,
)

ai4b_subset_sg_model.save("embeddings/ai4b_subset_sg.model")

2026-02-23 04:10:40,121 : INFO : collecting all words and their counts
2026-02-23 04:10:40,123 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-02-23 04:10:41,163 : INFO : PROGRESS: at sentence #10000, processed 4615151 words, keeping 263507 word types
2026-02-23 04:10:42,096 : INFO : PROGRESS: at sentence #20000, processed 9415503 words, keeping 421830 word types
2026-02-23 04:10:43,018 : INFO : PROGRESS: at sentence #30000, processed 14020958 words, keeping 545987 word types
2026-02-23 04:10:44,019 : INFO : PROGRESS: at sentence #40000, processed 18530511 words, keeping 655022 word types
2026-02-23 04:10:45,251 : INFO : PROGRESS: at sentence #50000, processed 23178940 words, keeping 762970 word types
2026-02-23 04:10:46,316 : INFO : PROGRESS: at sentence #60000, processed 27879473 words, keeping 863341 word types
2026-02-23 04:10:47,316 : INFO : PROGRESS: at sentence #70000, processed 32527610 words, keeping 955692 word types
2026-02-23 04:10:48,306 : I

Check top words in Vocab

In [1]:
from gensim.models import FastText

In [2]:
# Select ONE saved model to inspect; swap which line is uncommented to look at another.
chosen_embeddings = FastText.load("embeddings/cc.bn.300.model")
#chosen_embeddings = FastText.load("embeddings/ai4b_subset_sg.model")
#chosen_embeddings = FastText.load("embeddings/ai4b_subset_fair.model")

In [3]:
def optimal_count_given_threshold(model: "FastText", threshold=0.95):
    """Return how many of the most frequent words cover ``threshold`` of all occurrences.

    Args:
        model: Object exposing ``model.wv.index_to_key`` and
            ``model.wv.get_vecattr(word, 'count')`` (e.g. a gensim FastText model).
        threshold: Fraction of the summed word counts that must be covered.

    Returns:
        int | None: The smallest ``n`` such that the ``n`` highest counts sum to
        at least ``threshold`` of the total. ``None`` if the model carries no
        usable frequency data (a message is printed) or if the threshold can
        never be reached (e.g. ``threshold > 1``).
    """
    wv = model.wv
    # gensim normally keeps index_to_key in descending-frequency order, but that
    # is not guaranteed for every model, so sort explicitly: the cumulative walk
    # below is only minimal when the counts are descending.
    counts = sorted((wv.get_vecattr(w, 'count') for w in wv.index_to_key), reverse=True)
    total = sum(counts)

    # A total equal to the vocabulary size is treated as "no real frequency
    # information"; this also covers an empty vocabulary (0 == 0).
    if total == len(counts):
        print("No frequency data in model.")
        return None

    target = total * threshold
    running = 0
    for n, count in enumerate(counts, start=1):
        running += count
        if running >= target:
            return n

    # The target was never reached.
    return None

In [4]:
# Number of leading vocabulary entries whose counts add up to at least 95% of the total count.
optimal_count_given_threshold(chosen_embeddings, threshold=0.95)

142061

Thus, keeping roughly the 150k most frequent words is enough to cover about 95% of all word occurrences in the largest embedding vocabulary (142,061 words reach the 0.95 threshold).

In [5]:
del chosen_embeddings # drop this reference to the loaded model to free up RAM